### Importing the Prerequisities......

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import re
import time
from langchain_google_genai import ChatGoogleGenerativeAI
from itertools import combinations
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from scipy import stats
from langchain_experimental.utilities import PythonREPL
from scipy.stats import chi2_contingency,shapiro,spearmanr
from summarytools import dfSummary
import os
# os.environ['GOOGLE_API_KEY']=os.eviron

abstract_data = dict()

### Reading Data

In [2]:
data = pd.read_csv(r'data.csv')
df = data.copy()

In [3]:
df.head()

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100


In [372]:
abstract_data['Top few samples of data'] = df.head()

### Data Type Correction

In [29]:
patterns = {
    'date': [r'\d{4}-\d{2}-\d{2}', r'\d{2}/\d{2}/\d{4}', r'\d{2}-\d{2}-\d{4}'],
    'percentage': [r'\d+%'],
    'int':[r'\d+'],
    'price': [r'\$\d+', r'\d+\.\d{2}', r'\d+,\d+']
}

In [47]:
def correct_dtype(patterns):
    dtypes = df.dtypes.tolist()
    cols = df.columns.tolist()
    for i in range(len(cols)):
        df[cols] = df[cols].replace('',np.nan)
        if dtypes[i] == 'object':
            sam = df[cols[i]].unique()[0]
            cor,m='',0
            for dtype,pats in patterns.items():
                for pat in pats:
                    l=len(re.findall(pat,str(sam)))
                    # print(l)
                    if l!=None and l>m:
                        m=l
                        cor=dtype
            if cor == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]],errors='coerce')
            elif cor == 'percentage':
                df[cols[i]] = df[cols[i]].str.replace('%','').astype('float')
            elif cor == 'int':
                df[cols[i]] = pd.to_numeric(df[cols[i]],errors='coerce')
            elif cor == 'price':
                df[cols[i]] = df[cols[i]].str.replace('$','').replace(',','').astype('float')
                
                

In [50]:
correct_dtype(patterns)

### Treating Nan Values

In [78]:
def handle_null(impute=False):
    col_nulls = df.isnull().sum()
    nulls_50 = []
    impute_num = []
    impute_cat = []
    for col,null in col_nulls.items():
        if null==0:
            continue
        if null>df.shape[0]*0.5:
            nulls_50.append(col)
            continue
        elif impute:
            if 'int' in str(df[col].dtypes):
                impute_num.append(col)
            else:
                impute_cat.append(col)
    df.drop(nulls_50,axis=1,inplace=True)
    return nulls_50,impute_num,impute_cat 

In [81]:
def imputer():
    drop,im_num,im_cat=handle_null(True)
    
    rfr = RandomForestRegressor()
    rfc = RandomForestClassifier()
    
    X = df[df.select_dtypes(include=['int','float']).columns]
    imputer_num = IterativeImputer(estimator = rfr,max_iter=100)
    imputer_num.fit(X)
    df[df.select_dtypes(include=['int','float']).columns] = imputer_num.transform(X)
    
    for col in im_cat:
        imputer_cat = IterativeImputer(estimator = rfc)
        le = LabelEncoder()
        y = le.fit_transform(df[col])
        X = df[df.select_dtypes(include=['int','float']).columns]
        imputer_cat.fit(X,y=y)
        y = imputer_cat.transform(X)
        df[col] = le.inverse_transform(y.round().astype('int'))

In [82]:
handle_null()
imputer()

### Descriptive Analysis

In [375]:
columns = df.columns
abstract_data['Columns present in data'] = columns.tolist()

In [380]:
cov = []
col_des = []
for column in columns:
    print(f"About {column} \n{df[column].describe()}\nUnique values:{df[column].nunique()}")
    col_des.append(f"About {column} \n{df[column].describe()}\nUnique values:{df[column].nunique()}")
    try:
        print(f"Coefficient of Variation %.4f"%(df[column].std()/df[column].mean()))
        cov.append(f"Coefficient of Variation of {column} is : %.4f"%(df[column].std()/df[column].mean()))
    except:
        pass

About Transaction ID 
count    1000.000000
mean      500.500000
std       288.819436
min         1.000000
25%       250.750000
50%       500.500000
75%       750.250000
max      1000.000000
Name: Transaction ID, dtype: float64
Unique values:1000
Coefficient of Variation 0.5771
About Gender 
count       1000
unique         2
top       Female
freq         510
Name: Gender, dtype: object
Unique values:2
About Age 
count    1000.00000
mean       41.39200
std        13.68143
min        18.00000
25%        29.00000
50%        42.00000
75%        53.00000
max        64.00000
Name: Age, dtype: float64
Unique values:47
Coefficient of Variation 0.3305
About Product Category 
count         1000
unique           3
top       Clothing
freq           351
Name: Product Category, dtype: object
Unique values:3
About Quantity 
count    1000.000000
mean        2.514000
std         1.132734
min         1.000000
25%         1.000000
50%         3.000000
75%         4.000000
max         4.000000
Name: Quanti

In [382]:
abstract_data['description about each and every columns'] = col_des
abstract_data['coefficient of variations'] = cov

In [18]:
df_summary = df.copy()

In [19]:
for column in columns:
    if df[column].nunique()>15:
        df_summary.drop(column,axis=1,inplace=True)

In [21]:
summary = dfSummary(df_summary)

In [394]:
summary.to_excel('summary.xlsx')

In [395]:
sum_xlsx = pd.read_excel('summary.xlsx')

In [396]:
sum_xlsx

Unnamed: 0.1,Unnamed: 0,No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
0,0,1,<strong>Gender</strong><br>[object],1. Female<br>2. Male,510 (51.0%)<br>490 (49.0%),"<img src = ""data:image/png;base64, iVBORw0KGgo...",0<br>(0.0%)
1,1,2,<strong>Product Category</strong><br>[object],1. Clothing<br>2. Electronics<br>3. Beauty,351 (35.1%)<br>342 (34.2%)<br>307 (30.7%),"<img src = ""data:image/png;base64, iVBORw0KGgo...",0<br>(0.0%)
2,2,3,<strong>Quantity</strong><br>[int64],Mean (sd) : 2.5 (1.1)<br>min < med < max:<br>1...,4 distinct values,"<img src = ""data:image/png;base64, iVBORw0KGgo...",0<br>(0.0%)
3,3,4,<strong>Price per Unit</strong><br>[int64],Mean (sd) : 179.9 (189.7)<br>min < med < max:<...,5 distinct values,"<img src = ""data:image/png;base64, iVBORw0KGgo...",0<br>(0.0%)


In [397]:
abstract_data['Summary of the data'] = sum_xlsx
summary

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,Gender [object],1. Female 2. Male,510 (51.0%) 490 (49.0%),,0 (0.0%)
2,Product Category [object],1. Clothing 2. Electronics 3. Beauty,351 (35.1%) 342 (34.2%) 307 (30.7%),,0 (0.0%)
3,Quantity [int64],Mean (sd) : 2.5 (1.1) min < med < max: 1.0 < 3.0 < 4.0 IQR (CV) : 3.0 (2.2),4 distinct values,,0 (0.0%)
4,Price per Unit [int64],Mean (sd) : 179.9 (189.7) min < med < max: 25.0 < 50.0 < 500.0 IQR (CV) : 270.0 (0.9),5 distinct values,,0 (0.0%)


In [398]:
abstract_data

{'Top few samples of data':    Transaction ID  Gender   Age Product Category  Quantity  Price per Unit  \
 0             1.0    Male  34.0           Beauty       3.0            50.0   
 1             2.0  Female  26.0         Clothing       2.0           500.0   
 2             3.0    Male  50.0      Electronics       1.0            30.0   
 3             4.0    Male  37.0         Clothing       1.0           500.0   
 4             5.0    Male  30.0           Beauty       2.0            50.0   
 
    Total Amount  
 0         150.0  
 1        1000.0  
 2          30.0  
 3         500.0  
 4         100.0  ,
 'Columns present in data': ['Transaction ID',
  'Gender',
  'Age',
  'Product Category',
  'Quantity',
  'Price per Unit',
  'Total Amount'],
 'description about each and every columns': ['About Transaction ID \ncount    1000.000000\nmean      500.500000\nstd       288.819436\nmin         1.000000\n25%       250.750000\n50%       500.500000\n75%       750.250000\nmax      1000.0

### More Descriptive Analysis

#### Chi-Square Test [CAT]

In [84]:
num_cols = df.select_dtypes(include=['int','float']).columns.tolist()
cat_cols = df.select_dtypes(exclude=['int','float']).columns.tolist()

In [85]:
def chi_square_test(A,B):
    contingency_table = pd.crosstab(A,B)
    return  chi2_contingency(contingency_table)
    

In [404]:
ALPHA = 0.05
chi = []
for prev_col in range(len(cat_cols)):
    for cur_col in range(prev_col+1,len(cat_cols)):
        chi2_stat, p_val, dof, ex = chi_square_test(df[cat_cols[prev_col]],df[cat_cols[cur_col]])
        res = f""" feature 1:{cat_cols[prev_col]} and feature 2: {cat_cols[cur_col]}
        Chi-Square Statistic: {chi2_stat}
        p-value: {p_val}
        Degrees of freedom:{dof}
        Expected frequencies:{ex}"""
        if p_val <= ALPHA:
            print(f'Rejected H0 under significance level {ALPHA} {cat_cols[prev_col]} & {cat_cols[cur_col]} are dependent.')
            res+=f'\nRejected H0 under significance level {ALPHA} {cat_cols[prev_col]} & {cat_cols[cur_col]} are dependent.'
        else:
            print(f'Fail to reject H0 due to lack of evidence under significance level {ALPHA} {cat_cols[prev_col]} & {cat_cols[cur_col]} are independent.')
            res+=f'\nFail to reject H0 due to lack of evidence under significance level {ALPHA} {cat_cols[prev_col]} & {cat_cols[cur_col]} are independent.'
        chi.append(res)

Fail to reject H0 due to lack of evidence under significance level 0.05 Gender & Product Category are independent.


In [406]:
abstract_data['chi-square contigency test for features'] = chi

#### Shapiro-Wilkinson's Normality Test [NUM]

In [408]:
ALPHA = 0.05
shap = ['Shapiro wilkinson normality test']
for column in num_cols:
    _,p = shapiro(df[column])
    if p <= ALPHA:
        print(f'''\nRejected H0 under significance level {ALPHA}\n{column} doesn't seems to be normally distributed''')
        shap.append(f'''\nRejected H0 under significance level {ALPHA}\n{column} doesn't seems to be normally distributed''')
    else:
        print(f'''\nFail to reject H0 due to lack of evidence under significance level {ALPHA}\n{column} seem to be normally distributed''')
        shap.append(f'''\nFail to reject H0 due to lack of evidence under significance level {ALPHA}\n{column} seem to be normally distributed''')


Rejected H0 under significance level 0.05
Transaction ID doesn't seems to be normally distributed

Rejected H0 under significance level 0.05
Age doesn't seems to be normally distributed

Rejected H0 under significance level 0.05
Quantity doesn't seems to be normally distributed

Rejected H0 under significance level 0.05
Price per Unit doesn't seems to be normally distributed

Rejected H0 under significance level 0.05
Total Amount doesn't seems to be normally distributed


In [409]:
abstract_data['Shapiro wilkinson normality test'] = shap

#### Spearemann's correlation test

In [410]:
corr = []
for prev_col in range(len(num_cols)):
    for cur_col in range(prev_col+1,len(num_cols)):
        corr, p = spearmanr(df[num_cols[prev_col]], df[num_cols[cur_col]])
        R = ''
        print(f'\n* `{num_cols[prev_col]}` & `{num_cols[cur_col]}`\n')
        R+=f'\n* `{num_cols[prev_col]}` & `{num_cols[cur_col]}`\n'
        print(f'corr: {round(corr, 4)} \t p: {p}')
        R+=f'corr: {round(corr, 4)} \t p: {p}'

        if p <= ALPHA:
            print(f'Rejected H0 under significance level {ALPHA}, {num_cols[prev_col]} & {num_cols[cur_col]} are correlated')
            R+=f'Rejected H0 under significance level {ALPHA}, {num_cols[prev_col]} & {num_cols[cur_col]} are correlated'
        else:
            print(f'''Fail to reject H0 due to lack of evidence under significance level {ALPHA}, 
                {num_cols[prev_col]} & {num_cols[cur_col]} are not correlated''')
            R+=f'''Fail to reject H0 due to lack of evidence under significance level {ALPHA}, 
                {num_cols[prev_col]} & {num_cols[cur_col]} are not correlated'''


* `Transaction ID` & `Age`

corr: 0.0653 	 p: 0.03892114436932943
Rejected H0 under significance level 0.05, Transaction ID & Age are correlated

* `Transaction ID` & `Quantity`

corr: -0.0268 	 p: 0.3973114959672085
Fail to reject H0 due to lack of evidence under significance level 0.05, 
                Transaction ID & Quantity are not correlated

* `Transaction ID` & `Price per Unit`

corr: -0.0671 	 p: 0.03383808628700808
Rejected H0 under significance level 0.05, Transaction ID & Price per Unit are correlated

* `Transaction ID` & `Total Amount`

corr: -0.0693 	 p: 0.028354546112173036
Rejected H0 under significance level 0.05, Transaction ID & Total Amount are correlated

* `Age` & `Quantity`

corr: -0.0235 	 p: 0.45844430718442974
Fail to reject H0 due to lack of evidence under significance level 0.05, 
                Age & Quantity are not correlated

* `Age` & `Price per Unit`

corr: -0.038 	 p: 0.23032617307313
Fail to reject H0 due to lack of evidence under significance l

In [411]:
abstract_data['Correaltions information about features'] = corr

### Diagnostic Analysis

#### Aggregations

In [415]:
diff_cat_cols = []
aggregations = dict()
for col in cat_cols:
        aggregations["aggregations  & grouping by "+col] = df.groupby(col)[cat_cols].agg({'max','min',pd.Series.mode})
        aggregations["aggregations & grouping by  "+col] = df.groupby(col)[num_cols].agg({'max','count',pd.Series.mean})
        diff_cat_cols.append(col)

In [192]:

for i in range(len(diff_cat_cols)-1):
    for j in range(i+1,len(diff_cat_cols)):
        grp_data = df.groupby([diff_cat_cols[i],diff_cat_cols[j]]).size().unstack().fillna(0)
        aggregations['data_aggregations'+diff_cat_cols[i]+diff_cat_cols[j]] = grp_data


In [417]:
abstract_data['Grouping among features'] = aggregations

#### Anomolies

In [184]:
data_num = pd.get_dummies(df,dtype='float')
data_num.dropna(inplace=True)
mean_vector = np.mean(data_num, axis=0)  
cov_matrix = np.cov(data_num, rowvar=False) 

inv_cov_matrix = np.linalg.inv(cov_matrix+0.000001) 

def mahalanobis(x):
    centered_x = x - mean_vector  # (11,)
    mahalanobis_dist = np.dot(centered_x, np.dot(inv_cov_matrix, centered_x.T))  # (1,)
    return mahalanobis_dist

In [185]:
data_num['mahalanobis'] = data_num.apply(lambda row: mahalanobis(row), axis=1)

In [186]:
out_data = df.copy()
out_data.dropna(inplace=True)
out_data['outlier'] = [1 if i>(data_num['mahalanobis'].mean())*3  else 0 for i in data_num['mahalanobis']]

In [187]:
outliers = out_data[out_data['outlier']==1]

In [419]:
abstract_data['Outliers points present in data'] = outliers.to_dict()

### More Analysis

#### Segmentation

In [194]:
# inertia = []
# si_score = []
# wc_score = []
# for i in range(2,11):
#     km = KMeans(n_clusters=i)
#     km.fit(data[['Annual Income (k$)','Spending Score (1-100)']]) 
#     inertia.append(km.inertia_)
#     si_score.append(silhouette_score(data[['Annual Income (k$)','Spending Score (1-100)']],km.labels_))
#     wc_score.append(davies_bouldin_score(data[['Annual Income (k$)','Spending Score (1-100)']],km.labels_))

In [210]:
cluster_df = df.copy()

In [211]:
LE = dict()
for col in cat_cols:
    le = LabelEncoder()
    cluster_df[col] = le.fit_transform(cluster_df[col])
    LE[col]=le

In [228]:

def select_best_features_and_clusters(df, min_clusters=2, max_clusters=10, top_n=5):
    # Scale the data only once
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df)
    labels = None
    best_score = -1
    best_features = None
    best_n_clusters = None
    column_names = df.columns

    # Iterate over different numbers of clusters
    for n_clusters in range(min_clusters, max_clusters + 1):
        # Iterate over all combinations of features
        for n_features in range(1, top_n + 1):
            for feature_subset in combinations(range(df.shape[1]), n_features):
                if len(feature_subset)<=2:
                    continue
                # Select feature subset from scaled data
                subset_data = scaled_data[:, feature_subset]

                # Apply KMeans clustering
                kmeans = KMeans(n_clusters=n_clusters, random_state=42)
                cluster_labels = kmeans.fit_predict(subset_data)

                # Compute silhouette score
                score = silhouette_score(subset_data, cluster_labels)
                
                # Keep track of the best score, features, and number of clusters
                if score > best_score :
                    best_score = score
                    labels = cluster_labels
                    best_features = feature_subset
                    best_n_clusters = n_clusters

    # Get the selected feature names
    selected_feature_names = [column_names[i] for i in best_features]
    
    print(f"Best Number of Clusters: {best_n_clusters}")
    print(f"Best Feature Set: {selected_feature_names} with Silhouette Score: {best_score}")
    # print(f"Best labels: {labels}")
    # Return the DataFrame with selected features and the best number of clusters
    return df[selected_feature_names], best_n_clusters,labels

# Example usage
# best_features_silhouette, best_n_clusters,labels = select_best_features_and_clusters(cluster_df, min_clusters=2, max_clusters=5, top_n=6)


Best Number of Clusters: 5
Best Feature Set: ['Gender', 'Price per Unit', 'Total Amount'] with Silhouette Score: 0.7255583888772646
Best labels: [0 1 0 4 0 2 0 0 4 2 0 0 3 0 1 3 2 2 2 4 1 0 2 2 2 1 2 1 2 1 4 0 2 2 1 4 2
 0 0 0 0 4 2 2 2 1 1 4 1 2 0 2 0 1 0 1 2 4 0 0 0 0 0 0 3 2 1 4 2 2 2 1 0 1
 0 2 2 1 4 2 0 2 0 2 0 0 2 4 1 2 1 2 1 1 2 1 1 2 1 0 4 2 2 1 1 2 1 2 1 4 1
 3 2 2 3 2 3 1 2 0 2 0 2 3 0 2 2 4 1 1 1 0 4 0 0 4 3 0 3 0 2 4 2 1 2 0 4 0
 0 2 0 3 3 4 3 2 3 1 0 2 3 0 2 1 1 3 2 4 3 2 1 0 0 2 2 2 0 0 4 4 4 0 1 0 0
 0 2 0 0 2 0 0 3 0 0 1 2 1 3 0 0 1 3 0 2 0 2 2 2 0 3 3 3 0 3 0 2 0 2 4 4 0
 2 2 2 2 0 2 0 0 2 2 1 2 1 2 1 1 3 2 2 0 1 0 0 2 0 4 0 0 2 4 1 4 0 3 3 2 2
 0 0 2 0 4 4 2 2 2 3 4 2 2 2 1 3 2 0 2 4 1 1 2 1 0 2 0 0 0 0 1 4 4 0 2 1 1
 1 4 3 2 0 4 0 2 2 0 2 1 2 2 2 0 1 0 0 2 0 0 4 1 2 4 1 2 2 2 0 0 2 2 0 4 1
 4 2 2 4 0 2 1 0 1 0 2 0 3 0 1 2 0 2 3 4 2 1 3 2 2 0 0 1 0 0 1 4 0 2 1 3 0
 2 1 1 2 0 2 2 4 2 4 2 1 2 4 3 1 0 0 0 0 0 4 1 1 3 2 2 1 2 0 2 1 4 3 1 2 1
 1 1 2 0 1 2 0 0 3 4 1 2 1 1 2

In [229]:
# cluster_df['label']=labels

In [420]:
# abstract_data['clusters'] = cluster_df.groupby('label')[cat_cols].aggregate(pd.Series.mode).to_dict()

In [421]:
# abstract_data['interpretaions among cluster segments'] = cluster_df.groupby('label')[num_cols].aggregate('mean')

#### Forecasting

In [337]:
def code_executor(prompt):
    head = df.head(6)
    model =  ChatGoogleGenerativeAI(
        model="gemini-pro", 
        safety_settings=None
    )
    result = 'Error'
    while 'Error' in result:
        response = model.invoke(prompt)
        # print(response.text)    
        time.sleep(3)
        print(response)
        res = response.content
        right = res.rfind('```')
        left = res.find('```')
        code = res[left+10:right]
        repl = PythonREPL()
        result = repl.run(code)
        # print(code)
        print(result)
        prompt=f"by running your code {code} please correct the code please solve the error : {result} for   "+prompt
       
    return result

In [367]:
head = df.head()
prompt = f"""Generate a Python code for Time series forecasting for  the retail data[data.csv] by pmd autoarima forecast next month sales and save the results has columns = {columns} and some sample of the data {head} 
if the data is not suitable for time series analysis such that if it doesn't has data column do some other forecasting without date - if it is not suitable for forescasting just print ( False or 0)
save the plot in (time_series.png)
and forecast data in (forecast.csv) 
dont use exit() keyword instead use if else
if possible for forecasting return the forecasted data"""

In [331]:
try:
    result = code_executor("file location: data.csv"+prompt)
except:
    result = 'hi\n'
    pass

content="```python\nimport pandas as pd\nimport numpy as np\nfrom statsmodels.tsa.statespace. sarimax import SARIMAX\nfrom pmdarima import auto_arima\n\n# Read the data\ndf = pd.read_csv('Mall_Customers.csv')\n\n# Check if the data is suitable for time series analysis\nif 'Date' not in df.columns:\n    print('False')\n    exit()\n\n# Set the index to the Date column\ndf.set_index('Date', inplace=True)\n\n# Create a time series plot\ndf['Total Amount'].plot()\nplt.xlabel('Date')\nplt.ylabel('Total Amount')\nplt.title('Time Series Plot of Total Amount')\nplt.savefig('time_series.png')\n\n# Forecast next month sales using pmd autoarima\nmodel = auto_arima(df['Total Amount'], seasonal=True)\nforecast = model.predict(n_periods=30)\n\n# Save the forecast data\nforecast.to_csv('forecast.csv')\n```" additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'pr

In [326]:
result.strip()

'False'

In [422]:
if result.strip()==False:
    abstract_data['forecasting'] = 'not possible'
else:
    abstract_data['forecasting'] = result

#### Moree

In [425]:
def aprori():
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns

    df_encoded = pd.get_dummies(df[categorical_columns], drop_first=True)

    df_encoded.reset_index(drop=True, inplace=True)

    frequent_itemsets = apriori(df_encoded, min_support=0.05, use_colnames=True)

    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

    print(rules)
    return rules

    rules.to_csv('association_rules_output.csv', index=False)
    

In [426]:
abstract_data['association analysis by apriori algorithm'] = aprori()

                      antecedents                     consequents  \
0                   (Gender_Male)     (Product Category_Clothing)   
1     (Product Category_Clothing)                   (Gender_Male)   
2                   (Gender_Male)  (Product Category_Electronics)   
3  (Product Category_Electronics)                   (Gender_Male)   

   antecedent support  consequent support  support  confidence      lift  \
0               0.490               0.351    0.177    0.361224  1.029130   
1               0.351               0.490    0.177    0.504274  1.029130   
2               0.490               0.342    0.172    0.351020  1.026375   
3               0.342               0.490    0.172    0.502924  1.026375   

   leverage  conviction  zhangs_metric  
0   0.00501    1.016006       0.055500  
1   0.00501    1.028793       0.043613  
2   0.00442    1.013899       0.050388  
3   0.00442    1.026000       0.039054  


In [427]:
abstract_data

{'Top few samples of data':    Transaction ID  Gender   Age Product Category  Quantity  Price per Unit  \
 0             1.0    Male  34.0           Beauty       3.0            50.0   
 1             2.0  Female  26.0         Clothing       2.0           500.0   
 2             3.0    Male  50.0      Electronics       1.0            30.0   
 3             4.0    Male  37.0         Clothing       1.0           500.0   
 4             5.0    Male  30.0           Beauty       2.0            50.0   
 
    Total Amount  
 0         150.0  
 1        1000.0  
 2          30.0  
 3         500.0  
 4         100.0  ,
 'Columns present in data': ['Transaction ID',
  'Gender',
  'Age',
  'Product Category',
  'Quantity',
  'Price per Unit',
  'Total Amount'],
 'description about each and every columns': ['About Transaction ID \ncount    1000.000000\nmean      500.500000\nstd       288.819436\nmin         1.000000\n25%       250.750000\n50%       500.500000\n75%       750.250000\nmax      1000.0

### Saving 

In [428]:
def convert_dict_to_text(data, filename='data_results.txt'):
    with open(filename, 'w') as file:
        for key, value in data.items():
            file.write(f"{key}:\n")
            if isinstance(value, list):
                for item in value:
                    file.write(f"  - {item}\n")
            elif isinstance(value, pd.DataFrame):
                file.write(value.to_string(index=False))
            elif isinstance(value, str):
                for line in value.split('\n'):
                    file.write(f"  {line}\n")
            else:
                file.write(f"  {value}\n")


convert_dict_to_text(abstract_data)

In [8]:
import os
from pandasai import SmartDataframe
from pandasai.llm import GoogleGemini
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
from dotenv import load_dotenv
load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')

llm = ChatOpenAI(model = 'gpt-4o-mini')
# 
# llm = GoogleGemini(api_key='AIzaSyAhUeySoblg5TFG37IHXuz_6mruby5GwvQ',temperature=0.4)

file_path = r"data.csv"
df = pd.read_csv(file_path)
# sm = SmartDataframe(file_path,config={'llm':llm,"custom_whitelisted_dependencies": ["scikit-learn"]})

# response = sm.chat('give me the product distribution among sales')
agent_executor = create_pandas_dataframe_agent(
    llm,
    df,
    agent_type='tool-calling',
    verbose=True,
    allow_dangerous_code=True
)


In [5]:
from pprint import pprint

In [6]:
pprint(agent_executor.invoke('display previous purchases data')['output'])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df['Previous Purchases']"}`


[0m[36;1m[1;3m0       14
1        2
2       23
3       49
4       31
        ..
3895    32
3896    41
3897    24
3898    24
3899    33
Name: Previous Purchases, Length: 3900, dtype: int64[0m[32;1m[1;3mThe data for "Previous Purchases" in the dataframe is as follows:

```
0       14
1        2
2       23
3       49
4       31
...
3895    32
3896    41
3897    24
3898    24
3899    33
```

This represents the number of previous purchases for each customer in the dataframe. The length of the data series is 3900 rows.[0m

[1m> Finished chain.[0m
('The data for "Previous Purchases" in the dataframe is as follows:\n'
 '\n'
 '```\n'
 '0       14\n'
 '1        2\n'
 '2       23\n'
 '3       49\n'
 '4       31\n'
 '...\n'
 '3895    32\n'
 '3896    41\n'
 '3897    24\n'
 '3898    24\n'
 '3899    33\n'
 '```\n'
 '\n'
 'This represents the number of prev

In [3]:
agent_executor.invoke('dispaly the feature names')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'df.columns.tolist()'}`


[0m[36;1m[1;3m['Unnamed: 0', 'Customer ID', 'Age', 'Gender', 'Item Purchased', 'Category', 'Purchase Amount (USD)', 'Location', 'Size', 'Color', 'Season', 'Review Rating', 'Subscription Status', 'Payment Method', 'Shipping Type', 'Discount Applied', 'Promo Code Used', 'Previous Purchases', 'Preferred Payment Method', 'Frequency of Purchases'][0m[32;1m[1;3mThe feature names in the dataframe are:

1. Unnamed: 0
2. Customer ID
3. Age
4. Gender
5. Item Purchased
6. Category
7. Purchase Amount (USD)
8. Location
9. Size
10. Color
11. Season
12. Review Rating
13. Subscription Status
14. Payment Method
15. Shipping Type
16. Discount Applied
17. Promo Code Used
18. Previous Purchases
19. Preferred Payment Method
20. Frequency of Purchases[0m

[1m> Finished chain.[0m


{'input': 'dispaly the feature names',
 'output': 'The feature names in the dataframe are:\n\n1. Unnamed: 0\n2. Customer ID\n3. Age\n4. Gender\n5. Item Purchased\n6. Category\n7. Purchase Amount (USD)\n8. Location\n9. Size\n10. Color\n11. Season\n12. Review Rating\n13. Subscription Status\n14. Payment Method\n15. Shipping Type\n16. Discount Applied\n17. Promo Code Used\n18. Previous Purchases\n19. Preferred Payment Method\n20. Frequency of Purchases'}

In [10]:
pprint(agent_executor.invoke('tells about the distribution of products')['output'])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df['Category'].value_counts()"}`


[0m[36;1m[1;3mCategory
Clothing       1737
Accessories    1240
Footwear        599
Outerwear       324
Name: count, dtype: int64[0m[32;1m[1;3mThe distribution of products in the dataframe is as follows:

- **Clothing**: 1737 products
- **Accessories**: 1240 products
- **Footwear**: 599 products
- **Outerwear**: 324 products

Clothing is the most common product category, followed by Accessories, Footwear, and Outerwear.[0m

[1m> Finished chain.[0m
('The distribution of products in the dataframe is as follows:\n'
 '\n'
 '- **Clothing**: 1737 products\n'
 '- **Accessories**: 1240 products\n'
 '- **Footwear**: 599 products\n'
 '- **Outerwear**: 324 products\n'
 '\n'
 'Clothing is the most common product category, followed by Accessories, '
 'Footwear, and Outerwear.')


In [11]:
agent_executor.invoke('which product did male user bought mostly')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "male_product_counts = df[df['Gender'] == 'Male']['Item Purchased'].value_counts()\nmale_most_purchased_product = male_product_counts.idxmax()\nmale_most_purchased_product"}`


[0m[36;1m[1;3mPants[0m[32;1m[1;3mThe product that male users bought the most is "Pants."[0m

[1m> Finished chain.[0m


{'input': 'which product did male user bought mostly',
 'output': 'The product that male users bought the most is "Pants."'}

In [12]:
agent_executor.invoke('which customer contributes more profit to the business')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df[['Customer ID', 'Purchase Amount (USD)']].groupby('Customer ID').sum().sort_values('Purchase Amount (USD)', ascending=False).head(1)"}`


[0m[36;1m[1;3m             Purchase Amount (USD)
Customer ID                       
2843                           100[0m[32;1m[1;3mThe customer who contributes the most profit to the business is Customer ID 2843, with a total purchase amount of 100 USD.[0m

[1m> Finished chain.[0m


{'input': 'which customer contributes more profit to the business',
 'output': 'The customer who contributes the most profit to the business is Customer ID 2843, with a total purchase amount of 100 USD.'}

In [13]:
from lightning_rag import rag

In [14]:
chain = rag()

In [19]:
from langchain_core.messages import AIMessage, HumanMessage

In [39]:
chat_history = []
question = 'analyse the data'
response = chain.invoke({'input':question, "chat_history": chat_history})
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=response["answer"]),
    ]
)
print(response['answer'])

Based on the provided context, it appears that the data includes a summary of the 'gender' category. 

End to end analysis: Unfortunately, the data provided is incomplete, so I'll have to perform an end-to-end analysis to provide more insights.

To proceed, I'll make some assumptions and perform a hypothetical analysis. 

If we were to assume a typical dataset with gender information, we might expect to see a summary of the counts for each gender category. 

For example, if the dataset had around 1000 entries, we might see a summary like this:

- Male: 550 (55%)
- Female: 450 (45%)

However, without more information, it's impossible to determine the actual distribution of the data. 

If you provide more data, I can perform a more accurate analysis.


In [23]:
from pandasai import SmartDataframe

In [24]:
sm = SmartDataframe(file_path,config={'llm':llm,"custom_whitelisted_dependencies": ["scikit-learn"]})

In [27]:
sm.chat("compare and contrast the sales of pant between women and men")

'c:/Users/navab/Downloads/sem_V_test/exports/charts/temp_chart.png'

In [28]:
agent_executor.invoke("compare and contrast the sales of pant between women and men")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "pant_sales = df[df['Item Purchased'] == 'Pants']\n\n# Group by Gender and calculate total sales and average purchase amount\ncomparison = pant_sales.groupby('Gender')['Purchase Amount (USD)'].agg(['sum', 'mean'])\ncomparison"}`


[0m[36;1m[1;3m         sum       mean
Gender                 
Female  2936  61.166667
Male    7154  58.162602[0m[32;1m[1;3mHere's the comparison of pant sales between men and women based on the provided data:

- **Total Sales**:
  - **Female**: $2,936
  - **Male**: $7,154

- **Average Purchase Amount**:
  - **Female**: $61.17
  - **Male**: $58.16

### Summary:
- Total sales of pants are significantly higher for men compared to women.
- However, the average purchase amount for females is slightly higher than that for males. 

This indicates that while men purchase more pants overall, women tend to spend a bit more per purchase on average.[0m

[1m> 

{'input': 'compare and contrast the sales of pant between women and men',
 'output': "Here's the comparison of pant sales between men and women based on the provided data:\n\n- **Total Sales**:\n  - **Female**: $2,936\n  - **Male**: $7,154\n\n- **Average Purchase Amount**:\n  - **Female**: $61.17\n  - **Male**: $58.16\n\n### Summary:\n- Total sales of pants are significantly higher for men compared to women.\n- However, the average purchase amount for females is slightly higher than that for males. \n\nThis indicates that while men purchase more pants overall, women tend to spend a bit more per purchase on average."}

In [29]:
agent_executor.invoke('which product have to be removed from sales')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "import pandas as pd\n\n# Assuming df is already defined and is our dataframe\n\n# Group by 'Item Purchased' and calculate average review rating and total purchase amounts\nproduct_analysis = df.groupby('Item Purchased').agg({\n    'Review Rating': 'mean',\n    'Purchase Amount (USD)': 'sum',\n    'Frequency of Purchases': 'count'\n}).reset_index()\n\n# Rename columns for clarity\nproduct_analysis.columns = ['Item Purchased', 'Average Review Rating', 'Total Purchase Amount (USD)', 'Total Purchases']\n\n# Filter products with low average ratings and low total purchase amounts\n# Setting thresholds: Average rating below 3.0 and total purchases below a certain amount\nthreshold_rating = 3.0\nthreshold_amount = 100  # You can adjust this threshold as needed\n\nlow_performing_products = product_analysis[(product_analysis['Average Review Rating'] < threshold_rating) | \n                 

{'input': 'which product have to be removed from sales',
 'output': "It appears that there are no products with an average review rating below 3.0 or a total purchase amount below the specified threshold of 100 USD, based on the current data in the dataframe.\n\nSince we found no low-performing products, it may indicate that all items are performing reasonably well. However, if you'd like to explore other criteria for product removal, such as specific items with low ratings or infrequent purchases, please let me know!"}

In [30]:
agent_executor.invoke('give me some inventry suggestions about the products in data')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo provide inventory suggestions based on the data in the dataframe, we can analyze the following factors:

1. **Sales Performance**: Identify best-selling products by analyzing the `Purchase Amount (USD)` and the frequency of purchases.

2. **Customer Preferences**: Look at the `Size`, `Color`, and `Category` to understand what type of products are most preferred by customers.

3. **Seasonality**: Analyze the `Season` to determine which items should be stocked in anticipation of seasonal demand.

4. **Discount Potential**: Explore the `Discount Applied` and `Promo Code Used` columns to understand which products may benefit from targeted promotions to boost sales.

Here are some specific inventory suggestions:

### Inventory Suggestions

1. **Increase Stock of Best-Sellers**: 
   - Identify the items with the highest `Purchase Amount (USD)` and ensure stock levels are sufficient to meet demand.

2. **Optimize Size and Color O

{'input': 'give me some inventry suggestions about the products in data',
 'output': "To provide inventory suggestions based on the data in the dataframe, we can analyze the following factors:\n\n1. **Sales Performance**: Identify best-selling products by analyzing the `Purchase Amount (USD)` and the frequency of purchases.\n\n2. **Customer Preferences**: Look at the `Size`, `Color`, and `Category` to understand what type of products are most preferred by customers.\n\n3. **Seasonality**: Analyze the `Season` to determine which items should be stocked in anticipation of seasonal demand.\n\n4. **Discount Potential**: Explore the `Discount Applied` and `Promo Code Used` columns to understand which products may benefit from targeted promotions to boost sales.\n\nHere are some specific inventory suggestions:\n\n### Inventory Suggestions\n\n1. **Increase Stock of Best-Sellers**: \n   - Identify the items with the highest `Purchase Amount (USD)` and ensure stock levels are sufficient to meet

In [35]:
agent_executor.invoke('give me top 5 eda questions to be analysed in data')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mHere are five exploratory data analysis (EDA) questions you can consider analyzing from the given dataframe:

1. **Customer Demographics Analysis:**
   - What is the distribution of customers by age and gender? Are there significant differences in purchase behavior between different age groups and genders?

2. **Purchase Behavior Analysis:**
   - What are the most commonly purchased items and their corresponding categories? How does the purchase amount vary across different categories (Clothing vs. Footwear)?

3. **Location-Based Insights:**
   - Which locations have the highest average purchase amounts? Are there any trends or patterns in spending that can be identified based on geographic location?

4. **Seasonal Trends:**
   - How does the purchase amount vary across different seasons? Is there a certain season where purchases tend to be higher, and does this differ for different product categories?

5. **Payment and Shipp

{'input': 'give me top 5 eda questions to be analysed in data',
 'output': 'Here are five exploratory data analysis (EDA) questions you can consider analyzing from the given dataframe:\n\n1. **Customer Demographics Analysis:**\n   - What is the distribution of customers by age and gender? Are there significant differences in purchase behavior between different age groups and genders?\n\n2. **Purchase Behavior Analysis:**\n   - What are the most commonly purchased items and their corresponding categories? How does the purchase amount vary across different categories (Clothing vs. Footwear)?\n\n3. **Location-Based Insights:**\n   - Which locations have the highest average purchase amounts? Are there any trends or patterns in spending that can be identified based on geographic location?\n\n4. **Seasonal Trends:**\n   - How does the purchase amount vary across different seasons? Is there a certain season where purchases tend to be higher, and does this differ for different product categori

In [38]:
sm.chat('What are the most commonly purchased items and their corresponding categories? How does the purchase amount vary across different categories (Clothing vs. Footwear)?')

Unnamed: 0,Most Common Items,Frequency
0,Blouse,171
1,Jewelry,171
2,Pants,171
3,Shirt,169
4,Dress,166
5,Sweater,164
6,Jacket,163
7,Belt,161
8,Sunglasses,161
9,Coat,161


In [42]:
from lightning_rag import rag

In [43]:
ch = rag()

In [3]:
import os

os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_community.document_loaders import JSONLoader,CSVLoader
from langchain.chat_models import init_chat_model
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS

# def rag():



In [5]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004",google_api_key='AIzaSyAhUeySoblg5TFG37IHXuz_6mruby5GwvQ')

In [6]:
llm = init_chat_model("llama3-8b-8192", model_provider="groq")

# embeddings = HuggingFaceEmbeddings(
#     model_name="mixedbread-ai/mxbai-embed-large-v1",
#     )   

loader = CSVLoader(r"data_results.txt")
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)

# vector_store = InMemoryVectorStore(embeddings)
# _ = vector_store.add_documents(documents)
vector_store = FAISS.from_documents(documents,embeddings)
retreiver = vector_store.as_retriever()


In [14]:

system = (""" 
You are a data scientist tasked with analyzing the provided dataset. Your goal is to extract meaningful insights, identify patterns, and generate a comprehensive analytical report. Follow the structure below to ensure the analysis is thorough and actionable:

1. **Data Understanding**  
   - Describe the dataset (e.g., size, features, data types, missing values, etc.).  
   - Highlight any initial observations or challenges.  

2. **What Was in the Data?**  
   - Summarize the key attributes, trends, and distributions in the dataset.  
   - Identify any notable patterns, outliers, or anomalies.  

3. **How and Why It Happened**  
   - Explain the underlying reasons for the observed trends or patterns.  
   - Provide context or hypotheses for why certain phenomena occurred.  

4. **Beautiful Insights**  
   - Share unique, non-obvious, or visually appealing insights derived from the data.  
   - Use visualizations or examples to support your findings.  

5. **Takeaways**  
   - Summarize the most important findings from the analysis.  
   - Highlight actionable insights or conclusions.  

6. **Suggestions**  
   - Provide recommendations based on the analysis (e.g., improvements, next steps, etc.).  

7. **Future Perspectives**  
   - Discuss potential future trends or areas for further investigation.  
   - Suggest how the data can be leveraged for long-term value.  

Ensure the report is concise, well-structured, and focused on delivering value. Use clear language and avoid unnecessary jargon. """)
human = """
Analyze the provided dataset and generate a precise analytical report based on the following:  

- **Question**: {question}  
- **Context**: {context}  

Provide a detailed response following the structure outlined in the system prompt. Focus on extracting actionable insights and making the data valuable."""
prompt = ChatPromptTemplate([
    ("system", system),
    ("human", human),
])

chain = prompt |  llm


In [15]:
# print(chain.invoke({'question':'hi'}).content)
def run(question):
    contents = retreiver.invoke(question)
    return chain.invoke({'question':question,'context':contents})

In [17]:
pprint(run('analyse the data').content)

('**Data Understanding**\n'
 '\n'
 'The provided dataset, "Customer Shopping Data," contains information on '
 'customer shopping trends. The dataset is composed of numerical and '
 'categorical features, with various data types, including integers, floats, '
 'and strings. After reviewing the provided summary, I noticed that the '
 'dataset has 410 rows, but the row number in the metadata hints at a larger '
 'dataset. Additionally, there is no information on missing values, which '
 'might be an important aspect to consider in the analysis.\n'
 '\n'
 '**What Was in the Data?**\n'
 '\n'
 'The provided summary does not explicitly mention the features in the '
 'dataset. However, based on the context, I assume that the dataset includes '
 'variables such as customer demographics, shopping behavior, and transaction '
 'history. Further analysis is required to understand the actual features and '
 'their distributions.\n'
 '\n'
 'The summary mentions that statistical tests have been appli

In [16]:
from pprint import pprint

In [26]:
from lightning_rag import rag

In [27]:
rag_chain = rag()

In [28]:
prompt = """Do extract and intrepret the abstracted data and provide a super analytical report of the data which can help the client/user to take some immediate decision from your report"""
result = rag_chain.invoke({"input":prompt})

In [29]:
result

{'input': 'Do extract and intrepret the abstracted data and provide a super analytical report of the data which can help the client/user to take some immediate decision from your report',
 'context': [Document(metadata={'source': 'data_results.txt', 'row': 0}, page_content="meta:: The abstracted portion of the data analysis is included here to be further analyzed by LLMs to generate reports. Quick analysis of this data is crucial for faster decision-making on the client's side. This summary includes descriptions of data\nNone: feature types,feature correlations,statistical tests on univariate features,facts and cubes,and visualizations."),
  Document(metadata={'source': 'data_results.txt', 'row': 36}, page_content='meta:: Aggregated data summaries\nNone: including key metrics and dimensions used for analyzing patterns and trends. Facts and cubes provide a multi-dimensional view of the data,facilitating in-depth analysis and insights.'),
  Document(metadata={'source': 'data_results.txt'

In [9]:
from lightning_rag import rag


In [10]:
chain = rag()

In [11]:
prompt = """Do extract and intrepret the abstracted data and provide a super analytical report of the data which can help the client/user to take some immediate decision from your report"""
result = chain.invoke({"input":prompt})

In [6]:
chain.invoke({"input":"which team did score most"})

{'input': 'which team did score most',
 'context': [Document(metadata={'source': 'data_results.txt', 'row': 61}, page_content='summary:: Pakistan    Netherlands   Hyderabad    Netherlands   Ball             294                10                10       Pakistan                    81R        1'),
  Document(metadata={'source': 'data_results.txt', 'row': 66}, page_content='summary:: Netherlands   South Africa Dharamshala   South Africa   Ball             258                 8                10    Netherlands                    38R        1'),
  Document(metadata={'source': 'data_results.txt', 'row': 67}, page_content='summary:: New Zealand    Afghanistan     Chennai    Afghanistan   Ball             300                 6                10    New Zealand                   149R        1')],
 'answer': 'Based on the data, New Zealand scored the most with 300 runs.'}

In [12]:
result

{'input': 'Do extract and intrepret the abstracted data and provide a super analytical report of the data which can help the client/user to take some immediate decision from your report',
 'context': [Document(metadata={'source': 'data_results.txt', 'row': 34}, page_content='summary:: Aggregated data summaries\nNone: including key metrics and dimensions used for analyzing patterns and trends. Facts and cubes provide a multi-dimensional view of the data,facilitating in-depth analysis and insights.'),
  Document(metadata={'source': 'data_results.txt', 'row': 78}, page_content='summary:: it contains the data pints which are outliers please intrepret the those points in useful analysis'),
  Document(metadata={'source': 'data_results.txt', 'row': 41}, page_content='summary:: Stacked bar charts visualizing the distribution of categorical data across different segments. These visualizations help in understanding the composition and relative proportions of different categories.')],
 'answer': 

In [8]:
chain.invoke({'input':'what is the data is all about'})

{'input': 'what is the data is all about',
 'context': [Document(metadata={'source': 'data_results.txt', 'row': 3}, page_content='summary:: A summary of the dataset\nNone: including its source,context of data collection,the period covered,and the main variables included. This overview provides the structure and content of the data.'),
  Document(metadata={'source': 'data_results.txt', 'row': 34}, page_content='summary:: Aggregated data summaries\nNone: including key metrics and dimensions used for analyzing patterns and trends. Facts and cubes provide a multi-dimensional view of the data,facilitating in-depth analysis and insights.'),
  Document(metadata={'source': 'data_results.txt', 'row': 30}, page_content='summary:: Data Facts and Cubes:')],
 'answer': 'Based on the provided context, it appears that the dataset is likely related to some form of business or market analysis. The mention of "facts and cubes" and "multi-dimensional view" suggests that the data is being analyzed from va

In [1]:
from analytic_summary import summary

In [3]:
sum = summary()

In [4]:
sum.report()

<coroutine object summary.report at 0x0000020B2E0DEB00>

In [10]:
from analytic_summary import SummaryGenerator

In [11]:
import asyncio

In [13]:
summary_generator = SummaryGenerator()
summary_generator.load_data()
summary_generator.chains()
summary_generator.build_graph()
summary_generator.generate()

<coroutine object SummaryGenerator.generate at 0x000001B1533CA8C0>