In [None]:
import pandas as pd
import sys
import numpy as np
sys.path.append('../src')
from functions import *
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *

# Loading the Data:

In [None]:
# Display option: never truncate columns when a frame is rendered.
pd.set_option('display.max_columns', None)

# Read the four raw comma-separated text files.
df_clients_profile = pd.read_csv('../Data/Raw/df_final_demo.txt')
df_experiment_clients = pd.read_csv('../Data/Raw/df_final_experiment_clients.txt')
df_web_data_1 = pd.read_csv('../Data/Raw/df_final_web_data_pt_1.txt')
df_web_data_2 = pd.read_csv('../Data/Raw/df_final_web_data_pt_2.txt')

# The web log ships in two parts: stack them row-wise and renumber the index from 0.
df_web_data = pd.concat([df_web_data_1, df_web_data_2], ignore_index=True)

# Exploratory Data Analysis (EDA):

## Initial Exploration

In [None]:
df_clients_profile.head(10)

In [None]:
df_clients_profile.shape

In [None]:
df_experiment_clients.head(10)

In [None]:
df_experiment_clients.shape

In [None]:
df_web_data.head(10)

In [None]:
df_web_data.shape

### Exploring numerical and categorical variables

In [None]:
# Retrieving the unique data types present in the dataframe columns
df_clients_profile.info()

In [None]:
df_experiment_clients.info()

In [None]:
df_web_data.info()

In [None]:
# Extracting column names with numerical data types from the dataframe
numerical_columns_profile = df_clients_profile.select_dtypes(include=[np.number]).columns
print(numerical_columns_profile)

In [None]:
# Extracting column names with numerical data types from the dataframe
numerical_columns_web = df_web_data.select_dtypes(include=[np.number]).columns
print(numerical_columns_web)

In [None]:
# Names of the numeric columns of the experiment-clients frame.
# This cell must inspect df_experiment_clients: the web-data frame is already
# covered by numerical_columns_web in the preceding cell.
numerical_columns_exp_cli = df_experiment_clients.select_dtypes(include=[np.number]).columns
print(numerical_columns_exp_cli)

In [None]:
# Low-cardinality numeric columns are candidates for categorical treatment:
# keep only the numeric columns of the client profile with fewer than 5 distinct values.
# The cutoff (5) is a judgment call; tune it to the dataset and domain.
_numeric_profile = df_clients_profile.select_dtypes("number")
_is_low_cardinality_profile = _numeric_profile.nunique() < 5
potential_categorical_from_numerical_profile = _numeric_profile.loc[:, _is_low_cardinality_profile]
potential_categorical_from_numerical_profile

In [None]:
# Low-cardinality numeric columns are candidates for categorical treatment:
# keep only the numeric columns of the web data with fewer than 5 distinct values.
# The cutoff (5) is a judgment call; tune it to the dataset and domain.
_numeric_web = df_web_data.select_dtypes("number")
_is_low_cardinality_web = _numeric_web.nunique() < 5
potential_categorical_from_numerical_web = _numeric_web.loc[:, _is_low_cardinality_web]
potential_categorical_from_numerical_web

In [None]:
# Low-cardinality numeric columns are candidates for categorical treatment:
# keep only the numeric columns of the experiment roster with fewer than 5 distinct values.
# The cutoff (5) is a judgment call; tune it to the dataset and domain.
_numeric_exp_cli = df_experiment_clients.select_dtypes("number")
_is_low_cardinality_exp_cli = _numeric_exp_cli.nunique() < 5
potential_categorical_from_numerical_exp_cli = _numeric_exp_cli.loc[:, _is_low_cardinality_exp_cli]
potential_categorical_from_numerical_exp_cli

In [None]:
# Retrieving column names with object (typically string) data types from the dataframe
df_clients_profile.select_dtypes("object").columns

In [None]:
# Retrieving column names with object (typically string) data types from the dataframe
df_web_data.select_dtypes("object").columns

In [None]:
# Retrieving column names with object (typically string) data types from the dataframe
df_experiment_clients.select_dtypes("object").columns

In [None]:
df_clients_profile.columns

In [None]:
# Collapse the 'U' and 'X' gender codes into a single 'Other' category.
df_clients_profile['gendr'] = df_clients_profile['gendr'].replace({'U': 'Other', 'X': 'Other'})

## Drop duplicate values

In [None]:
#clean df_clients_profile
df_clients_profile = print_clean_data(df_clients_profile)

In [None]:
#clean df_experiment_clients
df_experiment_clients = print_clean_data(df_experiment_clients)

In [None]:
#clean df_web_data
df_web_data = print_clean_data(df_web_data)

In [None]:
df_web_data.head(20)

# Data Typing/Formatting

In [None]:
df_clients_profile.head(10)

In [None]:
df_clients_profile.dtypes

In [None]:
df_experiment_clients.dtypes

In [None]:
df_web_data.dtypes

In [None]:
df_web_data['date_time'] = pd.to_datetime(df_web_data['date_time'], format='%Y-%m-%d %H:%M:%S')

In [None]:
df_web_data.dtypes

In [None]:
# Cast the count-like profile columns to plain integers.
# astype(int) discards any fractional part and raises if a column still holds
# missing values, so this cell relies on the earlier cleaning step.
for _int_col in ('clnt_tenure_yr', 'clnt_tenure_mnth', 'clnt_age',
                 'num_accts', 'calls_6_mnth', 'logons_6_mnth'):
    df_clients_profile[_int_col] = df_clients_profile[_int_col].astype(int)

In [None]:
df_clients_profile.dtypes

## Create a data frame with the clients that won't participate in the experiment.

In [None]:
# Set aside the clients with no assigned variation: they are not part of the experiment.
# NOTE(review): this runs after print_clean_data(df_experiment_clients); if that helper
# drops rows with missing values, this frame will always be empty. Confirm.
_has_no_variation = df_experiment_clients['Variation'].isna()
null_df_experiment_clients = df_experiment_clients.loc[_has_no_variation]
null_df_experiment_clients

## Merge the clients in the web data

# df_test
### Identify the clients part of the test group

In [None]:
# Roster rows of the clients assigned to the Test variation.
_in_test_group = df_experiment_clients['Variation'].eq('Test')
df_experiment_clients_Test = df_experiment_clients.loc[_in_test_group]
df_experiment_clients_Test.head(10)

### df_test: combine visit_id with visitor_id and reorganize the columns.

In [None]:
# Attach the profile attributes to the Test-group roster, then keep only the
# web events that belong to those clients (inner joins on client_id).
_test_clients_with_profile = df_experiment_clients_Test.merge(
    df_clients_profile, on='client_id', how='inner'
)
final_df_test = df_web_data.merge(_test_clients_with_profile, on='client_id', how='inner')

# df_test is a second name for the same frame object, not a copy.
df_test = final_df_test
df_test.head(15)

### Create the csv file to df_test

In [None]:
df_test.to_csv('../Data/Cleaned_Data/df_test.csv', index=False)

# df_control
### Identify the clients part of the control group

In [None]:
# Roster rows of the clients assigned to the Control variation.
_in_control_group = df_experiment_clients['Variation'].eq('Control')
df_experiment_clients_control = df_experiment_clients.loc[_in_control_group]
df_experiment_clients_control.head(10)

### df_control: combine visit_id with visitor_id and reorganize the columns.

In [None]:
# Attach the profile attributes to the Control-group roster, then keep only the
# web events that belong to those clients (inner joins on client_id).
_control_clients_with_profile = df_experiment_clients_control.merge(
    df_clients_profile, on='client_id', how='inner'
)
final_df_control = df_web_data.merge(_control_clients_with_profile, on='client_id', how='inner')

# df_control is a second name for the same frame object, not a copy.
df_control = final_df_control
df_control.head(15)

### Create the csv file to df_control

In [None]:
df_control.to_csv('../Data/Cleaned_Data/df_control.csv', index=False)

# df_combine

# Combine df_test and df_control

In [None]:
# Stack the Test rows on top of the Control rows with a fresh 0..n-1 index,
# then persist the combined frame.
df_combined = pd.concat([df_test, df_control], ignore_index=True)
df_combined.to_csv('../Data/Cleaned_Data/df_combined.csv', index=False)

# Pivot and create new columns

# df_test to df_test_final

In [None]:
df_test.head(2)

In [None]:
# 1. Data preparation on df_test:
#    - make sure date_time is a datetime column,
#    - derive the calendar date of each event,
#    - build the composite visit key "<visit_id>_<visitor_id>".
_test_event_time = pd.to_datetime(df_test['date_time'])
df_test['date_time'] = _test_event_time
df_test['date'] = _test_event_time.dt.date
df_test['visit_visitor_id'] = (
    df_test['visit_id'].astype(str) + "_" + df_test['visitor_id'].astype(str)
)

In [None]:
# 2. Time spent on each step: for every event, the gap until the next event of the
#    same (visit_id, client_id) group. Rows are sorted chronologically first, so the
#    gaps are non-negative; the last event of a group has no successor and gets NaN.
_test_visit_keys = ['visit_id', 'client_id']
df_test = df_test.sort_values(by=_test_visit_keys + ['date_time'])
df_test['next_date_time'] = df_test.groupby(_test_visit_keys)['date_time'].shift(-1)
_test_gap = df_test['next_date_time'] - df_test['date_time']
df_test['time_diff_seconds'] = _test_gap.dt.total_seconds()
df_test['time_diff_minutes'] = df_test['time_diff_seconds'] / 60

# Last step of each visit: the final process_step per visit_visitor_id,
# taken in the sorted row order established above.
last_step_df = (
    df_test.groupby('visit_visitor_id')['process_step']
    .last()
    .reset_index()
    .rename(columns={'process_step': 'last_step'})
)

In [None]:
# 3. Count how many times each step occurs in every visit:
#    one indicator column per process_step value (count_<step>), summed per visit.
# NOTE(review): df_test is reassigned with the indicator columns appended, so running
# this cell twice on the same kernel appends a second set of count_* columns.
_test_step_flags = pd.get_dummies(df_test['process_step'], prefix='count')
df_test = pd.concat([df_test, _test_step_flags], axis=1)
_test_count_cols = ['count_confirm', 'count_start', 'count_step_1', 'count_step_2', 'count_step_3']
df_steps_sum = df_test.groupby(by=['client_id', 'visit_id', 'visitor_id'])[_test_count_cols].sum()

In [None]:
# 4. One row per visit: spread the per-step minutes into one column per process_step
#    (summed when a step occurs more than once; steps absent from a visit get 0),
#    then attach the per-step counts computed in step 3.
_test_visit_index = ['client_id', 'visitor_id', 'visit_id']
df_pivot = df_test.pivot_table(
    values='time_diff_minutes',
    index=_test_visit_index,
    columns='process_step',
    aggfunc='sum',
    fill_value=0,
).reset_index()
df_pivot.columns.name = None  # drop the leftover 'process_step' label on the column axis
test_time_counts = df_pivot.merge(df_steps_sum, on=_test_visit_index)


In [None]:
# 5. Attach the experiment assignment and the client profile to every Test visit,
#    then add the first and last event timestamp of each visit.
final_df_Test = test_time_counts.merge(df_experiment_clients_Test, on='client_id', how='inner')
final_df_Test = final_df_Test.merge(df_clients_profile, on='client_id', how='inner')

# min/max of date_time per visit, computed over the whole web log; the inner join
# below keeps only the visits present in final_df_Test.
_test_visit_id_cols = ['client_id', 'visit_id', 'visitor_id']
df_test_date = df_web_data.groupby(by=_test_visit_id_cols).agg({'date_time': ['min', 'max']})
# Flatten the two-level column labels into 'date_timemin' / 'date_timemax'.
df_test_date.columns = [
    label if not isinstance(label, tuple) else ''.join(label).strip()
    for label in df_test_date.columns
]
final_df_Test = df_test_date.merge(final_df_Test, on=_test_visit_id_cols, how='inner')

# Composite visit key "<visit_id>_<visitor_id>", needed to join the last-step lookup.
_test_visit_key = final_df_Test['visit_id'].astype(str) + "_" + final_df_Test['visitor_id'].astype(str)
final_df_Test['visit_visitor_id'] = _test_visit_key

# Length of the visit in minutes: last event timestamp minus first event timestamp.
_test_visit_span = final_df_Test['date_timemax'] - final_df_Test['date_timemin']
final_df_Test['total_time_visit'] = _test_visit_span.dt.total_seconds() / 60

# Bring in the last step of each visit (left join keeps visits without a match).
final_df_Test = final_df_Test.merge(last_step_df, on='visit_visitor_id', how='left')

# Calendar date of the visit's last event.
final_df_Test['date'] = final_df_Test['date_timemax'].dt.date

# The separate id columns are dropped in favour of visit_visitor_id, and the
# remaining columns receive their final names.
_test_final_names = {
    'date_timemin': 'initial_date',
    'date_timemax': 'final_date',
    'start': 'start_time',
    'confirm': 'time_completion',
    'count_confirm': 'completion',
    'count_start': 'start_step',
    'count_step_1': '1st_step',
    'count_step_2': '2nd_step',
    'count_step_3': '3rd_step',
    'Variation': 'variation',
}
final_df_Test = (
    final_df_Test
    .drop(columns=['visit_id', 'visitor_id'])
    .rename(columns=_test_final_names)
)

# Total navigations between start and last step: the start, step 1, step 2 and
# step 3 counts of the visit, plus one.
# Computed with vectorised column arithmetic (the same form the Control pipeline
# uses) instead of a row-wise apply: same values, no per-row Python overhead, and
# it also works when the frame is empty.
final_df_Test['navigations_bt_start_last'] = (
    final_df_Test['start_step']
    + final_df_Test['1st_step']
    + final_df_Test['2nd_step']
    + final_df_Test['3rd_step']
    + 1
)


In [None]:
# 6. Select the final columns, in their presentation order.
new_order = ['client_id', 'visit_visitor_id', 'start_time', 'step_1', 'step_2', 'step_3', 'time_completion', 'navigations_bt_start_last', 
             'completion', 'start_step', '1st_step', '2nd_step', '3rd_step', 'last_step', 'variation', 'clnt_tenure_yr', 'clnt_tenure_mnth', 
             'clnt_age', 'gendr', 'num_accts', 'bal', 'calls_6_mnth', 'logons_6_mnth', 'date', 'initial_date', 'total_time_visit', 'final_date']
# .copy() makes df_test_final an independent frame rather than a column selection of
# final_df_Test, so in-place edits during conversion cannot raise SettingWithCopyWarning.
df_test_final = final_df_Test[new_order].copy()

# Convert the data types BEFORE saving, so the CSV holds the converted values.
# (convert_data_types_final is not defined in this notebook; presumably it comes
# from the local `functions` module.)
df_test_final = convert_data_types_final(df_test_final)

# Save the final DataFrame to CSV
df_test_final.to_csv('../Data/Cleaned_Data/df_test_final.csv', index=False)

df_test_final.dtypes

In [None]:
df_test_final.to_csv('../Data/Cleaned_Data/df_test_final.csv', index=False)

In [None]:
# NOTE(review): the cell that builds df_test_final already applies this same call;
# repeating it here is only harmless if convert_data_types_final is idempotent. Confirm.
df_test_final = convert_data_types_final(df_test_final)

In [None]:
df_test_final.dtypes

### Create the csv file to df_test_final

In [None]:
df_test_final.to_csv('../Data/Cleaned_Data/df_test_final.csv', index=False)

# df_control to df_control_final

In [None]:
# 1. Data preparation on df_control:
#    - make sure date_time is a datetime column,
#    - derive the calendar date of each event,
#    - build the composite visit key "<visit_id>_<visitor_id>".
_control_event_time = pd.to_datetime(df_control['date_time'])
df_control['date_time'] = _control_event_time
df_control['date'] = _control_event_time.dt.date
df_control['visit_visitor_id'] = (
    df_control['visit_id'].astype(str) + "_" + df_control['visitor_id'].astype(str)
)

In [None]:
# 2. Time spent on each step: for every event, the gap until the next event of the
#    same (visit_id, client_id) group. Rows are sorted chronologically first, so the
#    gaps are non-negative; the last event of a group has no successor and gets NaN.
_control_visit_keys = ['visit_id', 'client_id']
df_control = df_control.sort_values(by=_control_visit_keys + ['date_time'])
df_control['next_date_time'] = df_control.groupby(_control_visit_keys)['date_time'].shift(-1)
_control_gap = df_control['next_date_time'] - df_control['date_time']
df_control['time_diff_seconds'] = _control_gap.dt.total_seconds()
df_control['time_diff_minutes'] = df_control['time_diff_seconds'] / 60

# Last step of each visit: the final process_step per visit_visitor_id,
# taken in the sorted row order established above.
last_step_df_control = (
    df_control.groupby('visit_visitor_id')['process_step']
    .last()
    .reset_index()
    .rename(columns={'process_step': 'last_step'})
)


In [None]:
# 3. Count how many times each step occurs in every visit:
#    one indicator column per process_step value (count_<step>), summed per visit.
# NOTE(review): df_control is reassigned with the indicator columns appended, so running
# this cell twice on the same kernel appends a second set of count_* columns.
_control_step_flags = pd.get_dummies(df_control['process_step'], prefix='count')
df_control = pd.concat([df_control, _control_step_flags], axis=1)
_control_count_cols = ['count_confirm', 'count_start', 'count_step_1', 'count_step_2', 'count_step_3']
df_steps_sum_control = df_control.groupby(by=['client_id', 'visit_id', 'visitor_id'])[_control_count_cols].sum()

In [None]:
# 4. One row per visit: spread the per-step minutes into one column per process_step
#    (summed when a step occurs more than once; steps absent from a visit get 0),
#    then attach the per-step counts computed in step 3.
_control_visit_index = ['client_id', 'visitor_id', 'visit_id']
df_pivot_control = df_control.pivot_table(
    values='time_diff_minutes',
    index=_control_visit_index,
    columns='process_step',
    aggfunc='sum',
    fill_value=0,
).reset_index()
df_pivot_control.columns.name = None  # drop the leftover 'process_step' label on the column axis
control_time_counts = df_pivot_control.merge(df_steps_sum_control, on=_control_visit_index)

In [None]:
# 4. Pivot the table to have one row per visit:
# NOTE(review): this cell repeats the pivot and merge of the preceding cell on the same
# inputs, so it recomputes the same df_pivot_control and control_time_counts.
# It looks like an accidental duplicate and is a candidate for removal.
df_pivot_control = df_control.pivot_table(index=['client_id', 'visitor_id', 'visit_id'], columns='process_step', values='time_diff_minutes', fill_value=0, aggfunc='sum')
df_pivot_control = df_pivot_control.reset_index()
df_pivot_control.columns.name = None
control_time_counts = pd.merge(df_pivot_control, df_steps_sum_control, on=['client_id', 'visitor_id', 'visit_id'])

In [None]:
# 5. Attach the experiment assignment and the client profile to every Control visit,
#    then add the first and last event timestamp of each visit.
# Join against the Control-only roster (df_experiment_clients_control), mirroring the
# Test pipeline's use of df_experiment_clients_Test, so a visit can only ever pick up
# a 'Control' assignment row.
final_df_Control = pd.merge(control_time_counts, df_experiment_clients_control, on='client_id', how='inner')
final_df_Control = pd.merge(final_df_Control, df_clients_profile, on='client_id', how='inner')

# min/max of date_time per visit, computed over the whole web log; the inner join
# below keeps only the visits present in final_df_Control.
df_control_date = df_web_data.groupby(by = ['client_id', 'visit_id', 'visitor_id']).agg({'date_time': ['min', 'max']})
# Flatten the two-level column labels into 'date_timemin' / 'date_timemax'.
df_control_date.columns = [''.join(col).strip() if isinstance(col, tuple) else col for col in df_control_date.columns]
final_df_Control = pd.merge(df_control_date, final_df_Control, on=['client_id', 'visit_id', 'visitor_id'], how='inner')

# Composite visit key "<visit_id>_<visitor_id>", needed to join the last-step lookup.
_control_visit_key = final_df_Control['visit_id'].astype(str) + "_" + final_df_Control['visitor_id'].astype(str)
final_df_Control['visit_visitor_id'] = _control_visit_key

# Length of the visit in minutes: last event timestamp minus first event timestamp.
_control_visit_span = final_df_Control['date_timemax'] - final_df_Control['date_timemin']
final_df_Control['total_time_visit'] = _control_visit_span.dt.total_seconds() / 60

# Bring in the last step of each visit (left join keeps visits without a match).
final_df_Control = final_df_Control.merge(last_step_df_control, on='visit_visitor_id', how='left')

# Calendar date of the visit's last event.
final_df_Control['date'] = final_df_Control['date_timemax'].dt.date

# The separate id columns are dropped in favour of visit_visitor_id, and the
# remaining columns receive their final names.
_control_final_names = {
    'date_timemin': 'initial_date',
    'date_timemax': 'final_date',
    'start': 'start_time',
    'confirm': 'time_completion',
    'count_confirm': 'completion',
    'count_start': 'start_step',
    'count_step_1': '1st_step',
    'count_step_2': '2nd_step',
    'count_step_3': '3rd_step',
    'Variation': 'variation',
}
final_df_Control = (
    final_df_Control
    .drop(columns=['visit_id', 'visitor_id'])
    .rename(columns=_control_final_names)
)

# Total navigations between start and last step: the start, step 1, step 2 and
# step 3 counts of the visit, plus one.
final_df_Control['navigations_bt_start_last'] = (
    final_df_Control['start_step']
    + final_df_Control['1st_step']
    + final_df_Control['2nd_step']
    + final_df_Control['3rd_step']
    + 1
)

In [None]:
# 6. Select the final columns, in their presentation order.
new_order_control = ['client_id', 'visit_visitor_id', 'start_time', 'step_1', 'step_2', 'step_3', 'time_completion', 'navigations_bt_start_last', 
                    'completion', 'start_step', '1st_step', '2nd_step', '3rd_step', 'last_step', 'variation', 'clnt_tenure_yr', 'clnt_tenure_mnth', 
                    'clnt_age', 'gendr', 'num_accts', 'bal', 'calls_6_mnth', 'logons_6_mnth', 'date', 'initial_date', 'total_time_visit', 'final_date']
# .copy() makes df_control_final an independent frame rather than a column selection of
# final_df_Control, so in-place edits during conversion cannot raise SettingWithCopyWarning.
df_control_final = final_df_Control[new_order_control].copy()

# Convert the data types BEFORE saving. The CSV used to be written first and never
# rewritten, so df_control_final.csv held unconverted values while df_test_final.csv
# held converted ones.
# (convert_data_types_final is not defined in this notebook; presumably it comes
# from the local `functions` module.)
df_control_final = convert_data_types_final(df_control_final)

# Save the final DataFrame to CSV
df_control_final.to_csv('../Data/Cleaned_Data/df_control_final.csv', index=False)

# New Data Frames

## df_control_final

In [None]:
df_control_final.head(10)

## df_test_final

In [None]:
df_test_final.head(10)

# df_final
### Build df_final (test + control) and save it as a CSV file

In [None]:
df_final = pd.concat([df_test_final, df_control_final], ignore_index=True)

In [None]:
df_final = convert_data_types(df_final)

In [None]:
df_final.to_csv('../Data/Cleaned_Data/df_final.csv', index=False)

In [None]:
df_final.sample(15)