In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.stats import ttest_ind, chi2_contingency
from scipy.stats import ttest_ind

In [None]:
import sys
# Make the project's ../src directory importable (the path is relative to the
# notebook's working directory).
sys.path.append('../src')
# NOTE(review): the star import hides where helper names come from. The helpers
# called further down (frequency_proportion, cross_table) are presumably defined
# in functions.py -- confirm, and prefer importing them by name.
from functions import *

# Loading the Data

In [None]:
# Read the test-group and control-group CSV files. The paths are relative to
# the notebook's working directory.
df_test_final = pd.read_csv('../Data/raw/df_test_final.csv')
df_control_final = pd.read_csv('../Data/raw/df_control_final.csv')

# Never truncate the column list when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# df_control

## Check the data types

In [None]:
# Normalise the control-group column dtypes in a single chained pass. The
# columns end up as a mix of object, float, int, bool and datetime (not only
# integers).
df_control_final = (
    df_control_final
    .astype({
        # identifiers and labels are kept as generic objects
        'client_id': object,
        'visit_visitor_id': object,
        'last_step': object,
        'Variation': object,
        'gendr': object,
        # float columns (rounded to 2 decimals right after the cast)
        'start_time': float,
        'step_1': float,
        'step_2': float,
        'step_3': float,
        'total_time_visit': float,
        # integer columns
        'bt_1': int,
        'bt_2': int,
        'bt_3': int,
        'total_navigation': int,
        'clnt_tenure_yr': int,
        'clnt_tenure_mnth': int,
        'clnt_age': int,
        'num_accts': int,
        'calls_6_mnth': int,
        'logons_6_mnth': int,
        # boolean flag
        'completion': bool,
    })
    # Only the columns named here are rounded; every other column is untouched.
    .round({
        'start_time': 2,
        'step_1': 2,
        'step_2': 2,
        'step_3': 2,
        'total_time_visit': 2,
    })
    # Parse the date column into pandas datetimes.
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
# Inspect the dtype of every control-group column after the conversions above.
df_control_final.dtypes

## Categorical columns

In [None]:
# List the names of the control-group columns whose dtype is `object`
# (the categorical candidates, not the numerical ones).
df_control_final.select_dtypes(include="object").columns


In [None]:
# Categorical columns analysed for the control group. Named with the
# `control_` prefix to match `control_numerical_columns` in this section.
control_categorical_columns = ['last_step', 'gendr']

In [None]:
# Count the distinct values in each `object` column of the control group,
# listed from the highest count to the lowest.
(
    df_control_final
    .select_dtypes(include="object")
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# Summarise the 'last_step' column of the control group with the project helper
# (presumably frequencies and proportions, going by its name -- confirm in
# functions.py).
frequency_proportion(df_control_final, 'last_step')

In [None]:
# Same helper applied to the 'gendr' column of the control group.
frequency_proportion(df_control_final, 'gendr')

In [None]:
# NOTE(review): 'start_time' is cast to float earlier in this notebook and is
# listed in control_numerical_columns, so a per-value table may be very long --
# confirm it belongs in the categorical section.
cross_table(df_control_final, 'start_time')

In [None]:
# NOTE(review): 'start_time' is handled as a float column elsewhere in this
# notebook -- confirm that a frequency summary of it is intended here.
frequency_proportion(df_control_final, 'start_time')

In [None]:
# Table for 'last_step' in the control group. The following cells read its
# 'last_step' and 'count' columns.
tab_control_last_step = cross_table(df_control_final, 'last_step')
tab_control_last_step

In [None]:
# Share of the total 'count' held by each row of `tab_control_last_step`,
# rounded to two decimal places.
tab_control_last_step['count'].div(tab_control_last_step['count'].sum()).round(2)

In [None]:
# Bar chart of the 'count' column of `tab_control_last_step` against 'last_step'.
tab_control_last_step.plot.bar(x='last_step', y='count')
plt.title('Last Step Distribution in Control Group')
plt.xlabel('Last Step')
plt.ylabel('Count')
plt.show()

In [None]:
# Table for 'gendr' in the control group. The following cells read its
# 'gendr' and 'count' columns.
tab_control_gender = cross_table(df_control_final, 'gendr')
tab_control_gender

In [None]:
# Share of the total 'count' held by each row of `tab_control_gender`,
# rounded to two decimal places.
tab_control_gender['count'].div(tab_control_gender['count'].sum()).round(2)

In [None]:
# Pie chart of the 'count' column of `tab_control_gender`; wedges are labelled
# with the 'gendr' values and annotated with their percentage.
tab_control_gender.plot(
    kind='pie',
    y='count',
    labels=tab_control_gender['gendr'],
    autopct='%1.1f%%',
)
plt.axis('equal')  # equal axis scaling keeps the pie circular
plt.title('Gender Distribution in Control Group')
plt.show()

## Numerical columns

In [None]:
# List the names of the control-group columns that have a numeric dtype.
df_control_final.select_dtypes(include="number").columns

In [None]:
# Show the column dtypes again, next to the numeric-column selection.
df_control_final.dtypes

In [None]:
# Numeric columns of the control group; the histogram and box-plot loops
# further down iterate over this list.
control_numerical_columns = [
    # timings
    'start_time', 'step_1', 'step_2', 'step_3',
    # navigation counters
    'bt_1', 'bt_2', 'bt_3', 'total_navigation',
    'total_time_visit',
    # client profile
    'clnt_tenure_yr', 'clnt_tenure_mnth', 'clnt_age',
    'num_accts', 'bal',
    'calls_6_mnth', 'logons_6_mnth',
]
control_numerical_columns

In [None]:
# Count the distinct values in each numeric column of the control group,
# listed from the highest count to the lowest.
(
    df_control_final
    .select_dtypes(include="number")
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# Summary statistics of the control group (DataFrame.describe with defaults).
df_control_final.describe()

In [None]:
# Draw one 35-bin histogram per numeric column of the control group,
# each on its own figure.
for column in control_numerical_columns:
    plt.figure()
    df_control_final[column].hist(bins=35).set(
        title=f'Chart: {column}',
        xlabel=column,
        ylabel='Frequency',
    )
    plt.show()

## Check for outliers

In [None]:
# Show the numeric column list that the box-plot loop below iterates over.
control_numerical_columns

In [None]:
# Draw one box plot per numeric column of the CONTROL group to spot outliers.
# The data must come from df_control_final: this section analyses the control
# group, and the test group has its own box-plot loop later on.
for column in control_numerical_columns:
    plt.figure()
    df_control_final.boxplot(column=column)
    plt.title(f'Box plot of {column}')
    plt.show()

# End of the control group analysis


# df_test

## Check the data types

## Univariate Analysis - df_test

In [None]:
# Convert specified columns to integer data type
df_test_final['client_id'] = df_test_final['client_id'].astype(object)
df_test_final['visit_visitor_id'] = df_test_final['visit_visitor_id'].astype(object)
df_test_final['start_time'] = round(df_test_final['start_time'].astype(float),2)
df_test_final['step_1'] = round(df_test_final['step_1'].astype(float),2)
df_test_final['step_2'] = round(df_test_final['step_2'].astype(float),2)
df_test_final['step_3'] = round(df_test_final['step_3'].astype(float),2)
df_test_final['date'] = pd.to_datetime(df_test_final.date)
df_test_final['bt_1'] = df_test_final['bt_1'].astype(int)
df_test_final['bt_2'] = df_test_final['bt_2'].astype(int)
df_test_final['bt_3'] = df_test_final['bt_3'].astype(int)
df_test_final['total_navigation'] = df_test_final['total_navigation'].astype(int)
df_test_final['last_step'] = df_test_final['last_step'].astype(object)
df_test_final['completion'] = df_test_final['completion'].astype(bool)
df_test_final['total_time_visit'] = round(df_test_final['total_time_visit'].astype(float),2)
df_test_final['Variation'] = df_test_final['Variation'].astype(object)
df_test_final['clnt_tenure_yr'] = df_test_final['clnt_tenure_yr'].astype(int)
df_test_final['clnt_tenure_mnth'] = df_test_final['clnt_tenure_mnth'].astype(int)
df_test_final['clnt_age'] = df_test_final['clnt_age'].astype(int)
df_test_final['gendr'] = df_test_final['gendr'].astype(object)
df_test_final['num_accts'] = df_test_final['num_accts'].astype(int)
df_test_final['calls_6_mnth'] = df_test_final['calls_6_mnth'].astype(int)
df_test_final['logons_6_mnth'] = df_test_final['logons_6_mnth'].astype(int)

In [None]:
# Inspect the dtype of every test-group column after the conversions above.
df_test_final.dtypes


## Categorical columns

In [None]:
# List the names of the test-group columns whose dtype is `object`
# (the categorical candidates, not the numerical ones).
df_test_final.select_dtypes(include="object").columns


In [None]:
# Categorical columns analysed for the test group.
test_categorical_columns = [
    'last_step',
    'gendr',
]

In [None]:
# Count the distinct values in each `object` column of the test group,
# listed from the highest count to the lowest.
(
    df_test_final
    .select_dtypes(include="object")
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# Summarise the 'last_step' column of the test group with the project helper
# (presumably frequencies and proportions, going by its name -- confirm in
# functions.py).
frequency_proportion(df_test_final, 'last_step')

In [None]:
# Same helper applied to the 'gendr' column of the test group.
frequency_proportion(df_test_final, 'gendr')

In [None]:
# NOTE(review): 'start_time' is cast to float earlier in this notebook, so a
# per-value table may be very long -- confirm it belongs in the categorical
# section.
cross_table(df_test_final, 'start_time')

In [None]:
# NOTE(review): 'start_time' is handled as a float column elsewhere in this
# notebook -- confirm that a frequency summary of it is intended here.
frequency_proportion(df_test_final, 'start_time')

In [None]:
# Table for 'last_step' in the test group. The following cells read its
# 'last_step' and 'count' columns.
tab_test_last_step = cross_table(df_test_final, 'last_step')
tab_test_last_step

In [None]:
# Share of the total 'count' held by each row of `tab_test_last_step`,
# rounded to two decimal places.
tab_test_last_step['count'].div(tab_test_last_step['count'].sum()).round(2)

In [None]:
# Bar chart of the 'count' column of `tab_test_last_step` against 'last_step'.
tab_test_last_step.plot.bar(x='last_step', y='count')
plt.title('Last Step Distribution in Test Group')
plt.xlabel('Last Step')
plt.ylabel('Count')
plt.show()

In [None]:

# Table for 'gendr' in the test group. The following cells read its
# 'gendr' and 'count' columns.
tab_test_gender = cross_table(df_test_final, 'gendr')
tab_test_gender


In [None]:
# Share of the total 'count' held by each row of `tab_test_gender`,
# rounded to two decimal places.
tab_test_gender['count'].div(tab_test_gender['count'].sum()).round(2)

In [None]:
# Pie chart of the 'count' column of `tab_test_gender`; wedges are labelled
# with the 'gendr' values and annotated with their percentage.
tab_test_gender.plot(
    kind='pie',
    y='count',
    labels=tab_test_gender['gendr'],
    autopct='%1.1f%%',
)
plt.axis('equal')  # equal axis scaling keeps the pie circular
plt.title('Gender Distribution in Test Group')
plt.show()

## Numerical columns

### Centrality and Dispersion Measures

In [None]:
# List the names of the test-group columns that have a numeric dtype.
df_test_final.select_dtypes(include="number").columns

In [None]:
# Show the column dtypes again, next to the numeric-column selection.
df_test_final.dtypes

In [None]:

# Numeric columns of the test group; the histogram and box-plot loops further
# down iterate over this list.
test_numerical_columns = [
    # timings
    'start_time', 'step_1', 'step_2', 'step_3',
    # navigation counters
    'bt_1', 'bt_2', 'bt_3', 'total_navigation',
    'total_time_visit',
    # client profile
    'clnt_tenure_yr', 'clnt_tenure_mnth', 'clnt_age',
    'num_accts', 'bal',
    'calls_6_mnth', 'logons_6_mnth',
]

In [None]:
# Display the numeric column list of the test group.
test_numerical_columns

In [None]:
# Count the distinct values in each numeric column of the test group,
# listed from the highest count to the lowest.
(
    df_test_final
    .select_dtypes(include="number")
    .nunique()
    .sort_values(ascending=False)
)


In [None]:
# Summary statistics of the test group (DataFrame.describe with defaults).
df_test_final.describe()

In [None]:
# Draw one 35-bin histogram per numeric column of the test group,
# each on its own figure.
for column in test_numerical_columns:
    plt.figure()
    df_test_final[column].hist(bins=35).set(
        title=f'Chart: {column}',
        xlabel=column,
        ylabel='Frequency',
    )
    plt.show()

## Check for outliers

In [None]:
# Show the numeric column list that the box-plot loop below iterates over.
test_numerical_columns

In [None]:
# Draw one box plot per numeric column of the test group to spot outliers,
# each on its own figure.
for column in test_numerical_columns:
    plt.figure()
    df_test_final.boxplot(column=column).set_title(f'Box plot of {column}')
    plt.show()

In [None]:
# Display the numeric column list of the test group once more.
test_numerical_columns