# Exploratory Data Analysis

## Initial Settings

### Roots and Paths

In [None]:
import sys
from pathlib import Path

# Path to the project root: the parent of the current working directory
# (assumes the notebook is launched from a first-level subfolder -- TODO confirm).
project_root = Path().resolve().parent

# Make the project-local `src` package importable. The membership check keeps
# this cell idempotent, so re-running it does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

### Libraries and Dependencies

In [None]:
import src.utils as ut
import src.config as cf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# Show every column when a DataFrame is rendered (no truncation).
pd.set_option('display.max_columns', None)

# Global seaborn look: dark axes background and the 'dark' colour palette.
sns.set_style('dark')
sns.set_palette('dark')

### Data Loading

In [None]:
# Load the raw dataset from the path configured in `src.config`.
df = ut.load_data(cf.raw_data_path)
# Trailing expression: the notebook renders the first rows as the cell output.
df.head()

## Preliminary EDA

### Basic Information

In [None]:
# dataset size, printed as the (rows, columns) tuple from DataFrame.shape
print(f'Dimensions: {df.shape}')

# summary table produced by the project helper; as the last expression of the
# cell, its return value (if any) is rendered by the notebook
ut.summary_info(df)

### Missing Values

In [None]:
# Missing-value report from the project helper, pivoted on 'StreamingTV' and
# expressed as percentages with 'total' as the base.
ut.missing_values_by_pivot(
    df,
    'StreamingTV',
    return_type='percentage',
    percentage_base='total',
)

## Numerical Feature Analysis

### Selection of Numerical Features

In [None]:
# Columns stored as 64-bit integers or floats are treated as numerical features.
numeric_dtypes = ['int64', 'float64']
numerical_cols = df.select_dtypes(include=numeric_dtypes).columns

### Statistical Summary

In [None]:
# Descriptive statistics of the numeric columns, formatted cell by cell with
# the project helper and then centre-aligned through the Styler.
# NOTE(review): `applymap` (DataFrame and Styler) is deprecated since
# pandas 2.1 in favour of `map`; kept as-is for older pandas -- confirm the
# minimum supported version before switching.
summary_stats = df.describe()
formatted_stats = summary_stats.applymap(ut.format_cell)
formatted_stats.style.applymap(ut.center_align)

### Histograms

In [None]:
# One histogram (35 bins) with a KDE overlay per numerical feature.
# The Axes returned by seaborn is configured directly instead of going
# through the pyplot state machine.
for col in numerical_cols:
    ax = sns.histplot(df[col], kde=True, bins=35)
    ax.set_title(f'{col} distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Frecuencia')
    plt.show()

### Box Plots

In [None]:
# One vertical box plot per numerical feature.
for col in numerical_cols:
    sns.boxplot(y=df[col])
    plt.title(f'{col} distribution')
    # Pass the column name itself: `{col}` is a set literal and would be
    # rendered on the axis as "{'<name>'}".
    plt.ylabel(col)
    plt.show()

### Pair Plots

In [None]:
# Scatter-plot matrix of the numerical features.
numeric_frame = df[numerical_cols]
sns.pairplot(data=numeric_frame)
plt.show()

### Heatmap

In [None]:
# Pairwise correlations between the numerical features
# (DataFrame.corr with its default method).
correlation_matrix = df[numerical_cols].corr()

# Annotated heatmap drawn on an explicitly created 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='Blues',
    cbar=True,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

## Categorical Feature Analysis

### Selection of Categorical Features

In [None]:
# Categorical features = every column that is not numerical, minus:
#   - 'customerID'   (excluded explicitly; presumably a row identifier -- confirm)
#   - 'TotalCharges' (excluded explicitly; NOTE(review): presumably read as
#                     text in the raw data although it is a quantity -- confirm)
# A list comprehension over `df.columns` keeps the original column order
# (a set difference yields an arbitrary order that can change between kernel
# restarts) and does not raise if an excluded column is already absent.
non_categorical = set(numerical_cols) | {'TotalCharges', 'customerID'}
cat_columns = [col for col in df.columns if col not in non_categorical]

In [None]:
# Trailing expression: the notebook echoes the selected categorical columns.
cat_columns

### Uniqueness

In [None]:
# Uniqueness report for the categorical columns from the project helper,
# called with a category limit of 10.
ut.uniqueness_categorical_columns(
    df,
    max_categories=10,
)

### Count Plot

In [None]:
# Bar chart of category frequencies for each categorical feature.
for col in cat_columns:
    ax = sns.countplot(data=df, x=col)
    ax.set_title(f'{col} distribution')
    plt.show()

### Crosstabs

In [None]:
# Contingency table (raw counts) of every categorical feature against the
# target, drawn as an annotated heatmap.
for col in cat_columns:
    # Crossing the target with itself only yields a diagonal table, so skip it.
    if col == 'Churn':
        continue

    cross = pd.crosstab(df[col], df['Churn'])

    sns.heatmap(cross, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.show()

In [None]:
# Churn counts for every (Contract, PaymentMethod) combination.
contract_churn = pd.crosstab([df['Contract'], df["PaymentMethod"]], df['Churn'])

sns.heatmap(contract_churn, annot=True, fmt='d', cmap='Blues')
# Explicit show(), as in the other plotting cells, so the cell output is the
# figure only and not the repr of the returned Axes.
plt.show()

In [None]:
from statsmodels.graphics.mosaicplot import mosaic

# Mosaic plot of Contract against Churn.
mosaic(
    df,
    index=['Contract', 'Churn'],
    title="Contract vs Churn distribution",
)
plt.show()

## Cross-Variable Feature Analysis

### Strip Plots

In [None]:
# Individual MonthlyCharges observations per contract type (small markers).
sns.stripplot(
    data=df,
    x='Contract',
    y='MonthlyCharges',
    size=3,
)
plt.show()

### Catplots

In [None]:
# Box plots of MonthlyCharges by Contract, split by Churn (hue), with one
# panel per InternetService value.
sns.catplot(
    data=df,
    x='Contract',
    y='MonthlyCharges',
    hue='Churn',
    col='InternetService',
    kind='box',
)
# Set the top edge of the subplots to 0.9 of the figure height.
plt.subplots_adjust(top=0.9)
plt.show()

### Scatter Plot

In [None]:
# tenure vs MonthlyCharges; colour encodes Contract, marker style encodes Churn.
sns.scatterplot(
    data=df,
    x='tenure',
    y='MonthlyCharges',
    hue='Contract',
    style='Churn',
)
plt.show()

### Categorical Line Plots

In [None]:
# MonthlyCharges along tenure, one line per contract type, with error bars
# disabled (errorbar=None).
sns.lineplot(
    data=df,
    x='tenure',
    y='MonthlyCharges',
    hue='Contract',
    errorbar=None,
)
plt.show()

### Overlaid Histograms

In [None]:
# Draw the MonthlyCharges histogram (with KDE) of each contract type on the
# same axes; alpha=0.5 keeps the overlapping distributions visible.
for contract in df['Contract'].unique():
    monthly_charges = df.loc[df['Contract'] == contract, 'MonthlyCharges']
    sns.histplot(monthly_charges, label=contract, kde=True, alpha=0.5)

plt.legend()
plt.show()

### Ridge Plots

In [None]:
# NOTE(review): this ridge plot is currently disabled. It needs the third-party
# `joypy` package, which is not imported anywhere else in this notebook --
# confirm whether to install it and re-enable the cell, or drop the section.
# from joypy import joyplot
# joyplot(
#     data = df[['Contract', 'MonthlyCharges']].sort_values('Contract'),
#     by = 'Contract',
#     column = 'MonthlyCharges',
#     colormap = 'coolwarm',
#     fade = True,
#     figsize = (10, 6)
# )
# plt.show()

### Interactions Heatmap

In [None]:
# Mean MonthlyCharges for every InternetService (rows) x Contract (columns) pair.
pivot_table = df.pivot_table(
    values='MonthlyCharges',
    index='InternetService',
    columns='Contract',
    aggfunc='mean',
)

# Annotated heatmap on an explicitly created 5x3 figure.
fig, ax = plt.subplots(figsize=(5, 3))
sns.heatmap(pivot_table, annot=True, fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Average Monthly Charge by Contract Type and Internet Service')
ax.set_xlabel('Contract')
ax.set_ylabel('Internet Service')
plt.show()