# Exploratory Data Analysis

## Initial Settings

In [11]:
import sys
from pathlib import Path

# pandas is imported here (not only in the later imports cell) because the
# display option below needs `pd`; without it this cell raises NameError on a
# fresh kernel.
import pandas as pd

# Path to the project root: the parent of the current working directory.
# NOTE(review): assumes the notebook is launched from a first-level subfolder
# of the project — confirm.
project_root = Path().resolve().parent

# Make the project root importable so the `src` package imported in the next
# cell can be resolved. The membership check keeps the cell idempotent:
# re-running it does not append duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Do not truncate columns when displaying wide DataFrames.
pd.options.display.max_columns = None

## Libraries and Dependencies

In [8]:
import src.utils as ut
import src.config as cf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Preliminary EDA

### Data Loading

In [None]:
# Load the raw dataset through the project helper, using the path declared in
# the project config module.
# NOTE(review): `ut.load_data` is presumably returning a pandas DataFrame (the
# cells below call DataFrame methods on `df`) — confirm against src/utils.
df = ut.load_data(cf.raw_data_path)
# Preview the first rows; left as the last expression so the notebook renders it.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Basic Information

In [14]:
# Dataset size as reported by `df.shape`
print(f'Dimensions: {df.shape}')

# Column dtypes and non-null counts (printed directly by `info`)
df.info()

# Statistical summary; as the last expression it is rendered as the cell output.
# NOTE(review): the rendered summary lists only SeniorCitizen, tenure and
# MonthlyCharges — TotalCharges is absent, so it is presumably not parsed as a
# numeric column; confirm its dtype.
df.describe()

Dimensions: (7043, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


### Uniqueness

In [26]:
def uniqueness_categorical_columns(df, max_categories = 10):
    """
    Prints a cardinality and frequency report for every object-dtype column.

    For each selected column the report shows:
    1. the number of unique values
    2. the percentage share of each value, most frequent first, limited to
       the first `max_categories` entries

    Parameters:
        df (pd.DataFrame): DataFrame to analyze
        max_categories (int): maximum number of values listed per column;
            when a column has more distinct values, a notice is printed
            after the listing

    Returns:
        None. The report is written to standard output.
    """
    for name in df.select_dtypes(include = 'object').columns:
        column = df[name]

        # Header lines are printed one by one, before the distribution is computed.
        print(f'--- Columna: {name} ---')
        print(f'Unique values: {column.nunique()}')
        print('Distribution:')

        # Relative frequencies scaled to percentages (value_counts sorts descending).
        shares = column.value_counts(normalize = True) * 100
        print(shares.head(max_categories).to_string())

        # Tell the reader when the listing above was cut short.
        truncated = len(shares) > max_categories
        if truncated:
            print(f'... showing the {max_categories} most common values')

        # Blank separator between column reports.
        print("\n")


In [27]:
# Report unique-value counts and percentage distributions for the object-dtype
# columns of `df`, using the default limit of 10 listed values per column.
uniqueness_categorical_columns(df)

--- Columna: customerID ---
Unique values: 7043
Distribution:
7590-VHVEG    0.014198
3791-LGQCY    0.014198
6008-NAIXK    0.014198
5956-YHHRX    0.014198
5365-LLFYV    0.014198
5855-EIBDE    0.014198
8166-ZZTFS    0.014198
0129-KPTWJ    0.014198
9128-CPXKI    0.014198
9509-MPYOD    0.014198
... showing the 10 most common values


--- Columna: gender ---
Unique values: 2
Distribution:
Male      50.47565
Female    49.52435


--- Columna: Partner ---
Unique values: 2
Distribution:
No     51.69672
Yes    48.30328


--- Columna: Dependents ---
Unique values: 2
Distribution:
No     70.041176
Yes    29.958824


--- Columna: PhoneService ---
Unique values: 2
Distribution:
Yes    90.316626
No      9.683374


--- Columna: MultipleLines ---
Unique values: 3
Distribution:
No                  48.132898
Yes                 42.183729
No phone service     9.683374


--- Columna: InternetService ---
Unique values: 3
Distribution:
Fiber optic    43.958540
DSL            34.374556
No             21.66690

### Missing Values