# Pipeline Checks

Created to evaluate each line of the `.py` files and debug them easily in a controlled environment.

## preprocess.py

### Configurations

In [1]:
import sys
from pathlib import Path

# Path to the project root: the parent of the current working directory.
project_root = Path().resolve().parent

# Make the project's packages importable from this notebook. The membership
# check keeps the cell idempotent: re-running it no longer appends a duplicate
# entry to sys.path each time.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [2]:
import pandas as pd
import numpy as np
import src.config as cf

project path: C:\Users\Usuario\OneDrive\Documentos\Proyectos Personales\Data Science\Public Portfolio Data Science\241117 Telco Customer Churn
raw data path: C:\Users\Usuario\OneDrive\Documentos\Proyectos Personales\Data Science\Public Portfolio Data Science\241117 Telco Customer Churn\data\raw_data.csv
clean data path: C:\Users\Usuario\OneDrive\Documentos\Proyectos Personales\Data Science\Public Portfolio Data Science\241117 Telco Customer Churn\data\clean_data.csv
model path C:\Users\Usuario\OneDrive\Documentos\Proyectos Personales\Data Science\Public Portfolio Data Science\241117 Telco Customer Churn\models\model.pkl
results path C:\Users\Usuario\OneDrive\Documentos\Proyectos Personales\Data Science\Public Portfolio Data Science\241117 Telco Customer Churn\results


### Data Cleaning

In [3]:
import pandas as pd

# Small test frame with messy values: whitespace-padded strings, a literal
# 'NaN' string, missing entries (None) and a negative monthly charge.
data = dict(
    TotalCharges=['1000', '2000 ', 'NaN', ' 5000', None],
    tenure=[1, 15, 40, 70, 5],
    Contract=[' 1 year', '2 year ', 'Month-to-month', ' Month-to-month ', None],
    MonthlyCharges=[20, -50, 100, 120, 80],
)
df = pd.DataFrame(data)

# Caption first, then the frame as the cell's final expression
print("DataFrame Original:")
df

DataFrame Original:


Unnamed: 0,TotalCharges,tenure,Contract,MonthlyCharges
0,1000.0,1,1 year,20
1,2000.0,15,2 year,-50
2,,40,Month-to-month,100
3,5000.0,70,Month-to-month,120
4,,5,,80


In [4]:
import traceback

from src.preprocess import DataCleaning

# Instantiate the cleaning transformer (constructed with no arguments)
dc = DataCleaning()

# Fit, transform and show the cleaned frame
try:
    dc.fit(df)  # Fit step (original author's note: nothing critical is computed in this case)
    cleaned_df = dc.transform(df)

    print("\nDataFrame Limpio:")
    display(cleaned_df)

except Exception as e:
    print(f"Error: {e}")
    # The message alone rarely locates the fault. Also print the full traceback
    # so the failing line can be found, while still not aborting the notebook.
    traceback.print_exc()



DataFrame Limpio:


Unnamed: 0,TotalCharges,tenure,Contract,MonthlyCharges
0,1000.0,1,1 year,20
1,2000.0,15,2 year,-50
2,,40,Month-to-month,100
3,5000.0,70,Month-to-month,120
4,,5,,80


### Feature Engineering

In [5]:
import pandas as pd

# Test frame for the feature-engineering check
data = dict(
    tenure=[1, 15, 40, 70],  # a spread of tenure values
    MonthlyCharges=[20, 50, 100, 120],  # varied monthly charges
    TotalCharges=[20, 750, 4000, 8500],  # totals used to check RevenueAdjustment
    InternetService=['DSL', 'Fiber optic', 'No', 'DSL'],  # simplification cases
    MultipleLines=['No', 'Yes', 'No phone service', 'Yes'],  # simplification cases
    OnlineSecurity=['Yes', 'No', 'No internet service', 'Yes'],  # simplification cases
)

df = pd.DataFrame(data)
df

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,InternetService,MultipleLines,OnlineSecurity
0,1,20,20,DSL,No,Yes
1,15,50,750,Fiber optic,Yes,No
2,40,100,4000,No,No phone service,No internet service
3,70,120,8500,DSL,Yes,Yes


In [6]:
from src.preprocess import FeatureEngineering

# Instantiate the feature-engineering transformer under test (constructed with no arguments)
fe = FeatureEngineering()

In [7]:
# Fit the transformer on the test frame, then apply it
fe.fit(df)
transformed_df = fe.transform(df)

# Show input and output one after the other. `df` is displayed after the
# transform has run, so any in-place change to the input would show up here.
for caption, frame in (
    ("DataFrame Original:", df),
    ("\nDataFrame Transformado:", transformed_df),
):
    print(caption)
    display(frame)

DataFrame Original:


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,InternetService,MultipleLines,OnlineSecurity
0,1,20,20,DSL,No,Yes
1,15,50,750,Fiber optic,Yes,No
2,40,100,4000,No,No phone service,No internet service
3,70,120,8500,DSL,Yes,Yes



DataFrame Transformado:


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,InternetService,MultipleLines,OnlineSecurity,TotalCost,RevenueAdjustment,LogMonthlyCharges
0,1,20,20,Yes,No,Yes,20,0,2.995732
1,15,50,750,Yes,Yes,No,750,0,3.912023
2,40,100,4000,No,No,No,4000,0,4.60517
3,70,120,8500,Yes,Yes,Yes,8400,100,4.787492


### Feature Selection

In [8]:
# Toy frame for the selector check: two numeric columns, two string columns
# and a binary target.
data = dict(
    feature1=[1, 2, 3],
    feature2=[4, 5, 6],
    feature3=['A', 'B', 'C'],
    feature4=['X', 'Y', 'Z'],
    target=[0, 1, 0],
)
df = pd.DataFrame(data)

In [9]:
# Override the drop lists on the imported config module (`cf`) for this check.
# NOTE(review): presumably FeatureSelector reads these attributes - confirm in src/preprocess.py.
# The assignments mutate the module object, so they remain in effect for later
# cells until reassigned or the kernel is restarted.
cf.num_features_to_drop = ['feature1', 'feature2']
cf.cat_features_to_drop = ['feature3']

In [10]:
from src.preprocess import FeatureSelector

# Fit the selector on the toy frame, then apply it
selector = FeatureSelector()
selector.fit(df)
transformed_df = selector.transform(df)

# Plain-text before/after dump, emitted with a single print call; the "\n"
# separator reproduces the line breaks of four consecutive print() calls.
print(
    "Original DataFrame:",
    df,
    "\nDataFrame después de Feature Selection:",
    transformed_df,
    sep="\n",
)

Original DataFrame:
   feature1  feature2 feature3 feature4  target
0         1         4        A        X       0
1         2         5        B        Y       1
2         3         6        C        Z       0

DataFrame después de Feature Selection:
  feature4  target
0        X       0
1        Y       1
2        Z       0


### Outlier Detection

In [18]:
# Test frame whose last row carries extreme values in two of its columns
data = dict(
    MonthlyCharges=[20, 50, 100, 1500],  # 1500 is an outlier
    tenure=[1, 15, 40, 70],
    TotalCharges=[20, 750, 4000, 85000],  # 85000 is an outlier
)
df = pd.DataFrame(data)

# Caption, then the rich rendering of the untouched input
print("DataFrame Original:")
display(df)

DataFrame Original:


Unnamed: 0,MonthlyCharges,tenure,TotalCharges
0,20,1,20
1,50,15,750
2,100,40,4000
3,1500,70,85000


In [19]:
from src.preprocess import OutlierDetector

# Detector settings gathered in one mapping and passed as keyword arguments
detector_options = {"multiplier": 1.5, "action": "remove"}
outlier_detector = OutlierDetector(**detector_options)

# Fit on the test frame, then transform that same frame
outlier_detector.fit(df)
cleaned_df = outlier_detector.transform(df)

# Caption, then the rich rendering of the result
print("\nDataFrame sin outliers:")
display(cleaned_df)


DataFrame sin outliers:


Unnamed: 0,MonthlyCharges,tenure,TotalCharges
0,20,1,20
1,50,15,750
2,100,40,4000
