# Pipeline Checks

Created to evalute each line of files .py and debug them easily in a controlled enviroment

## preprocess.py

### Configurations

In [1]:
import sys
from pathlib import Path

# Ruta a la raíz del proyecto
project_root = Path().resolve().parent
sys.path.append(str(project_root))

In [2]:
import pandas as pd
import numpy as np
import src.config as cf

project path: C:\Users\Usuario\OneDrive\Documentos\Proyectos Personales\Data Science\Public Portfolio Data Science\241117 Telco Customer Churn
raw data path: C:\Users\Usuario\OneDrive\Documentos\Proyectos Personales\Data Science\Public Portfolio Data Science\241117 Telco Customer Churn\data\raw_data.csv
clean data path: C:\Users\Usuario\OneDrive\Documentos\Proyectos Personales\Data Science\Public Portfolio Data Science\241117 Telco Customer Churn\data\clean_data.csv
model path C:\Users\Usuario\OneDrive\Documentos\Proyectos Personales\Data Science\Public Portfolio Data Science\241117 Telco Customer Churn\models\model.pkl
results path C:\Users\Usuario\OneDrive\Documentos\Proyectos Personales\Data Science\Public Portfolio Data Science\241117 Telco Customer Churn\results


### Data Cleaning

In [3]:
import pandas as pd

# Crear un DataFrame de prueba
data = {
    'TotalCharges': ['1000', '2000 ', 'NaN', ' 5000', None],
    'tenure': [1, 15, 40, 70, 5],
    'Contract': [' 1 year', '2 year ', 'Month-to-month', ' Month-to-month ', None],
    'MonthlyCharges': [20, -50, 100, 120, 80]
}

df = pd.DataFrame(data)

print("DataFrame Original:")
df

DataFrame Original:


Unnamed: 0,TotalCharges,tenure,Contract,MonthlyCharges
0,1000.0,1,1 year,20
1,2000.0,15,2 year,-50
2,,40,Month-to-month,100
3,5000.0,70,Month-to-month,120
4,,5,,80


In [4]:
from src.preprocess import DataCleaning

# Crear una instancia de la clase
dc = DataCleaning()

# Aplicar transformaciones
try:
    dc.fit(df)  # Ajustar cálculos necesarios (aunque en este caso no hace nada crítico)
    cleaned_df = dc.transform(df)

    print("\nDataFrame Limpio:")
    display(cleaned_df)

except Exception as e:
    print(f"Error: {e}")



DataFrame Limpio:


Unnamed: 0,TotalCharges,tenure,Contract,MonthlyCharges
0,1000.0,1,1 year,20
1,2000.0,15,2 year,-50
2,,40,Month-to-month,100
3,5000.0,70,Month-to-month,120
4,,5,,80


### Feature Engineering

In [5]:
import pandas as pd

# Crear un DataFrame de prueba
data = {
    'tenure': [1, 15, 40, 70],  # Diferentes valores de antigüedad
    'MonthlyCharges': [20, 50, 100, 120],  # Cargos mensuales variados
    'TotalCharges': [20, 750, 4000, 8500],  # Totales para verificar RevenueAdjustment
    'InternetService': ['DSL', 'Fiber optic', 'No', 'DSL'],  # Casos de simplificación
    'MultipleLines': ['No', 'Yes', 'No phone service', 'Yes'],  # Casos de simplificación
    'OnlineSecurity': ['Yes', 'No', 'No internet service', 'Yes']  # Casos de simplificación
}

df = pd.DataFrame(data)
df

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,InternetService,MultipleLines,OnlineSecurity
0,1,20,20,DSL,No,Yes
1,15,50,750,Fiber optic,Yes,No
2,40,100,4000,No,No phone service,No internet service
3,70,120,8500,DSL,Yes,Yes


In [6]:
from src.preprocess import FeatureEngineering

# Crear una instancia de la clase
fe = FeatureEngineering()

In [7]:
# Aplicar las transformaciones
fe.fit(df)
transformed_df = fe.transform(df)

print("DataFrame Original:")
display(df)
print("\nDataFrame Transformado:")
display(transformed_df)

DataFrame Original:


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,InternetService,MultipleLines,OnlineSecurity
0,1,20,20,DSL,No,Yes
1,15,50,750,Fiber optic,Yes,No
2,40,100,4000,No,No phone service,No internet service
3,70,120,8500,DSL,Yes,Yes



DataFrame Transformado:


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,InternetService,MultipleLines,OnlineSecurity,TotalCost,RevenueAdjustment,LogMonthlyCharges
0,1,20,20,Yes,No,Yes,20,0,2.995732
1,15,50,750,Yes,Yes,No,750,0,3.912023
2,40,100,4000,No,No,No,4000,0,4.60517
3,70,120,8500,Yes,Yes,Yes,8400,100,4.787492


### Feature Selection

In [8]:
data = {
    'feature1': [1, 2, 3],
    'feature2': [4, 5, 6],
    'feature3': ['A', 'B', 'C'],
    'feature4': ['X', 'Y', 'Z'],
    'target': [0, 1, 0]
}
df = pd.DataFrame(data)

In [9]:
cf.num_features_to_drop = ['feature1', 'feature2']
cf.cat_features_to_drop = ['feature3']

In [10]:
from src.preprocess import FeatureSelector

selector = FeatureSelector()
selector.fit(df)
transformed_df = selector.transform(df)

print("Original DataFrame:")
display(df)
print("\nDataFrame después de Feature Selection:")
display(transformed_df)

Original DataFrame:


Unnamed: 0,feature1,feature2,feature3,feature4,target
0,1,4,A,X,0
1,2,5,B,Y,1
2,3,6,C,Z,0



DataFrame después de Feature Selection:


Unnamed: 0,feature4,target
0,X,0
1,Y,1
2,Z,0


### Outlier Detection

In [11]:
data = {
    'MonthlyCharges': [20, 50, 100, 1500],  # 1500 es un outlier
    'tenure': [1, 15, 40, 70],
    'TotalCharges': [20, 750, 4000, 85000]  # 85000 es un outlier
}
df = pd.DataFrame(data)

print("DataFrame Original:")
display(df)

DataFrame Original:


Unnamed: 0,MonthlyCharges,tenure,TotalCharges
0,20,1,20
1,50,15,750
2,100,40,4000
3,1500,70,85000


In [12]:
from src.preprocess import OutlierDetector

# Crear instancia de la clase
outlier_detector = OutlierDetector(multiplier=1.5, action="remove")

# Ajustar y transformar
outlier_detector.fit(df)
cleaned_df = outlier_detector.transform(df)

print("\nDataFrame sin outliers:")
display(cleaned_df)


DataFrame sin outliers:


Unnamed: 0,MonthlyCharges,tenure,TotalCharges
0,20,1,20
1,50,15,750
2,100,40,4000


### Missing Values Handling

In [13]:
data = {
    "age": [25, 30, None, 40],
    "income": [50000, None, 45000, None],
    "gender": ["Male", None, "Female", "Male"]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
display(df)

Original DataFrame:


Unnamed: 0,age,income,gender
0,25.0,50000.0,Male
1,30.0,,
2,,45000.0,Female
3,40.0,,Male


In [14]:
from src.preprocess import MissingValuesHandler

# Using default strategies from config.py
mvh_default = MissingValuesHandler()
mvh_default.fit(df)
cleaned_df_default = mvh_default.transform(df)

print("\nDataFrame with default strategies:")
display(cleaned_df_default)

# Overriding the strategy for numerical columns
mvh_override = MissingValuesHandler(numerical_strategy="mean")
mvh_override.fit(df)
cleaned_df_override = mvh_override.transform(df)

print("\nDataFrame with overridden numerical strategy (mean):")
display(cleaned_df_override)


DataFrame with default strategies:


Unnamed: 0,age,income,gender
0,25.0,50000.0,Male
1,30.0,47500.0,Male
2,30.0,45000.0,Female
3,40.0,47500.0,Male



DataFrame with overridden numerical strategy (mean):


Unnamed: 0,age,income,gender
0,25.0,50000.0,Male
1,30.0,47500.0,Male
2,31.666667,45000.0,Female
3,40.0,47500.0,Male


### Encoding

In [15]:
import pandas as pd

data = {
    "Churn": ["Yes", "No", "Yes", "No"],
    "InternetService": ["Yes", "No", "Yes", "No"],  # Columna simplificada
    "Contract": ["Month-to-month", "Two year", "One year", "Month-to-month"],
    "PaymentMethod": ["Credit card", "Bank transfer", "Electronic check", "Mailed check"],
    "MultipleLines": ["Yes", "No", "Yes", "No"],  # Columna simplificada
    "OnlineSecurity": ["No", "No", "Yes", "Yes"]  # Columna simplificada
}

df = pd.DataFrame(data)
print("Original DataFrame:")
display(df)


Original DataFrame:


Unnamed: 0,Churn,InternetService,Contract,PaymentMethod,MultipleLines,OnlineSecurity
0,Yes,Yes,Month-to-month,Credit card,Yes,No
1,No,No,Two year,Bank transfer,No,No
2,Yes,Yes,One year,Electronic check,Yes,Yes
3,No,No,Month-to-month,Mailed check,No,Yes


In [16]:
from src.preprocess import CategoricalEncoder

# Crear instancia de la clase
encoder = CategoricalEncoder()

# Aplicar encoding
encoded_df = encoder.fit_transform(df)

print("\nDataFrame después del Encoding:")
display(encoded_df)


binary encoding applied to : MultipleLines
binary encoding applied to : InternetService
binary encoding applied to : OnlineSecurity
binary encoding applied to : Churn
ordinal encoding applied to : Contract
one - hot encoding applied to : PaymentMethod

DataFrame después del Encoding:


Unnamed: 0,Churn,InternetService,Contract,MultipleLines,OnlineSecurity,PaymentMethod_Bank transfer,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,1,0,1,0,0,1,0,0
1,0,0,1,0,0,1,0,0,0
2,1,1,2,1,1,0,0,1,0
3,0,0,0,0,1,0,0,0,1


### Scaling

In [17]:
data = {
    "Feature1": [10, 20, 30, 40, 50],
    "Feature2": [100, 200, 300, 400, 500],
    "Target": [0, 0, 0, 0, 1]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
display(df)

Original DataFrame:


Unnamed: 0,Feature1,Feature2,Target
0,10,100,0
1,20,200,0
2,30,300,0
3,40,400,0
4,50,500,1


In [20]:
from src.preprocess import Scaling

# Crear una instancia de la clase
scaler = Scaling(feature_range=(0, 1))

# Ajustar el scaler al dataset
scaler.fit(df)

# Transformar el dataset
scaled_df = scaler.transform(df)

print("\nDataFrame después de Min-Max Scaling:")
display(scaled_df)



DataFrame después de Min-Max Scaling:


Unnamed: 0,Feature1,Feature2,Target
0,0.0,0.0,0.0
1,0.25,0.25,0.0
2,0.5,0.5,0.0
3,0.75,0.75,0.0
4,1.0,1.0,1.0


### Data Augmentation

In [20]:
data = {
    "Feature1": [1, 2, 3, 4, 5],
    "Feature2": [10, 20, 30, 40, 50],
    "Target": [0, 1, 0, 0, 1]  # Clase minoritaria
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)


Original DataFrame:
   Feature1  Feature2  Target
0         1        10       0
1         2        20       1
2         3        30       0
3         4        40       0
4         5        50       1


In [22]:
from src.preprocess import DataAugmentation

# Crear instancia para oversampling
augmenter_oversample = DataAugmentation(method="oversample", target_col="Target")

# Aplicar Data Augmentation
augmented_df_oversample = augmenter_oversample.fit_transform(df)

print("\nDataFrame después de Oversampling:")
display(augmented_df_oversample)



DataFrame después de Oversampling:


Unnamed: 0,Feature1,Feature2,Target
0,1,10,0
1,2,20,1
2,3,30,0
3,4,40,0
4,5,50,1
5,5,50,1
6,2,20,1
7,5,50,1


In [24]:
# Crear instancia de DataAugmentation con k_neighbors ajustado
augmenter_smote = DataAugmentation(method="smote", target_col="Target", k_neighbors=1)

# Aplicar Data Augmentation
augmented_df_smote = augmenter_smote.fit_transform(df)

print("\nDataFrame después de SMOTE:")
display(augmented_df_smote)



DataFrame después de SMOTE:


Unnamed: 0,Feature1,Feature2,Target
0,1,10,0
1,2,20,1
2,3,30,0
3,4,40,0
4,5,50,1
5,2,27,1


In [25]:
print(augmented_df_oversample["Target"].value_counts())
print(augmented_df_smote["Target"].value_counts())

1    5
0    3
Name: Target, dtype: int64
0    3
1    3
Name: Target, dtype: int64


### Complete Pipeline

In [21]:
import pandas as pd
import numpy as np

# Dataset ficticio
data = {
    "CustomerID": [1, 2, 3, 4, 5, 6, 7, 8],
    "tenure": [12, 24, 36, 48, 60, np.nan, 84, 96],
    "MonthlyCharges": [20.5, 30.2, 40.1, 50.0, 60.3, 70.1, np.nan, 90.5],
    "TotalCharges": ["245", "nan", "1352", "2400", "3015", "nan", "7020", "8500"],
    "InternetService": ["DSL", "Fiber optic", "No", "DSL", "Fiber optic", "DSL", "No", "Fiber optic"],
    "MultipleLines": ["Yes", "No phone service", "No", "Yes", "Yes", "No", "No phone service", "Yes"],
    "OnlineSecurity": ["No", "No internet service", "Yes", "Yes", "No", "No internet service", "No", "Yes"],
    "Contract": ["Month-to-month", "Two year", "One year", "Month-to-month", "Two year", "Month-to-month", "Two year", "One year"],
    "Churn": ["No", "Yes", "No", "No", "Yes", "No", "No", "Yes"]
}

df = pd.DataFrame(data)
print("Dataset Ficticio:")
display(df)


Dataset Ficticio:


Unnamed: 0,CustomerID,tenure,MonthlyCharges,TotalCharges,InternetService,MultipleLines,OnlineSecurity,Contract,Churn
0,1,12.0,20.5,245.0,DSL,Yes,No,Month-to-month,No
1,2,24.0,30.2,,Fiber optic,No phone service,No internet service,Two year,Yes
2,3,36.0,40.1,1352.0,No,No,Yes,One year,No
3,4,48.0,50.0,2400.0,DSL,Yes,Yes,Month-to-month,No
4,5,60.0,60.3,3015.0,Fiber optic,Yes,No,Two year,Yes
5,6,,70.1,,DSL,No,No internet service,Month-to-month,No
6,7,84.0,,7020.0,No,No phone service,No,Two year,No
7,8,96.0,90.5,8500.0,Fiber optic,Yes,Yes,One year,Yes


In [23]:
from sklearn.pipeline import Pipeline
from src.preprocess import DataAugmentation

preprocessing_pipeline = Pipeline([
    ('feature_selection', FeatureSelector()),
    ('data_cleaning', DataCleaning()),
    ('feature_engineering', FeatureEngineering()),
    ('outlier_detection', OutlierDetector()),
    ('missing_values', MissingValuesHandler()),
    ('data_augmentation', DataAugmentation(method = 'smote', target_col= 'Target', active = True)),
    ('scaling', Scaling())
])

# Usar el pipeline
preprocessing_pipeline.fit(df)
processed_df = preprocessing_pipeline.transform(df)

MissingColumnError: The following missing columns are missing: feature3, feature1, feature2