In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
# Define file paths for train and test datasets
train_path = "data/train.csv"
test_path = "data/test.csv"

# Load train and test data into pandas DataFrames
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Combine train and test datasets vertically into a single DataFrame
# axis=0 means stack vertically, ignore_index=True resets the index
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Display first 5 rows of the combined DataFrame to verify the data
df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


In [18]:
""" 
Q- let's Rename the columns and remove the empty space oky 

"""

df.columns = [col.lower().replace(' ','_') for col in df.columns]
df.columns

Index(['customerid', 'age', 'gender', 'tenure', 'usage_frequency',
       'support_calls', 'payment_delay', 'subscription_type',
       'contract_length', 'total_spend', 'last_interaction', 'churn'],
      dtype='object')

In [19]:
""" 
Q- let's remove the columns with unncessry information 
A- it's the CustomerID column oky 

"""
df.drop(columns='customerid',inplace=True)
df.shape

(505207, 11)

In [20]:
from typing import Callable, List
import pandas as pd

def compute_null_percentage(column: pd.Series) -> float:
    """
    Compute the percentage of null values in a given column.

    Args:
        column (pd.Series): The input column to analyze.

    Returns:
        float: The percentage of null values in the column.
    """
    null_count: int = column.isnull().sum()
    total_count: int = len(column)

    return (null_count / total_count) * 100 if total_count > 0 else 0.0

def analyze_null_percentages(df: pd.DataFrame) -> pd.Series:
    """
    Analyze null percentages for all columns in a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame to analyze.

    Returns:
        pd.Series: A series of null percentages for each column.
    """
    return df.isnull().mean() * 100

# Call the analyze_null_percentages function
null_percentages = analyze_null_percentages(df)

# Print the results
for column, percentage in null_percentages.items():
    print(f"{column}: {percentage:.2f}% null values")

age: 0.00% null values
gender: 0.00% null values
tenure: 0.00% null values
usage_frequency: 0.00% null values
support_calls: 0.00% null values
payment_delay: 0.00% null values
subscription_type: 0.00% null values
contract_length: 0.00% null values
total_spend: 0.00% null values
last_interaction: 0.00% null values
churn: 0.00% null values


In [21]:
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def train_and_save_model(X_train, X_test, y_train, y_test, model_filename="model_pipeline.joblib"):
    """
    Trains a Logistic Regression model with preprocessing steps, performs hyperparameter tuning,
    evaluates the model, and saves the entire pipeline to a file.

    Args:
        X_train: Training data features as DataFrame.
        y_train: Training data labels.
        X_test: Test data features as DataFrame.
        y_test: Test data labels.
        model_filename (str): The name of the file to save the model pipeline to.

    Returns:
        dict: A dictionary containing the evaluation metrics, or None if an error occurs.
    """
    try:
        # Define numerical and categorical columns
        numerical_cols = ['age', 'tenure', 'usage_frequency', 'support_calls', 'payment_delay', 'total_spend', 'last_interaction']
        categorical_cols = ['gender', 'subscription_type', 'contract_length']

        # Create preprocessing steps
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_cols),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
            ],
            remainder='drop'  # Explicitly drop other columns
        )

        # Create a pipeline that includes preprocessing and the model
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', LogisticRegression(solver='liblinear'))
        ])

        # Define parameter grid for GridSearchCV
        param_grid = {
            'model__C': [0.001, 0.01, 0.1, 1, 10],
            'model__penalty': ['l1', 'l2'],
        }

        # Initialize GridSearchCV
        grid = GridSearchCV(pipeline, param_grid, cv=StratifiedKFold(5), scoring='roc_auc')
        grid.fit(X_train, y_train)
        best_pipeline = grid.best_estimator_

        # Save the entire pipeline
        joblib.dump(best_pipeline, model_filename)
        print(f"Pipeline saved to {model_filename}")

        # Make predictions and calculate performance metrics
        y_pred = best_pipeline.predict(X_test)
        y_pred_proba = best_pipeline.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        metrics = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'ROC AUC': roc_auc
        }

        return metrics

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [22]:
from sklearn.model_selection import train_test_split
from tabulate import tabulate
# Separate features and target
X = df.drop("churn", axis=1)
y = df["churn"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train and save the model
result = train_and_save_model(X_train, X_test, y_train, y_test)
if result:
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']
    print(tabulate([[result[metric] for metric in metrics]], headers=metrics, floatfmt=".4f"))

ValueError: Input y contains NaN.