In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input
# directory, then print them one per line.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Load and prepare data
def load_data(file_path):
    """Read the churn CSV and return it with a clean numeric ``TotalCharges``.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame. ``TotalCharges`` is converted with
        ``pd.to_numeric(..., errors='coerce')`` so entries that cannot be
        parsed become NaN, and those NaNs are then replaced by the median
        of the parsable values.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when ``file_path`` does not exist.
    KeyError
        If the file has no ``TotalCharges`` column.
    """
    df = pd.read_csv(file_path)
    # Basic data cleaning
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
    # Assign the filled column back instead of calling
    # df['TotalCharges'].fillna(..., inplace=True): that form operates on an
    # intermediate object, emits a FutureWarning on current pandas and is
    # documented to stop updating `df` in pandas 3.0.
    df['TotalCharges'] = total_charges.fillna(total_charges.median())
    return df

# Define features and target
def prepare_features_target(df):
    """Split the churn frame into a feature matrix and a binary target.

    The features are ``df`` without the ``customerID`` and ``Churn``
    columns. The target is ``Churn`` mapped through ``{'Yes': 1, 'No': 0}``;
    any other value is left unmapped by ``Series.map`` and becomes NaN.
    The input frame itself is not modified.

    Returns
    -------
    tuple
        ``(X, y)`` as a DataFrame and a Series.
    """
    non_feature_columns = ['customerID', 'Churn']
    churn_codes = {'Yes': 1, 'No': 0}

    features = df.drop(columns=non_feature_columns)
    target = df['Churn'].map(churn_codes)
    return features, target

# Create preprocessing pipeline
def create_preprocessor():
    """Build the unfitted column-wise preprocessing step for the churn data.

    Numerical columns are standardised with ``StandardScaler``; categorical
    columns are one-hot encoded with the first level dropped and unknown
    categories ignored at transform time.

    Only the columns named below are routed to a transformer. No
    ``remainder`` is passed to ``ColumnTransformer``, so its default applies
    to every other column of the input (scikit-learn documents that default
    as ``'drop'``).

    Returns
    -------
    ColumnTransformer
        A new, unfitted transformer with steps named ``'num'`` and ``'cat'``.
    """
    numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
    categorical_columns = [
        'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
        'PaperlessBilling', 'PaymentMethod'
    ]

    scaling_step = ('num', StandardScaler(), numeric_columns)
    encoding_step = (
        'cat',
        OneHotEncoder(drop='first', handle_unknown='ignore'),
        categorical_columns,
    )
    return ColumnTransformer(transformers=[scaling_step, encoding_step])

# Create and train pipeline
def train_pipeline(X, y, model_type='logistic'):
    """Tune, fit and evaluate a churn classifier behind the preprocessor.

    The data is split 80/20 into train and test sets, a 5-fold grid search
    (scored by F1) is run on the training part, and the refitted best
    pipeline is evaluated on the held-out part. Best parameters, the best
    cross-validation score, a classification report and a confusion matrix
    are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the columns used by
        ``create_preprocessor``.
    y : array-like
        Binary target aligned with ``X``.
    model_type : {'logistic', 'random_forest'}, default 'logistic'
        Which estimator and parameter grid to search.

    Returns
    -------
    tuple
        ``(grid_search, X_test, y_test)``: the fitted ``GridSearchCV`` and
        the held-out split it was evaluated on.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.
    """
    # Select model and parameters based on model_type. Unknown names are
    # rejected up front: previously any unrecognised value (including a
    # typo) silently trained a random forest under the wrong label.
    if model_type == 'logistic':
        # max_iter is raised from the library default because lbfgs was
        # stopping at the iteration limit for part of the grid
        # ("TOTAL NO. OF ITERATIONS REACHED LIMIT").
        model = LogisticRegression(random_state=42, max_iter=1000)
        param_grid = {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'classifier__solver': ['lbfgs', 'liblinear']
        }
    elif model_type == 'random_forest':
        model = RandomForestClassifier(random_state=42)
        param_grid = {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [None, 10, 20],
            'classifier__min_samples_split': [2, 5]
        }
    else:
        raise ValueError(
            f"Unknown model_type {model_type!r}; "
            "expected 'logistic' or 'random_forest'."
        )

    # Preprocessing lives inside the pipeline so that it is re-fitted on the
    # training folds only during cross-validation.
    pipeline = Pipeline([
        ('preprocessor', create_preprocessor()),
        ('classifier', model)
    ])

    # Perform grid search
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        scoring='f1',
        n_jobs=-1
    )

    # Split data. Stratifying on y keeps the class proportions the same in
    # the train and test parts, which matters for an imbalanced target
    # evaluated with F1. The fixed random_state makes repeated calls with
    # the same X and y return the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit model
    grid_search.fit(X_train, y_train)

    # Print results
    print(f"\n{model_type.upper()} MODEL RESULTS")
    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation score:", grid_search.best_score_)

    # Evaluate on test set
    y_pred = grid_search.predict(X_test)
    print("\nTest Set Performance:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return grid_search, X_test, y_test

# Main execution
if __name__ == "__main__":
    # Location of the dataset CSV; kept in one variable so the error message
    # below always reports the path that was actually tried.
    data_path = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'

    # Load data
    try:
        df = load_data(data_path)
    except FileNotFoundError:
        print(f"Error: Dataset file not found at '{data_path}'. "
              "Please attach the dataset or update the path.")
        # Raise SystemExit instead of calling exit(): exit() is a helper for
        # interactive shells and is not intended for use in programs.
        raise SystemExit(1)

    # Prepare features and target
    X, y = prepare_features_target(df)

    # Train and evaluate both models. The held-out split from the logistic
    # run is kept for the sample predictions at the end.
    logistic_model, X_test, y_test = train_pipeline(X, y, 'logistic')
    rf_model, _, _ = train_pipeline(X, y, 'random_forest')

    # Select best model based on cross-validation score. The comparison is
    # evaluated once so the chosen object and its label cannot disagree;
    # on a tie the random forest is selected.
    logistic_wins = logistic_model.best_score_ > rf_model.best_score_
    if logistic_wins:
        best_model, model_name = logistic_model, 'Logistic Regression'
    else:
        best_model, model_name = rf_model, 'Random Forest'

    print(f"\nBest model: {model_name} with score: {best_model.best_score_}")

    # Export the best model (the whole fitted grid-search object)
    joblib.dump(best_model, 'churn_prediction_model.joblib')
    print("\nBest model exported to 'churn_prediction_model.joblib'")

    # Example of loading and using the model
    loaded_model = joblib.load('churn_prediction_model.joblib')
    sample_prediction = loaded_model.predict(X_test.iloc[:5])
    print("\nSample predictions for first 5 test instances:", sample_prediction)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/prepro


LOGISTIC MODEL RESULTS
Best parameters: {'classifier__C': 1, 'classifier__solver': 'liblinear'}
Best cross-validation score: 0.5903417368483898

Test Set Performance:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1036
           1       0.68      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.81      0.82      0.82      1409


Confusion Matrix:
[[929 107]
 [148 225]]
