In [1]:
import zipfile
import os

# Define the path to the uploaded zip file
zip_path = "/content/archive (45).zip"
extract_dir = "/mnt/data/extracted"

# Extract the contents of the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# List the extracted files
extracted_files = os.listdir(extract_dir)
extracted_files


['Telco_customer_churn.xlsx']

In [2]:
import pandas as pd

# Load the Excel file
file_path = os.path.join(extract_dir, 'Telco_customer_churn.xlsx')
df = pd.read_excel(file_path)

# Show basic info and first few rows
df.info(), df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         7043 non-null   object 
 1   Count              7043 non-null   int64  
 2   Country            7043 non-null   object 
 3   State              7043 non-null   object 
 4   City               7043 non-null   object 
 5   Zip Code           7043 non-null   int64  
 6   Lat Long           7043 non-null   object 
 7   Latitude           7043 non-null   float64
 8   Longitude          7043 non-null   float64
 9   Gender             7043 non-null   object 
 10  Senior Citizen     7043 non-null   object 
 11  Partner            7043 non-null   object 
 12  Dependents         7043 non-null   object 
 13  Tenure Months      7043 non-null   int64  
 14  Phone Service      7043 non-null   object 
 15  Multiple Lines     7043 non-null   object 
 16  Internet Service   7043 

(None,
    CustomerID  Count        Country       State         City  Zip Code  \
 0  3668-QPYBK      1  United States  California  Los Angeles     90003   
 1  9237-HQITU      1  United States  California  Los Angeles     90005   
 2  9305-CDSKC      1  United States  California  Los Angeles     90006   
 3  7892-POOKP      1  United States  California  Los Angeles     90010   
 4  0280-XJGEX      1  United States  California  Los Angeles     90015   
 
                  Lat Long   Latitude   Longitude  Gender  ...        Contract  \
 0  33.964131, -118.272783  33.964131 -118.272783    Male  ...  Month-to-month   
 1   34.059281, -118.30742  34.059281 -118.307420  Female  ...  Month-to-month   
 2  34.048013, -118.293953  34.048013 -118.293953  Female  ...  Month-to-month   
 3  34.062125, -118.315709  34.062125 -118.315709  Female  ...  Month-to-month   
 4  34.039224, -118.266293  34.039224 -118.266293    Male  ...  Month-to-month   
 
   Paperless Billing             Payment Method

In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Drop irrelevant columns
df_cleaned = df.drop(columns=[
    'CustomerID', 'Lat Long', 'City', 'State', 'Country', 'Churn Reason'
])

# Convert 'Total Charges' to numeric, coerce errors to NaN
df_cleaned['Total Charges'] = pd.to_numeric(df_cleaned['Total Charges'], errors='coerce')

# Fill missing Total Charges with median
df_cleaned['Total Charges'].fillna(df_cleaned['Total Charges'].median(), inplace=True)

# Define target and features
target = 'Churn Value'
X = df_cleaned.drop(columns=[target, 'Churn Label'])  # Drop redundant churn label
y = df_cleaned[target]

# Identify categorical columns
binary_cols = [col for col in X.select_dtypes(include='object').columns if X[col].nunique() == 2]
multi_cols = [col for col in X.select_dtypes(include='object').columns if X[col].nunique() > 2]
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Column transformer: LabelEncode binary, OneHot multi-class, scale numeric
preprocessor = ColumnTransformer(transformers=[
    ('bin', Pipeline(steps=[('le', LabelEncoder())]), binary_cols),  # Will handle separately
    ('onehot', OneHotEncoder(drop='first', sparse_output=False), multi_cols),
    ('scale', StandardScaler(), num_cols)
], remainder='drop')

# Manually encode binary columns since LabelEncoder doesn't work inside ColumnTransformer
le = LabelEncoder()
for col in binary_cols:
    X[col] = le.fit_transform(X[col])

# Apply OneHot and Scaling to the rest
onehot_scale_preprocessor = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(drop='first', sparse_output=False), multi_cols),
    ('scale', StandardScaler(), num_cols)
])

X_processed = onehot_scale_preprocessor.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Train RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Feature importance
importances = rf.feature_importances_

# Save model
model_path = "/mnt/data/random_forest_churn_model.joblib"
joblib.dump(rf, model_path)

# Output classification report and importance length
report = classification_report(y_test, y_pred)
importance_length = len(importances)
report, importance_length, model_path


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['Total Charges'].fillna(df_cleaned['Total Charges'].median(), inplace=True)


('              precision    recall  f1-score   support\n\n           0       0.94      0.96      0.95      1009\n           1       0.89      0.84      0.87       400\n\n    accuracy                           0.93      1409\n   macro avg       0.92      0.90      0.91      1409\nweighted avg       0.93      0.93      0.93      1409\n',
 30,
 '/mnt/data/random_forest_churn_model.joblib')

In [10]:
# Redefine preprocessor using compatible argument for OneHotEncoder
onehot_scale_preprocessor = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(drop='first', sparse_output=False), multi_cols),
    ('scale', StandardScaler(), num_cols)
])

# Reapply LabelEncoding to binary columns
for col in binary_cols:
    X[col] = le.fit_transform(X[col])

# Transform the data
X_processed = onehot_scale_preprocessor.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Train RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Get feature importances and classification report
importances = rf.feature_importances_
report = classification_report(y_test, y_pred)

# Save model to disk
model_path = "/mnt/data/random_forest_churn_model.joblib"
joblib.dump(rf, model_path)

report, len(importances), model_path

('              precision    recall  f1-score   support\n\n           0       0.94      0.96      0.95      1009\n           1       0.89      0.84      0.87       400\n\n    accuracy                           0.93      1409\n   macro avg       0.92      0.90      0.91      1409\nweighted avg       0.93      0.93      0.93      1409\n',
 30,
 '/mnt/data/random_forest_churn_model.joblib')