In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
 


In [22]:
from sklearn.preprocessing import FunctionTransformer


In [9]:
pip install openpyxl


Defaulting to user installation because normal site-packages is not writeable
Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl

   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   ---------------------------------------- 2/2 [openpyxl]

Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.


In [10]:
import openpyxl
print(openpyxl.__version__)


3.1.5


In [41]:
file_path = "Telco_customer_churn.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

In [11]:
import pandas as pd
df = pd.read_excel("Telco_customer_churn.xlsx")
print(df.head())


   CustomerID  Count        Country       State         City  Zip Code  \
0  3668-QPYBK      1  United States  California  Los Angeles     90003   
1  9237-HQITU      1  United States  California  Los Angeles     90005   
2  9305-CDSKC      1  United States  California  Los Angeles     90006   
3  7892-POOKP      1  United States  California  Los Angeles     90010   
4  0280-XJGEX      1  United States  California  Los Angeles     90015   

                 Lat Long   Latitude   Longitude  Gender  ...        Contract  \
0  33.964131, -118.272783  33.964131 -118.272783    Male  ...  Month-to-month   
1   34.059281, -118.30742  34.059281 -118.307420  Female  ...  Month-to-month   
2  34.048013, -118.293953  34.048013 -118.293953  Female  ...  Month-to-month   
3  34.062125, -118.315709  34.062125 -118.315709  Female  ...  Month-to-month   
4  34.039224, -118.266293  34.039224 -118.266293    Male  ...  Month-to-month   

  Paperless Billing             Payment Method  Monthly Charges Tota

In [42]:
# 2. Clean column names
df.columns = df.columns.str.strip().str.lower()

In [43]:
# 3. Target variable → churn label
if "churn label" not in df.columns:
    raise ValueError(f"'churn label' not found. Available: {df.columns.tolist()}")

df["churn"] = df["churn label"].map({"Yes": 1, "No": 0})
y = df["churn"]

In [44]:
# Drop churn-related/leakage columns
drop_cols = ["churn label", "churn value", "churn score", "cltv", "churn reason"]
X = df.drop(columns=[c for c in drop_cols if c in df.columns])


In [45]:
# 4. Fix TotalCharges (numeric)
if "total charges" in X.columns:
    X["total charges"] = pd.to_numeric(X["total charges"], errors="coerce")

In [46]:
# 5. Define numeric & categorical columns

numeric_columns = ["tenure months", "monthly charges", "total charges"]
categorical_columns = [col for col in X.columns if col not in numeric_columns]


In [47]:
# 6. Preprocessing
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_columns),
        ("cat", categorical_transformer, categorical_columns)
    ]
)


In [48]:
# 7. Build pipeline
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", RandomForestClassifier(random_state=42))
])


In [49]:
# 8. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [50]:
# 9. Hyperparameter Tuning
param_grid = {
    "clf": [RandomForestClassifier(random_state=42)],
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [None, 10, 20]
}

search = GridSearchCV(pipe, param_grid, cv=3, scoring="accuracy", n_jobs=-1, error_score="raise")
search.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'clf': [RandomForestC...ndom_state=42)], 'clf__max_depth': [None, 10, ...], 'clf__n_estimators': [100, 200]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,'raise'
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [51]:
# 10. Evaluate
y_pred = search.predict(X_test)
print("Best Params:", search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Best Params: {'clf': RandomForestClassifier(random_state=42), 'clf__max_depth': None, 'clf__n_estimators': 200}
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1035
           1       1.00      1.00      1.00       374

    accuracy                           1.00      1409
   macro avg       1.00      1.00      1.00      1409
weighted avg       1.00      1.00      1.00      1409



In [53]:
#11. Save final model
joblib.dump(search.best_estimator_, "telco_churn_pipeline.pkl")
print(" Model saved as telco_churn_pipeline.pkl")


 Model saved as telco_churn_pipeline.pkl
