In [4]:
# 1. Imports
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# 2. Load data
df = pd.read_csv("/workspaces/telecom-churn-prediction/data/WA_Fn-UseC_-Telco-Customer-Churn.csv")
print(df.head())
print(df.info())

# 3. Split features and target
X = df.drop('Churn', axis=1)
y = df['Churn'].map({'Yes':1, 'No':0})  # convert Yes/No to 1/0

# 4. Identify column types
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
print("Categorical:", categorical_cols)
print("Numeric:", numeric_cols)

# 5. Define preprocessors (use sparse_output instead of sparse)
categorical_processor = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
numeric_processor = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_processor, categorical_cols),
        ('num', numeric_processor, numeric_cols)
    ],
    remainder='drop'
)

# 6. Define pipeline
pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# 7. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 8. Grid search for best hyperparameters
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20]
}

grid = GridSearchCV(pipe, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)

best_pipe = grid.best_estimator_
print("Best parameters:", grid.best_params_)

# 9. Evaluate
y_pred = best_pipe.predict(X_test)
print("Model performance:\n", classification_report(y_test, y_pred))

# 10. Save the trained pipeline
joblib.dump(best_pipe, "/workspaces/telecom-churn-prediction/best_pipe.pkl")
print("Pipeline saved at /workspaces/telecom-churn-prediction/best_pipe.pkl")


371.31s - Error patching args (debugger not attached to subprocess).
Traceback (most recent call last):
  File "/home/codespace/.local/lib/python3.12/site-packages/debugpy/_vendored/pydevd/_pydev_bundle/pydev_monkey.py", line 541, in patch_args
    new_args.append(_get_python_c_args(host, port, code, unquoted_args, SetupHolder.setup))
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.local/lib/python3.12/site-packages/debugpy/_vendored/pydevd/_pydev_bundle/pydev_monkey.py", line 193, in _get_python_c_args
    if "__future__" in code:
       ^^^^^^^^^^^^^^^^^^^^
TypeError: a bytes-like object is required, not 'str'


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


Best parameters: {'clf__max_depth': None, 'clf__n_estimators': 200}
Model performance:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87      1035
           1       0.65      0.48      0.55       374

    accuracy                           0.79      1409
   macro avg       0.74      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409

Pipeline saved at /workspaces/telecom-churn-prediction/best_pipe.pkl


In [5]:
import joblib
import pandas as pd

# Load the trained pipeline
best_pipe = joblib.load("/workspaces/telecom-churn-prediction/best_pipe.pkl")
print("Model loaded successfully!")


Model loaded successfully!


In [6]:
# Load the same dataset again
df = pd.read_csv("/workspaces/telecom-churn-prediction/data/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Take a few rows as test examples
sample = df.sample(5, random_state=42)  # 5 random customers
print("Sample data:")
print(sample)

# Drop the target column if present
X_new = sample.drop("Churn", axis=1)

# Predict churn
predictions = best_pipe.predict(X_new)
print("\nPredictions (0 = No Churn, 1 = Churn):")
print(predictions)


Sample data:
      customerID  gender  SeniorCitizen Partner Dependents  tenure  \
185   1024-GUALD  Female              0     Yes         No       1   
2715  0484-JPBRU    Male              0      No         No      41   
3825  3620-EHIMZ  Female              0     Yes        Yes      52   
1807  6910-HADCM  Female              0      No         No       1   
132   8587-XYZSF    Male              0      No         No      67   

     PhoneService     MultipleLines InternetService       OnlineSecurity  ...  \
185            No  No phone service             DSL                   No  ...   
2715          Yes               Yes              No  No internet service  ...   
3825          Yes                No              No  No internet service  ...   
1807          Yes                No     Fiber optic                   No  ...   
132           Yes                No             DSL                   No  ...   

         DeviceProtection          TechSupport          StreamingTV  \
185     

In [1]:
def predict_single_customer(model):
    # Example customer data (replace with actual data if you want to test different cases)
    input_data = {
        'gender': 'Female',
        'SeniorCitizen': 0,
        'Partner': 'Yes',
        'Dependents': 'No',
        'tenure': 12,
        'PhoneService': 'Yes',
        'MultipleLines': 'No',
        'InternetService': 'Fiber optic',
        'OnlineSecurity': 'No',
        'OnlineBackup': 'Yes',
        'DeviceProtection': 'No',
        'TechSupport': 'No',
        'StreamingTV': 'Yes',
        'StreamingMovies': 'Yes',
        'Contract': 'Month-to-month',
        'PaperlessBilling': 'Yes',
        'PaymentMethod': 'Electronic check',
        'MonthlyCharges': 70.35,
        'TotalCharges': 845.5
    }

    # Convert to DataFrame
    new_df = pd.DataFrame([input_data])

    # Make prediction
    prediction = model.predict(new_df)[0]
    print("\nPrediction: Churn" if prediction == 1 else "\nPrediction: No Churn")

# Call the function
predict_single_customer(best_pipe)


NameError: name 'best_pipe' is not defined