In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib 

In [2]:
df = pd.read_csv('final_telco.csv')
df.head()

Unnamed: 0,Customer ID,Month,Month of Joining,zip_code,Gender,Age,Married,Dependents,Number of Dependents,Location ID,...,Streaming Movies,Streaming Music,Unlimited Data,Payment Method,Status ID,Satisfaction Score,Churn Category,Churn Reason,Customer Status,Churn Value
0,hthjctifkiudi0,1,1,71638,Female,36.0,No,No,0.0,jeavwsrtakgq0,...,Yes,Yes,Yes,Credit Card,vvhwtmkbxtvsppd52013,3,Competitor,Competitor offered higher download speeds,Churned,1
1,uqdtniwvxqzeu1,6,6,72566,Male,36.472065,No,No,0.0,qcvetdmalnkw1,...,No,No,No,Bank Withdrawal,jucxaluihiluj82863,4,Not Applicable,Not Applicable,Stayed,0
2,uqdtniwvxqzeu1,7,6,72566,Male,36.442687,No,No,0.0,qcvetdmalnkw1,...,No,No,Yes,Credit Card,vjskkxphumfai57182,3,Not Applicable,Not Applicable,Stayed,0
3,uqdtniwvxqzeu1,8,6,72566,Male,36.837888,No,No,0.0,qcvetdmalnkw1,...,No,No,Yes,Wallet Balance,cdwbcrvylqca53109,4,Not Applicable,Not Applicable,Stayed,0
4,uqdtniwvxqzeu1,9,6,72566,Male,36.490214,No,No,0.0,qcvetdmalnkw1,...,Yes,No,Yes,Credit Card,whqrmeulitfj98550,1,Not Applicable,Not Applicable,Stayed,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653753 entries, 0 to 653752
Data columns (total 74 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Customer ID                 653753 non-null  object 
 1   Month                       653753 non-null  int64  
 2   Month of Joining            653753 non-null  int64  
 3   zip_code                    653753 non-null  int64  
 4   Gender                      653753 non-null  object 
 5   Age                         653753 non-null  float64
 6   Married                     653753 non-null  object 
 7   Dependents                  653753 non-null  object 
 8   Number of Dependents        653753 non-null  float64
 9   Location ID                 653753 non-null  object 
 10  Service ID                  653753 non-null  object 
 11  state                       653753 non-null  object 
 12  county                      653753 non-null  object 
 13  timezone      

In [4]:
df.isnull().sum()

Customer ID           0
Month                 0
Month of Joining      0
zip_code              0
Gender                0
                     ..
Satisfaction Score    0
Churn Category        0
Churn Reason          0
Customer Status       0
Churn Value           0
Length: 74, dtype: int64

In [5]:
print("Initial Shape:", df.shape)

Initial Shape: (653753, 74)


In [6]:
drop_cols = ['Customer ID', 'zip_code', 'Location ID', 'Service ID', 'Status ID',
             'state', 'county', 'timezone', 'area_codes', 'country',
             'Churn Category', 'Churn Reason', 'Customer Status']

df.drop(columns=drop_cols, inplace=True)
print("After Dropping:", df.shape)

After Dropping: (653753, 61)


In [7]:
# Convert object-based ARPU columns to numeric
df['arpu_4g'] = pd.to_numeric(df['arpu_4g'], errors='coerce')
df['arpu_5g'] = pd.to_numeric(df['arpu_5g'], errors='coerce')

# Fill nulls
df.fillna({
    'total_rech_data': 0,
    'night_pck_user': 0,
    'fb_user': 0,
    'Internet Type': 'Unknown',
    'Multiple Lines': 'No',
    'Unlimited Data': 'No',
}, inplace=True)

df.dropna(inplace=True)  # drop remaining rows with missing values

In [8]:
cat_cols = df.select_dtypes(include='object').columns
encoder = LabelEncoder()

for col in cat_cols:
    df[col] = encoder.fit_transform(df[col])

In [9]:
X = df.drop('Churn Value', axis=1)
y = df['Churn Value']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [12]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9567246361009649
[[87006   180]
 [ 3789   740]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     87186
           1       0.80      0.16      0.27      4529

    accuracy                           0.96     91715
   macro avg       0.88      0.58      0.62     91715
weighted avg       0.95      0.96      0.94     91715



In [13]:
joblib.dump(model, 'churn_model.pkl')
joblib.dump(encoder, 'label_encoder.pkl')

['label_encoder.pkl']

In [14]:
def feedback_loop(new_data, actual_value):
    """
    Add new record to training set and retrain model (simulated loop).
    """
    global X_train, y_train, model

    # Encode new data
    for col in cat_cols:
        if col in new_data:
            new_data[col] = encoder.transform([new_data[col]])[0]

    new_df = pd.DataFrame([new_data])
    new_X = new_df.drop('Churn Value', axis=1)
    new_y = new_df['Churn Value']

    # Append and retrain
    X_train = pd.concat([X_train, new_X])
    y_train = pd.concat([y_train, new_y])

    model.fit(X_train, y_train)
    joblib.dump(model, 'churn_model.pkl')

- Checking if model works perfectly

In [16]:
joblib.load('churn_model.pkl')

In [17]:
joblib.load('label_encoder.pkl')

In [18]:
def feedback_loop(new_data, true_label, file_path='feedback_data.csv'):
    new_data['Churn Value'] = true_label
    if os.path.exists(file_path):
        existing = pd.read_csv(file_path)
        updated = pd.concat([existing, pd.DataFrame([new_data])], ignore_index=True)
    else:
        updated = pd.DataFrame([new_data])
    updated.to_csv(file_path, index=False)
    print("Feedback recorded successfully.")

In [19]:
test_input = {
    'Gender': 'Male',
    'Age': 32,
    'Married': 'Yes',
    'Number of Dependents': 2,
    'state': 'Maharashtra',
    'county': 'Pune',
    'area_codes': '412345',
    'roam_ic': 12.5,
    'roam_og': 8.2,
    'loc_og_t2m': 45.6,
    'std_og_t2m': 30.4,
    'isd_og': 5.2,
    'total_rech_amt': 450,
    'total_rech_data': 1.5,
    'vol_4g': 2.3,
    'vol_5g': 0.0,
    'arpu': 300.5,
    'night_pck_user': 1.0,
    'fb_user': 1.0,
    'Internet Service': 'Fiber optic',
    'Online Security': 'Yes',
    'Online Backup': 'No',
    'Device Protection Plan': 'Yes',
    'Premium Tech Support': 'No',
    'Streaming TV': 'Yes',
    'Streaming Movies': 'Yes',
    'Streaming Music': 'Yes',
    'Unlimited Data': 'Yes',
    'Payment Method': 'Credit Card',
    'Satisfaction Score': 3,
}
feedback_loop(test_input, true_label=1)  # 1 means "churned"
pd.read_csv('feedback_data.csv').tail()

Feedback recorded successfully.


Unnamed: 0,Gender,Age,Married,Number of Dependents,state,county,area_codes,roam_ic,roam_og,loc_og_t2m,...,Online Backup,Device Protection Plan,Premium Tech Support,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,Payment Method,Satisfaction Score,Churn Value
0,Male,32,Yes,2,Maharashtra,Pune,412345,12.5,8.2,45.6,...,No,Yes,No,Yes,Yes,Yes,Yes,Credit Card,3,1


In [20]:
df2 = pd.read_csv('feedback_data.csv')
df2 = df2.iloc[:-1]
df2.to_csv('feedback_data.csv', index=False)
print("Last entry removed successfully.")

Last entry removed successfully.


- Trying SMOTE(Synthetic Minority Oversampling Technique) for class balancing as the churners has low accuracy

In [22]:
print(df.columns.tolist())


['Month', 'Month of Joining', 'Gender', 'Age', 'Married', 'Dependents', 'Number of Dependents', 'latitude', 'longitude', 'roam_ic', 'roam_og', 'loc_og_t2t', 'loc_og_t2m', 'loc_og_t2f', 'loc_og_t2c', 'std_og_t2t', 'std_og_t2m', 'std_og_t2f', 'std_og_t2c', 'isd_og', 'spl_og', 'og_others', 'loc_ic_t2t', 'loc_ic_t2m', 'loc_ic_t2f', 'std_ic_t2t', 'std_ic_t2m', 'std_ic_t2f', 'std_ic_t2o', 'spl_ic', 'isd_ic', 'ic_others', 'total_rech_amt', 'total_rech_data', 'vol_4g', 'vol_5g', 'arpu_5g', 'arpu_4g', 'arpu', 'night_pck_user', 'fb_user', 'aug_vbc_5g', 'offer', 'Referred a Friend', 'Number of Referrals', 'Phone Service', 'Multiple Lines', 'Internet Service', 'Internet Type', 'Streaming Data Consumption', 'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data', 'Payment Method', 'Satisfaction Score', 'Churn Value']


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pandas as pd
import pickle

#Select meaningful features based on your column names
features = [
    'Gender', 'Age', 'Married', 'Number of Dependents',
    'roam_ic', 'roam_og', 'loc_og_t2m', 'std_og_t2m',
    'total_rech_amt', 'vol_4g', 'vol_5g', 'arpu',
    'night_pck_user', 'fb_user', 'Online Backup',
    'Device Protection Plan', 'Premium Tech Support',
    'Streaming TV', 'Streaming Movies', 'Streaming Music',
    'Unlimited Data', 'Payment Method', 'Satisfaction Score'
]
target = 'Churn Value'

#Drop rows with missing values in selected columns
df_selected = df[features + [target]].dropna()

#Encode categorical features
cat_cols = df_selected.select_dtypes(include='object').columns
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df_selected[col] = le.fit_transform(df_selected[col])
    encoders[col] = le
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(encoders, f)
X = df_selected.drop(target, axis=1)
y = df_selected[target]
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42)

print("Preprocessing, encoding, and SMOTE completed.")

Preprocessing, encoding, and SMOTE completed.


In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
with open('rf_model.pkl', 'wb') as f:
    pickle.dump(model, f)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)

Accuracy: 0.9560764705207748
Confusion Matrix:
 [[83701  3424]
 [ 4236 83033]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96     87125
           1       0.96      0.95      0.96     87269

    accuracy                           0.96    174394
   macro avg       0.96      0.96      0.96    174394
weighted avg       0.96      0.96      0.96    174394



In [46]:
def retrain_model_from_feedback(original_data_path='final_telco.csv',
                                 feedback_data_path='feedback_data.csv',
                                 model_output_path='model.pkl',
                                 encoder_path='label_encoder.pkl'):
    # Load original data
    original = pd.read_csv(original_data_path)

    # Load feedback data if available
    if os.path.exists(feedback_data_path):
        feedback = pd.read_csv(feedback_data_path)

        # Ignore completely empty feedback
        if feedback.empty or feedback.dropna(how='all').empty:
            print("Feedback file is empty. Using only original data.")
            combined = original
        else:
            combined = pd.concat([original, feedback], ignore_index=True)
    else:
        print("Feedback file not found. Using only original data.")
        combined = original

    # Standardize target column name
    if "Churn" in combined.columns:
        target = "Churn"
    elif "Churn Value" in combined.columns:
        combined.rename(columns={"Churn Value": "Churn"}, inplace=True)
        target = "Churn"
    else:
        raise ValueError("Target column not found. Expected 'Churn' or 'Churn Value'.")

    # Drop rows with missing target values
    combined = combined.dropna(subset=[target])

    # Ensure target is integer
    combined[target] = combined[target].astype(int)

    # Separate features and labels
    X = combined.drop(columns=[target])
    y = combined[target]

    # Label encode categorical columns and handle missing values
    cat_cols = X.select_dtypes(include='object').columns
    encoders = {}
    
    for col in cat_cols:
        le = LabelEncoder()
        X[col] = X[col].fillna("Missing")
        X[col] = le.fit_transform(X[col].astype(str))
        encoders[col] = le

    # Fill missing numerical values with column mean
    num_cols = X.select_dtypes(include=['int64', 'float64']).columns
    X[num_cols] = X[num_cols].fillna(X[num_cols].mean())


    # Save encoders
    joblib.dump(encoders, encoder_path)

    # Balance the data using SMOTE
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)

    # Train new model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_res, y_res)
    # Save updated model
    joblib.dump(model, model_output_path)
    print("Model retrained and saved successfully.")


#### "retrain_model_from_feedback()" just call this once the feedback is collected
- it will:
1. Combine main data with feedback_data.csv
2. Re-encode it
3. Apply SMOTE and Train a fresh model
4. Save 'model.pkl' and updated encoders

In [None]:
retrain_model_from_feedback()

In [44]:
pd.read_csv("feedback_data.csv")


Unnamed: 0,Gender,Age,Married,Number of Dependents,state,county,area_codes,roam_ic,roam_og,loc_og_t2m,...,Online Backup,Device Protection Plan,Premium Tech Support,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,Payment Method,Satisfaction Score,Churn Value
0,Male,25,1,0,Maharashtra,Pune,23,12.3,9.5,25.6,...,,,,,,,,,,
1,Female,29,0,2,Maharashtra,Goa,23,12.3,9.5,25.6,...,,,,,,,,,,
2,Male,25,1,0,Madhya Pradesh,Indore,4201,20.5,15.0,50.2,...,,,,,,,,,,
