In [None]:
!pip install pandas scikit-learn xgboost




In [None]:
import pandas as pd
from google.colab import drive

# Make Google Drive visible to the Colab runtime
drive.mount('/content/drive')

# Read the raw Telco churn CSV from Drive
file_path = '/content/drive/My Drive/ChurnProject/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(file_path)

# Show the first rows, then the (rows, columns) shape
preview = df.head()
print(preview)
print("\nDataset Shape:", df.shape)

Mounted at /content/drive
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingM

In [None]:
# Per-column count of the values pandas treats as missing (NaN/None)
missing_counts = df.isnull().sum()
print("\nMissing Values:")
print(missing_counts)


Missing Values:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced by the column median.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Assign the filled Series back to the frame instead of calling
# fillna(..., inplace=True) on df['TotalCharges']: that chained in-place call
# operates on an intermediate object, emits a FutureWarning, and no longer
# updates df under pandas 3.0 (Copy-on-Write).
df['TotalCharges'] = total_charges.fillna(total_charges.median())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


In [None]:
# Encode the target: 'Yes' -> 1, 'No' -> 0.
# Guarded so the cell can safely be re-run: pushing an already-encoded 0/1
# column through this dict matches no key and would turn every value into NaN.
churn_codes = {'Yes': 1, 'No': 0}
if df['Churn'].isin(list(churn_codes)).any():
    df['Churn'] = df['Churn'].map(churn_codes)

# TotalCharges was already converted to a numeric column in the preceding
# cleaning cell, so it is not converted a second time here.

In [None]:
# Confirm the column dtypes after the numeric/target conversions
column_dtypes = df.dtypes
print("\nData Types After Conversion:")
print(column_dtypes)


Data Types After Conversion:
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object


In [None]:
# Count fully duplicated rows, drop them, and report the resulting shape
duplicate_count = df.duplicated().sum()
print("\nNumber of Duplicates:", duplicate_count)

df = df.drop_duplicates()
print("New Shape After Removing Duplicates:", df.shape)


Number of Duplicates: 0
New Shape After Removing Duplicates: (7043, 21)


In [None]:
# Bucket tenure into three bands.
# include_lowest=True closes the first interval on the left, so any row with
# tenure == 0 lands in '0-12'. With the default right-closed bins the first
# interval is (0, 12], which excludes 0 and would leave those rows as NaN.
tenure_bins = [0, 12, 24, float('inf')]
tenure_labels = ['0-12', '13-24', '25+']
df['TenureGroup'] = pd.cut(
    df['tenure'],
    bins=tenure_bins,
    labels=tenure_labels,
    include_lowest=True,
)

In [None]:
# Columns to expand into indicator (dummy) columns
categorical_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'TenureGroup',
]

# drop_first=True omits the first level of each column, so k levels
# become k-1 indicator columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Inspect the frame after encoding: first rows, then its shape
encoded_preview = df.head()
print("\nUpdated Dataset Preview:")
print(encoded_preview)
print("\nUpdated Dataset Shape:", df.shape)


Updated Dataset Preview:
   customerID  SeniorCitizen  tenure  MonthlyCharges  TotalCharges  Churn  \
0  7590-VHVEG              0       1           29.85         29.85      0   
1  5575-GNVDE              0      34           56.95       1889.50      0   
2  3668-QPYBK              0       2           53.85        108.15      1   
3  7795-CFOCW              0      45           42.30       1840.75      0   
4  9237-HQITU              0       2           70.70        151.65      1   

   gender_Male  Partner_Yes  Dependents_Yes  PhoneService_Yes  ...  \
0        False         True           False             False  ...   
1         True        False           False              True  ...   
2         True        False           False              True  ...   
3         True        False           False             False  ...   
4        False        False           False              True  ...   

   StreamingMovies_No internet service  StreamingMovies_Yes  \
0                          

In [None]:
# Persist the encoded frame to Drive; index=False leaves the row index out of the file
cleaned_file_path = '/content/drive/My Drive/ChurnProject/cleaned_telco_churn.csv'
df.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"\nCleaned dataset saved to {cleaned_file_path}")


Cleaned dataset saved to /content/drive/My Drive/ChurnProject/cleaned_telco_churn.csv


In [None]:
import pandas as pd

# Read the cleaned churn CSV back from Drive
# NOTE(review): this path uses 'MyDrive' while the save step wrote to
# 'My Drive' — presumably the same Drive folder; confirm.
cleaned_csv = '/content/drive/MyDrive/ChurnProject/cleaned_telco_churn.csv'
df = pd.read_csv(cleaned_csv)

# Rich preview of the first rows (last expression of the cell)
df.head()


Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,TenureGroup_13-24,TenureGroup_25+
0,7590-VHVEG,0,1,29.85,29.85,0,False,True,False,False,...,False,False,False,False,True,False,True,False,False,False
1,5575-GNVDE,0,34,56.95,1889.5,0,True,False,False,True,...,False,False,True,False,False,False,False,True,False,True
2,3668-QPYBK,0,2,53.85,108.15,1,True,False,False,True,...,False,False,False,False,True,False,False,True,False,False
3,7795-CFOCW,0,45,42.3,1840.75,0,True,False,False,False,...,False,False,True,False,False,False,False,False,False,True
4,9237-HQITU,0,2,70.7,151.65,1,False,False,False,True,...,False,False,False,False,True,False,True,False,False,False


In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Build model-ready train/test splits from the cleaned dataset that is
# already loaded as `df`.

# Remove rows with missing Churn values first; .copy() gives an independent
# frame so the column assignment below does not write into a slice.
df = df.dropna(subset=['Churn']).copy()

# Encode the target only if it still holds 'Yes'/'No' labels. The cleaned CSV
# already stores Churn as 0/1, and mapping those integers through this dict
# matches no key and would replace the whole target with NaN.
churn_codes = {'Yes': 1, 'No': 0}
if df['Churn'].isin(list(churn_codes)).any():
    df['Churn'] = df['Churn'].map(churn_codes)

# Separate features and target
X = df.drop(['customerID', 'Churn'], axis=1)
y = df['Churn']
assert y.notna().all(), "Churn target contains NaN after encoding"

# Identify numerical and categorical columns
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_cols = [col for col in X.columns if col not in numeric_cols]

# One-hot encode every non-numeric column.
# NOTE(review): these columns look like 0/1 indicators already, so this mostly
# renames them (e.g. '<col>_True') — confirm whether the step is still needed.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Split BEFORE scaling so the scaler never sees the test rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows; fitting on the full data would leak test-set statistics.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Verify shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (5634, 32)
X_test shape: (1409, 32)
y_train shape: (5634,)
y_test shape: (1409,)


In [None]:
import pandas as pd

# Reload the cleaned dataset from Drive to inspect the target column
file_path = '/content/drive/My Drive/ChurnProject/cleaned_telco_churn.csv'
df = pd.read_csv(file_path)

# Report NaN count in Churn, the row count, and a sample of the column
churn_column = df['Churn']
print("Missing values in Churn:", churn_column.isnull().sum())
print("Total rows:", df.shape[0])
print("Preview of Churn column:")
print(churn_column.head())

Missing values in Churn: 0
Total rows: 7043
Preview of Churn column:
0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the confirmed (cleaned) dataset
file_path = '/content/drive/My Drive/ChurnProject/cleaned_telco_churn.csv'
df = pd.read_csv(file_path)

# Verify no missing Churn values (already confirmed, but double-check)
print("Missing values in Churn:", df['Churn'].isnull().sum())
print("Total rows:", df.shape[0])

# Separate features and target
X = df.drop(['customerID', 'Churn'], axis=1)
y = df['Churn']

# Identify numerical and categorical columns
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_cols = [col for col in X.columns if col not in numeric_cols]

# One-hot encode every non-numeric column.
# NOTE(review): these columns look like 0/1 indicators already, so this mostly
# renames them (e.g. '<col>_True') — confirm whether the step is still needed.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Split BEFORE scaling so the scaler never sees the test rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows; fitting on the full data would leak test-set statistics
# (mean/variance) into training and bias the evaluation.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Verify shapes and no NaN in y
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
print("NaN values in y_train:", y_train.isnull().sum())
print("NaN values in y_test:", y_test.isnull().sum())


Missing values in Churn: 0
Total rows: 7043
X_train shape: (5634, 32)
X_test shape: (1409, 32)
y_train shape: (5634,)
y_test shape: (1409,)
NaN values in y_train: 0
NaN values in y_test: 0


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Three candidate classifiers; the tree-based ones get a fixed random_state
log_reg = LogisticRegression(max_iter=1000)
rf_clf = RandomForestClassifier(random_state=42)
xgb_clf = XGBClassifier(random_state=42)

# Fit each classifier on the same training split, one after another
for estimator in (log_reg, rf_clf, xgb_clf):
    estimator.fit(X_train, y_train)

# Signal that all three fits completed
print("Models trained successfully!")

Models trained successfully!


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import numpy as np

# Hard class predictions on the held-out split
y_pred_log_reg = log_reg.predict(X_test)
y_pred_rf = rf_clf.predict(X_test)
y_pred_xgb = xgb_clf.predict(X_test)

# Display name -> predictions, in reporting order
predictions_by_model = {
    'Logistic Regression': y_pred_log_reg,
    'Random Forest': y_pred_rf,
    'XGBoost': y_pred_xgb,
}

# metrics[name] is [accuracy, precision, recall] for that model
scorers = (accuracy_score, precision_score, recall_score)
metrics = {
    name: [scorer(y_test, y_pred) for scorer in scorers]
    for name, y_pred in predictions_by_model.items()
}

# Per-model summary, two decimals each
for model, scores in metrics.items():
    print(f"{model}:")
    print(f"  Accuracy: {scores[0]:.2f}")
    print(f"  Precision: {scores[1]:.2f}")
    print(f"  Recall: {scores[2]:.2f}")

# Confusion matrices (rows = actual class, columns = predicted class)
print("\nConfusion Matrices:")
print("Logistic Regression:\n", confusion_matrix(y_test, y_pred_log_reg))
print("Random Forest:\n", confusion_matrix(y_test, y_pred_rf))
print("XGBoost:\n", confusion_matrix(y_test, y_pred_xgb))

Logistic Regression:
  Accuracy: 0.82
  Precision: 0.69
  Recall: 0.57
Random Forest:
  Accuracy: 0.79
  Precision: 0.64
  Recall: 0.46
XGBoost:
  Accuracy: 0.79
  Precision: 0.63
  Recall: 0.52

Confusion Matrices:
Logistic Regression:
 [[939  97]
 [162 211]]
Random Forest:
 [[941  95]
 [203 170]]
XGBoost:
 [[924 112]
 [179 194]]


In [None]:
# Predicted probability of the positive class (column 1 = churn) for every test row
y_prob_log_reg = log_reg.predict_proba(X_test)[:, 1]
y_prob_rf = rf_clf.predict_proba(X_test)[:, 1]
y_prob_xgb = xgb_clf.predict_proba(X_test)[:, 1]

# X_test holds standardized tenure/MonthlyCharges (StandardScaler output),
# which is not meaningful in a report. Pull the unscaled values for the same
# rows from df instead; X_test keeps df's index labels through drop/split.
# Assumes df is still the frame the features were derived from.
raw_features = df.loc[X_test.index, ['tenure', 'MonthlyCharges']]

# Combine raw features, true labels and model probabilities for analysis
test_results = pd.DataFrame({
    'tenure': raw_features['tenure'],
    'MonthlyCharges': raw_features['MonthlyCharges'],
    'Churn': y_test,
    'LogReg_Prob': y_prob_log_reg,
    'RF_Prob': y_prob_rf,
    'XGB_Prob': y_prob_xgb
})

# Save to CSV for Power BI
test_results_file = '/content/drive/My Drive/ChurnProject/test_results_with_probs.csv'
test_results.to_csv(test_results_file, index=False)
print(f"Test results with probabilities saved to {test_results_file}")

Test results with probabilities saved to /content/drive/My Drive/ChurnProject/test_results_with_probs.csv
