In [118]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [119]:
df = pd.read_csv("Telco-Customer-Churn.csv")
print("Original shape:", df.shape)
print("Original columns:", df.columns.tolist())

Original shape: (7043, 21)
Original columns: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [120]:
df = df.drop('customerID', axis=1)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors='coerce')
df = df.dropna()

print("After cleaning shape:", df.shape)

After cleaning shape: (7032, 20)


In [126]:
print("Column analysis:")
for col in df.columns:
    if df[col].dtype == 'object':
        unique_count = df[col].nunique()
        print(f"{col}: {unique_count} unique values - {df[col].unique()}")
    else:
        print(f"{col}: {df[col].dtype} - Range: {df[col].min()} to {df[col].max()}")


Column analysis:
gender: 2 unique values - ['Female' 'Male']
SeniorCitizen: int64 - Range: 0 to 1
Partner: 2 unique values - ['Yes' 'No']
Dependents: 2 unique values - ['No' 'Yes']
tenure: int64 - Range: 1 to 72
PhoneService: 2 unique values - ['No' 'Yes']
MultipleLines: 3 unique values - ['No phone service' 'No' 'Yes']
InternetService: 3 unique values - ['DSL' 'Fiber optic' 'No']
OnlineSecurity: 3 unique values - ['No' 'Yes' 'No internet service']
OnlineBackup: 3 unique values - ['Yes' 'No' 'No internet service']
DeviceProtection: 3 unique values - ['No' 'Yes' 'No internet service']
TechSupport: 3 unique values - ['No' 'Yes' 'No internet service']
StreamingTV: 3 unique values - ['No' 'Yes' 'No internet service']
StreamingMovies: 3 unique values - ['No' 'Yes' 'No internet service']
Contract: 3 unique values - ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: 2 unique values - ['Yes' 'No']
PaymentMethod: 4 unique values - ['Electronic check' 'Mailed check' 'Bank transfer (autom

In [122]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
print(f"Final shape after encoding: {df.shape}")
print(f"Final columns count: {len(df.columns)}")

Final shape after encoding: (7032, 20)
Final columns count: 20


In [123]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=["Churn"]), df["Churn"], test_size=0.2)

In [124]:
forest_model = RandomForestClassifier(random_state=42, max_depth=9, n_estimators=95)
X_train = pd.get_dummies(x_train)
X_test  = pd.get_dummies(x_test).reindex(columns=X_train.columns, fill_value=0)
forest_model.fit(X_train, y_train)
y_pred = forest_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy*100:.2f}%")

Model accuracy: 80.88%


In [125]:
logistic_model = LogisticRegression(random_state=42, max_iter=10000)
X_train = pd.get_dummies(x_train)
X_test  = pd.get_dummies(x_test).reindex(columns=X_train.columns, fill_value=0)
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy*100:.2f}%")

Model accuracy: 81.38%
