In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier



In [2]:
# Load the dataset
file_path = 'Telco-Customer-Churn.csv'
data = pd.read_csv(file_path)

In [3]:
telco_data = data.copy()

In [4]:
telco_data.TotalCharges = pd.to_numeric(telco_data.TotalCharges, errors='coerce')
telco_data.isnull().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [5]:
telco_data.loc[telco_data ['TotalCharges'].isnull() == True]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [6]:
#Removing missing values 
telco_data.dropna(how = 'any', inplace = True)

#telco_data.fillna(0)

In [7]:
# Preserve customerID for the output
customer_ids = telco_data['customerID']

In [8]:
# Drop customerID as it's not a relevant feature
telco_data.drop(columns=['customerID'], inplace=True)

In [9]:
# Encode categorical variables
label_encoders = {}
for column in telco_data.select_dtypes(include=['object']).columns:
    if column != 'Churn':
        le = LabelEncoder()
        telco_data[column] = le.fit_transform(telco_data[column])
        label_encoders[column] = le


In [10]:
# Encode target variable
le_churn = LabelEncoder()
telco_data['Churn'] = le_churn.fit_transform(telco_data['Churn'])

In [11]:
# Split the data into training and test sets
X = telco_data.drop(columns=['Churn'])
y = telco_data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Initialize and train the k-NN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [13]:
# Predict probabilities
y_pred_proba = knn_model.predict_proba(X)[:, 1]

In [14]:
# Create a new column in the dataset with these probabilities
telco_data['churnprobability'] = y_pred_proba * 100

In [15]:
telco_data['customerID'] = customer_ids

In [16]:
telco_data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,churnprobability,customerID
0,0,0,1,0,1,0,1,0,0,2,...,0,0,0,1,2,29.85,29.85,0,60.0,7590-VHVEG
1,1,0,0,0,34,1,0,0,2,0,...,0,0,1,0,3,56.95,1889.5,0,0.0,5575-GNVDE
2,1,0,0,0,2,1,0,0,2,2,...,0,0,0,1,3,53.85,108.15,1,40.0,3668-QPYBK
3,1,0,0,0,45,0,1,0,2,0,...,0,0,1,0,0,42.3,1840.75,0,0.0,7795-CFOCW
4,0,0,0,0,2,1,0,1,0,0,...,0,0,0,1,2,70.7,151.65,1,60.0,9237-HQITU


In [17]:
telco_data.to_csv('Telco-Customer-Churn-with-Probabilities-KNN.csv', index=False)