In [23]:
import pandas as pd

In [24]:
df1 = pd.read_csv("/content/customer_churn_dataset-training-master.csv")

In [25]:
# make the sample of 4000 data from the df1 into the df
df = df1.sample(n=4000)

In [26]:
df.head(1)

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
212168,217506.0,32.0,Male,27.0,24.0,9.0,26.0,Standard,Annual,602.64,30.0,1.0


In [27]:
df.dropna(axis=0 , inplace=True)

In [28]:
df.shape

(4000, 12)

In [29]:
x = df.drop(['Churn'], axis=1)
y = df['Churn']

In [30]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [31]:
x_train.head(1)

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction
11724,11733.0,50.0,Female,1.0,6.0,3.0,12.0,Basic,Annual,715.0,26.0


In [32]:
df['Subscription Type'].unique()

array(['Standard', 'Premium', 'Basic'], dtype=object)

In [33]:
df['Contract Length'].unique()

array(['Annual', 'Quarterly', 'Monthly'], dtype=object)

In [34]:
# import the one hot encoder , ordinal encoder
from sklearn.preprocessing import OneHotEncoder , OrdinalEncoder


In [35]:
# import the column transformer and pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [36]:
c1 = ColumnTransformer(transformers=[
    ("one hot encoder for gender" , OneHotEncoder(sparse_output=False , handle_unknown='ignore' , drop='first') , [2])
], remainder='passthrough')

In [37]:
c2 = ColumnTransformer(transformers=[
    ("ordinal encode for Subscription Type " , OrdinalEncoder(categories=[['Basic', 'Standard', 'Premium']]) , [7]),
    ("ordinal encode for Contract Length " , OrdinalEncoder(categories=[['Monthly', 'Annual', 'Quarterly']]) , [8]),
], remainder='passthrough')

In [38]:
# import the standard scaler
from sklearn.preprocessing import StandardScaler
c3 = ColumnTransformer(transformers=[
    ('scaler', StandardScaler(), slice(0, 11))
], remainder='passthrough')


In [39]:
pipe = Pipeline([
    ('column transformer 1', c1),
    ('column transformer 2', c2),
    ('column transformer 3', c3)
])

In [40]:
x_train1 = pipe.fit_transform(x_train)

In [41]:
df2 = pd.DataFrame(x_train1 , columns=x_train.columns)

In [42]:
df2.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction
0,-1.24599,-0.292851,-1.172129,-1.671048,0.883164,-1.759482,-1.121936,-0.195691,-0.127097,0.378535,1.338384
1,-0.022557,-1.64123,-1.172129,-0.967707,-0.893969,-1.643786,1.419713,-0.195691,-0.863222,-0.998595,-1.300733
2,1.200876,1.055528,0.853148,0.781338,0.479271,0.323045,-0.544288,0.131654,-0.740535,-0.142319,-0.497524
3,-1.24599,-0.292851,0.853148,1.733968,0.075377,-0.486827,0.033359,-1.177725,-0.372472,0.976484,-0.038547
4,-1.24599,-0.292851,0.853148,0.81943,-0.490075,-0.139739,-1.237465,-0.523036,0.731716,0.967997,-0.612268


In [44]:
x_test = pipe.transform(x_test)

In [46]:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

for name, model in models.items():
    model.fit(x_train1, y_train)
    y_pred_train = model.predict(x_train1)
    y_pred_test = model.predict(x_test)

    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    print(f"{name}:")
    print(f"  Training Accuracy: {train_accuracy}")
    print(f"  Testing Accuracy: {test_accuracy}")

Logistic Regression:
  Training Accuracy: 0.9771875
  Testing Accuracy: 0.97375
Decision Tree:
  Training Accuracy: 1.0
  Testing Accuracy: 0.9875
Random Forest:
  Training Accuracy: 1.0
  Testing Accuracy: 0.99125
SVM:
  Training Accuracy: 0.99375
  Testing Accuracy: 0.99
KNN:
  Training Accuracy: 0.974375
  Testing Accuracy: 0.97875
Naive Bayes:
  Training Accuracy: 0.984375
  Testing Accuracy: 0.9825
