In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_excel(r"D:\Ultimate Programming\Data Bases\Machine Learning Datasets\Classification\Customer Churn Prediction.xlsx")

In [3]:
df.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [4]:
df.drop(columns=['customerID', 'MultipleLines', 'Contract'], axis=1, inplace=True)

In [5]:
df.isnull().sum().sum()

np.int64(11)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   InternetService   7043 non-null   object 
 7   OnlineSecurity    7043 non-null   object 
 8   OnlineBackup      7043 non-null   object 
 9   DeviceProtection  7043 non-null   object 
 10  TechSupport       7043 non-null   object 
 11  StreamingTV       7043 non-null   object 
 12  StreamingMovies   7043 non-null   object 
 13  PaperlessBilling  7043 non-null   object 
 14  PaymentMethod     7043 non-null   object 
 15  MonthlyCharges    7043 non-null   float64
 16  TotalCharges      7032 non-null   float64


In [7]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline

In [8]:
df.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,DSL,No,Yes,No,No,No,No,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,DSL,Yes,No,Yes,No,No,No,No,Mailed check,56.95,1889.5,No


In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df['PaperlessBilling'].value_counts()

PaperlessBilling
Yes    4161
No     2859
Name: count, dtype: int64

In [11]:
ms = LabelEncoder()
x = df.iloc[ : , :-1]
y = ms.fit_transform(df['Churn'])

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [13]:
trf1 = ColumnTransformer(transformers=[
    ('total_charges', SimpleImputer(), [16])
], remainder='passthrough')

In [14]:
trf1.fit_transform(x_train)

array([[654.85, 'Male', 0, ..., 'Yes', 'Mailed check', 20.65],
       [218.55, 'Male', 0, ..., 'No', 'Mailed check', 20.0],
       [1790.15, 'Male', 0, ..., 'No', 'Credit card (automatic)', 25.15],
       ...,
       [1857.3, 'Female', 0, ..., 'Yes', 'Credit card (automatic)',
        53.75],
       [191.1, 'Female', 0, ..., 'No', 'Mailed check', 20.35],
       [457.3, 'Male', 0, ..., 'No', 'Credit card (automatic)', 19.8]],
      dtype=object)

In [15]:
trf2 = ColumnTransformer(transformers=[
    ('gender', OneHotEncoder(drop='first', sparse_output=False), [0]),
    ('partner', OneHotEncoder(drop='first', sparse_output=False), [2]),
    ('dependent', OneHotEncoder(drop='first', sparse_output=False), [3]),
    ('phone_service', OneHotEncoder(drop='first', sparse_output=False), [5]),
    ('internet_srvice', OneHotEncoder(drop='first', sparse_output=False), [6]),
    ('online_secuirity', OneHotEncoder(drop='first', sparse_output=False), [7]),
    ('online_backup', OneHotEncoder(drop='first', sparse_output=False), [8]),
    ('device_protection', OneHotEncoder(drop='first', sparse_output=False), [9]),
    ('tech_support', OneHotEncoder(drop='first', sparse_output=False), [10]),
    ('streming_tv', OneHotEncoder(drop='first', sparse_output=False), [11]),
    ('streming_movie', OneHotEncoder(drop='first', sparse_output=False), [12]),
    ('paperless_bill', OneHotEncoder(drop='first', sparse_output=False), [13]),
    ('payment_method', OneHotEncoder(drop='first', sparse_output=False), [14]),
], remainder='passthrough')

In [16]:
trf2.fit_transform(x_train)

array([[1.00000e+00, 1.00000e+00, 1.00000e+00, ..., 2.90000e+01,
        2.06500e+01, 6.54850e+02],
       [1.00000e+00, 1.00000e+00, 1.00000e+00, ..., 1.30000e+01,
        2.00000e+01, 2.18550e+02],
       [1.00000e+00, 1.00000e+00, 1.00000e+00, ..., 7.00000e+01,
        2.51500e+01, 1.79015e+03],
       ...,
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 3.30000e+01,
        5.37500e+01, 1.85730e+03],
       [0.00000e+00, 1.00000e+00, 1.00000e+00, ..., 9.00000e+00,
        2.03500e+01, 1.91100e+02],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.60000e+01,
        1.98000e+01, 4.57300e+02]])

In [17]:
x_scaled = trf2.fit_transform(x_train)

In [18]:
x_scaled = pd.DataFrame(x_scaled)
x_scaled.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,29.0,20.65,654.85
1,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,13.0,20.0,218.55


In [19]:
print(x_scaled.shape)

(5616, 26)


In [20]:
mms = StandardScaler()

trf3 = ColumnTransformer(transformers=[
    ('scaler', StandardScaler(), slice(0, 26))
])

In [21]:
trf3.fit_transform(x_train)

ValueError: could not convert string to float: 'Male'

In [37]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

In [38]:
models = {
    'Logistic Regression' : LogisticRegression(),
    'Logistic Regression cv' : LogisticRegressionCV(),
    'Decision Tree' : DecisionTreeClassifier(),
    'Random Forest' : RandomForestClassifier(),
    'Support Vector' : SVC(),
    'Naive Bayes' : BernoulliNB()
}

In [39]:
for name, model in models.items():
    pipeline = Pipeline([
        ('trf1', trf1),
        ('trf2', trf2),
        ('trf3', trf3),
        ('classifier', model)
    ])
    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")

TypeError: 'StandardScaler' object is not iterable

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(x_train, y_train)

In [None]:
lr.score(x_test, y_test)*100

## Over Sampling

In [None]:
y.value_counts()

In [None]:
from imblearn.over_sampling import RandomOverSampler
ro = RandomOverSampler()
ro_x, ro_y = ro.fit_resample(x, y)

In [None]:
ro_y.value_counts()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(ro_x, ro_y, test_size=0.2, random_state=42)

In [None]:
lr2 = LogisticRegression()
lr2.fit(x_train, y_train)

In [None]:
lr2.score(x_test, y_test)*100

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [None]:
cm = confusion_matrix(y_test, lr2.predict(x_test))

In [None]:
sns.heatmap(cm, annot=True)
plt.show()

In [None]:
print(precision_score(y_test, lr.predict(x_test))*100)
print(recall_score(y_test, lr.predict(x_test))*100)
print(f1_score(y_test, lr.predict(x_test))*100)

## Naive Bayes Algorithm

In [None]:
ros = RandomOverSampler()
ros_x, ros_y = ros.fit_resample(x, y)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(ros_x, ros_y, test_size=0.2, random_state=42)

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [None]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [None]:
gnb.score(x_test, y_test)*100

In [None]:
bnb = BernoulliNB()
bnb.fit(x_train, y_train)

In [None]:
bnb.score(x_test, y_test)*100