In [1]:
# Import all the modules

In [30]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [3]:
# Read the “churn.csv” dataset and store it into a dataframe

In [31]:
df = pd.read_csv("churn.csv")
df

Unnamed: 0,risk_type,age,dependents,income,health,churn
0,5.0,67,3,5,3,0
1,4.0,43,1,1,3,0
2,5.0,58,5,5,3,0
3,4.0,28,1,1,3,1
4,5.0,74,1,5,3,0
...,...,...,...,...,...,...
957,4.0,66,4,5,3,0
958,4.0,62,3,3,3,1
959,5.0,66,4,5,3,0
960,4.0,62,3,3,3,1


In [5]:
#The presence of NULL values in the data set will affect the accuracy of the data and will require 
#data cleaning process to take place before applying any predictive modelling approach.

#Provide the Python code that returns the Boolean value (i.e. True/False) on whether each variable in df contains NULL values.

In [32]:
df.isna().any()

risk_type      True
age           False
dependents    False
income        False
health        False
churn         False
dtype: bool

In [7]:
# for-loop to print out the number of NULL values in each variable in df.

In [33]:
for i in df.isnull().sum():
    print(i)

2
0
0
0
0
0


In [34]:

# There is another method to handle NULL value , its by replacing the NuLL values with mean.
# First find the rows having NULL values., Then find the mean of that particular column and 
# then replace the NULL values in the rows with that mean.
df

Unnamed: 0,risk_type,age,dependents,income,health,churn
0,5.0,67,3,5,3,0
1,4.0,43,1,1,3,0
2,5.0,58,5,5,3,0
3,4.0,28,1,1,3,1
4,5.0,74,1,5,3,0
...,...,...,...,...,...,...
957,4.0,66,4,5,3,0
958,4.0,62,3,3,3,1
959,5.0,66,4,5,3,0
960,4.0,62,3,3,3,1


In [35]:
# this shows the rows that have null values
df[df['risk_type'].isnull()] 

Unnamed: 0,risk_type,age,dependents,income,health,churn
20,,56,4,3,1,1
208,,46,1,1,1,1


In [36]:
# The below code finds the mean of the risk_type column, and then replace the null values with mean as 4.284823284823285

Nof = df["risk_type"].replace(np.nan,'0').astype(float)
NofMean = Nof.mean()
print(NofMean)
df["risk_type"]= df['risk_type'].fillna(NofMean)
df

4.284823284823285


Unnamed: 0,risk_type,age,dependents,income,health,churn
0,5.0,67,3,5,3,0
1,4.0,43,1,1,3,0
2,5.0,58,5,5,3,0
3,4.0,28,1,1,3,1
4,5.0,74,1,5,3,0
...,...,...,...,...,...,...
957,4.0,66,4,5,3,0
958,4.0,62,3,3,3,1
959,5.0,66,4,5,3,0
960,4.0,62,3,3,3,1


In [37]:
# This code just shows the row 20 and 208 was replaced by mean
df.iloc[20:210]

Unnamed: 0,risk_type,age,dependents,income,health,churn
20,4.284823,56,4,3,1,1
21,5.000000,43,1,3,3,0
22,4.000000,42,4,4,3,1
23,5.000000,59,2,4,3,0
24,4.000000,75,4,5,3,0
...,...,...,...,...,...,...
205,5.000000,46,2,3,3,0
206,4.000000,43,4,3,3,1
207,4.000000,66,2,1,3,0
208,4.284823,46,1,1,1,1


In [13]:
#For the variable “age”, bin each row of data based on the following criteria:
#- 1: age < 20 years 
# 2: 20 <= age < 40
#- 3: 40 <= age < 60
#- 4: 60 <= age < 80
#- 5: age >= 80

#The binned values are stored as integers.

In [38]:
df['age'] = pd.cut(
    x=df['age'],
    bins = [0,20,40,60,80,float('inf')],
    labels = [0, 1, 2, 3, 4]
)
df['age'] = df['age'].astype(int)

df

Unnamed: 0,risk_type,age,dependents,income,health,churn
0,5.0,3,3,5,3,0
1,4.0,2,1,1,3,0
2,5.0,2,5,5,3,0
3,4.0,1,1,1,3,1
4,5.0,3,1,5,3,0
...,...,...,...,...,...,...
957,4.0,3,4,5,3,0
958,4.0,3,3,3,3,1
959,5.0,3,4,5,3,0
960,4.0,3,3,3,3,1


In [15]:
#Extract a sample of 500 rows of data from df and store them into df itself.
#required to set random seed and random state to 2.

In [39]:

df = df.sample(500, random_state=2)
np.random.seed(2)
df

Unnamed: 0,risk_type,age,dependents,income,health,churn
846,4.0,4,2,1,3,1
284,4.0,3,4,3,3,0
272,5.0,2,4,4,3,0
692,6.0,1,2,1,3,0
900,4.0,2,1,1,3,0
...,...,...,...,...,...,...
652,4.0,3,2,1,3,1
829,4.0,2,3,3,3,0
879,4.0,2,4,4,3,1
525,5.0,3,2,4,3,0


In [40]:
#Create the numpy arrays named X and y, stores the values of all the input variables in df.

dfX = df[['risk_type','age', 'dependents','income','health']] 
dfy = df.churn


In [41]:
X = dfX.values
y = dfy.values

In [20]:
#Using the X and y numpy arrays, split the sample dataset into 75% training data and 25% testing data.

In [42]:

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=2)

In [43]:
# just to make sure the data split is 25% and 75% for test and train data 
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(375, 5)
(375,)
(125, 5)
(125,)


In [23]:
#Build a logistic regression model, logreg, and train the model using the training dataset with the following model specifications
# Random State = 36
#C = 1e8
# Solver = ‘lbfgs’
# Maximum Iterations = 10000

In [44]:

logreg = LogisticRegression(random_state=36,solver='lbfgs', max_iter=10000)
logreg.fit(X_train,y_train)


LogisticRegression(max_iter=10000, random_state=36)

In [45]:
y_pred=logreg.predict(X_test)
print(y_pred)

[1 1 0 1 0 0 1 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 0 0 1 1 0 1 1 1 0 0 1 0 0 0
 0 0 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 1 0 1 0 1 0 1 1 1 1 1 1 0
 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0 1 1 0 1 1 1 1
 0 0 0 1 1 1 0 0 0 0 1 0 1 1]


In [None]:
#Using the logreg model, predict the target values using testing data. 

In [46]:
predictNew = logreg.predict([[4.0,3,2,1,3]])
print(predictNew)

[1]


In [47]:
predictNew = logreg.predict([[4.0,3,4,5,3]])
print(predictNew)

[0]


In [28]:
#Printing the test accuracy of the logreg model.

In [48]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.76


>