In [1]:
#introducing sklearn, which is the "standard" machine learning package for python
from sklearn import linear_model
import pandas as pd

In [2]:
# Load the customer-churn table from the CSV sitting next to the notebook,
# then show the first five rows as a quick sanity check of the columns.
data = pd.read_csv(filepath_or_buffer='Customer-Churn.csv')
data.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Features: we try to predict "Churn" from "tenure", "SeniorCitizen" and "MonthlyCharges".
# PS - "churn" is the name given to the event of a client abandoning a company.
X = data.loc[:, ['tenure', 'SeniorCitizen', 'MonthlyCharges']]
X.head(n=5)

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,1,0,29.85
1,34,0,56.95
2,2,0,53.85
3,45,0,42.3
4,2,0,70.7


In [4]:
# Target: the 'Churn' column, kept as a one-column DataFrame.
# Selecting with data[[...]] raises a KeyError if the column is missing or
# misspelled, whereas pd.DataFrame(data=data, columns=[...]) would reindex
# and quietly hand back an all-NaN column.
y = data[['Churn']]
y.head()

Unnamed: 0,Churn
0,No
1,No
2,Yes
3,No
4,Yes


In [5]:
from sklearn.preprocessing import StandardScaler

# Standardize the features with a StandardScaler fitted on X.
# The fitted scaler is kept in `transformer` so the exact same scaling can be
# re-applied later to any new rows we want predictions for.
transformer = StandardScaler().fit(X)

# transform() hands back a plain array, so wrap it in a DataFrame again.
# Passing index=X.index (not just the column names) keeps each scaled row
# labelled like its source row instead of silently resetting to 0..n-1.
scaled_x = pd.DataFrame(
    transformer.transform(X),
    columns=X.columns,
    index=X.index,
)


In [6]:
# Display the standardized features (pandas elides the middle rows of a long frame).
scaled_x

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,-1.277445,-0.439916,-1.160323
1,0.066327,-0.439916,-0.259629
2,-1.236724,-0.439916,-0.362660
3,0.514251,-0.439916,-0.746535
4,-1.236724,-0.439916,0.197365
...,...,...,...
7038,-0.340876,-0.439916,0.665992
7039,1.613701,-0.439916,1.277533
7040,-0.870241,-0.439916,-1.168632
7041,-1.155283,2.273159,0.320338


In [7]:
# Fit a logistic-regression classifier: standardized features in, 'Churn' labels out.
# random_state is pinned to 0 so the estimator is configured the same on every run.
model = linear_model.LogisticRegression(random_state=0)
# `result` holds whatever fit() returns; it is what we call predict() on later.
result = model.fit(X=scaled_x, y=data['Churn'])

In [8]:
# Four made-up clients to score, written column by column.
# Column names and order match the feature frame the model was trained on.
new_clients = pd.DataFrame({
    'tenure': [2, 40, 3, 44],
    'SeniorCitizen': [1, 1, 0, 0],
    'MonthlyCharges': [90.20, 32.45, 85.15, 40.45],
})
new_clients

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,2,1,90.2
1,40,1,32.45
2,3,0,85.15
3,44,0,40.45


In [9]:
# Scale the new clients with the scaler that was fitted on the training
# features (transform only, no re-fit), so they land on the same scale the
# model was trained on.
# index=new_clients.index keeps each scaled row labelled like its source row
# rather than resetting the index to 0..n-1.
new_clients_scaled = pd.DataFrame(
    transformer.transform(new_clients),
    columns=new_clients.columns,
    index=new_clients.index,
)
new_clients_scaled

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,-1.236724,2.273159,0.845466
1,0.31065,2.273159,-1.073909
2,-1.196004,-0.439916,0.677625
3,0.473531,-0.439916,-0.808022


In [10]:
# Predicted churn label for each of the new (already scaled) clients.
result.predict(X=new_clients_scaled)

array(['Yes', 'No', 'Yes', 'No'], dtype=object)

In [11]:
# What about the R^2 score?
# R^2 is a regression metric; this is a *classification* exercise, so it is not
# a very meaningful number here. We could still compute it, but it is not as
# interesting a metric.
# Instead we use "accuracy", i.e., what % of observations we get right.

from sklearn.metrics import accuracy_score

# accuracy_score expects (y_true, y_pred): ground truth first, predictions second.
# Accuracy happens to be symmetric, but keeping the documented order matters as
# soon as this line is adapted to a metric that is not (precision, recall, ...).
# NOTE: the model is scored on the same rows it was fitted on, so this is
# training accuracy and is likely optimistic compared with unseen data.
accuracy_score(y['Churn'], result.predict(scaled_x))

0.7911401391452506

In [12]:
# Roughly 80% accuracy is nothing to scoff at.