Importing Libraries

In [25]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Reading the data

In [30]:
data = pd.read_csv('telecom_customer_churn.csv')
data = data.drop(['Customer ID'],axis=1)
data.head()

Unnamed: 0,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,9,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,1,,
1,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,9,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,1,,
2,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,4,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,0,Competitor,Competitor had better devices
3,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,13,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,0,Dissatisfaction,Product dissatisfaction
4,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,0,Dissatisfaction,Network reliability


Pre-Processing Techniques

In [31]:
def ThresholdandND_columnRemoval(df):
    N = len(df)
    columns = df.columns
    for col in columns:
        if (len(df[col].unique()) == 1):
            df = df.drop([col],axis=1)
            continue
        notnull = df[col].isnull().sum()
        ratio = notnull / N
        if(ratio >= 0.30):
            df = df.drop([col],axis=1)
    return df

def Handling_NullValues(df):
    columns = df.columns
    for col in columns:
        typeCol = str(df[col].dtype)
        if typeCol == 'object':
            df = df[df[col].notna()]
        else:
            mean = df[col].mean()
            median = df[col].median()
            standard_deviation = df[col].std()
            pmc = (3 * (mean - median)) / standard_deviation
            if pmc >= 0.4 or pmc <= -0.4:
                df[col] = df[col].fillna(median)
            else:
                df[col] = df[col].fillna(mean)
    return df

def OneHotEncoding_objects(df):
    columns = df.columns
    for col in columns:
        typeCol = str(df[col].dtype)
        if typeCol == 'object':
            enc = pd.get_dummies(df[col])
            encCol = enc.columns
            newColumns = {}
            for i in range(0,len(encCol)):
                newColumns[encCol[i]] = col + encCol[i]
            enc.rename(columns = newColumns, inplace = True)
            df = df.join(enc)
            df = df.drop([col],axis=1)
    return df

In [32]:
dataFeatures = OneHotEncoding_objects(Handling_NullValues(ThresholdandND_columnRemoval(data)))
dataFeatures

Unnamed: 0,Age,Number of Dependents,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Avg Monthly GB Download,Monthly Charge,...,Unlimited DataNo,Unlimited DataYes,ContractMonth-to-Month,ContractOne Year,ContractTwo Year,Paperless BillingNo,Paperless BillingYes,Payment MethodBank Withdrawal,Payment MethodCredit Card,Payment MethodMailed Check
0,37,0,93225,34.827662,-118.999073,2,9,42.39,16.0,65.60,...,0,1,0,1,0,0,1,0,1,0
1,46,0,91206,34.162515,-118.203869,0,9,10.69,10.0,-4.00,...,1,0,1,0,0,1,0,0,1,0
2,50,0,92627,33.645672,-117.922613,0,4,33.65,30.0,73.90,...,0,1,1,0,0,0,1,1,0,0
3,78,0,94553,38.014457,-122.115432,1,13,27.82,4.0,98.00,...,0,1,1,0,0,0,1,1,0,0
4,75,0,93010,34.227846,-119.079903,3,3,7.38,11.0,83.90,...,0,1,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6582,20,0,90022,34.023810,-118.156582,0,7,36.49,42.0,94.05,...,0,1,0,1,0,0,1,0,1,0
6583,53,0,93628,36.807595,-118.901544,0,1,42.09,9.0,70.15,...,0,1,1,0,0,0,1,0,1,0
6585,20,0,91941,32.759327,-116.997260,0,13,46.68,59.0,55.15,...,0,1,0,1,0,1,0,0,1,0
6586,40,0,95367,37.734971,-120.954271,1,22,16.20,17.0,85.10,...,0,1,1,0,0,0,1,1,0,0


Scaling with MinMaxScaler

In [33]:
scaler = MinMaxScaler()
cols = dataFeatures.columns
data_scale = scaler.fit_transform(dataFeatures.to_numpy())
data_scale = pd.DataFrame(data_scale, columns=cols)
data_scale

Unnamed: 0,Age,Number of Dependents,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Avg Monthly GB Download,Monthly Charge,...,Unlimited DataNo,Unlimited DataYes,ContractMonth-to-Month,ContractOne Year,ContractTwo Year,Paperless BillingNo,Paperless BillingYes,Payment MethodBank Withdrawal,Payment MethodCredit Card,Payment MethodMailed Check
0,0.295082,0.0,0.524313,0.241523,0.524540,0.181818,0.112676,0.844835,0.168675,0.587184,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.442623,0.0,0.195967,0.170810,0.603207,0.000000,0.112676,0.197632,0.096386,0.046602,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.508197,0.0,0.427061,0.115863,0.631031,0.000000,0.042254,0.666394,0.337349,0.651650,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.967213,0.0,0.740283,0.580316,0.216248,0.090909,0.169014,0.547366,0.024096,0.838835,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0.918033,0.0,0.489348,0.177755,0.516544,0.272727,0.028169,0.130053,0.108434,0.729320,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.016393,0.0,0.003415,0.156064,0.607885,0.000000,0.084507,0.724377,0.481928,0.808155,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4597,0.557377,0.0,0.589852,0.452013,0.534188,0.000000,0.000000,0.838710,0.084337,0.622524,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4598,0.016393,0.0,0.315498,0.021634,0.722573,0.000000,0.169014,0.932421,0.686747,0.506019,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4599,0.344262,0.0,0.872662,0.550604,0.331118,0.090909,0.295775,0.310127,0.180723,0.738641,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


Getting X and Y Values

In [34]:
target = data_scale['Customer Status']
ivCol = list(data_scale.columns)
ivCol.remove('Customer Status')
independent_variables = data_scale[ivCol]
independent_variables

Unnamed: 0,Age,Number of Dependents,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Avg Monthly GB Download,Monthly Charge,...,Unlimited DataNo,Unlimited DataYes,ContractMonth-to-Month,ContractOne Year,ContractTwo Year,Paperless BillingNo,Paperless BillingYes,Payment MethodBank Withdrawal,Payment MethodCredit Card,Payment MethodMailed Check
0,0.295082,0.0,0.524313,0.241523,0.524540,0.181818,0.112676,0.844835,0.168675,0.587184,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.442623,0.0,0.195967,0.170810,0.603207,0.000000,0.112676,0.197632,0.096386,0.046602,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.508197,0.0,0.427061,0.115863,0.631031,0.000000,0.042254,0.666394,0.337349,0.651650,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.967213,0.0,0.740283,0.580316,0.216248,0.090909,0.169014,0.547366,0.024096,0.838835,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0.918033,0.0,0.489348,0.177755,0.516544,0.272727,0.028169,0.130053,0.108434,0.729320,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.016393,0.0,0.003415,0.156064,0.607885,0.000000,0.084507,0.724377,0.481928,0.808155,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4597,0.557377,0.0,0.589852,0.452013,0.534188,0.000000,0.000000,0.838710,0.084337,0.622524,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4598,0.016393,0.0,0.315498,0.021634,0.722573,0.000000,0.169014,0.932421,0.686747,0.506019,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4599,0.344262,0.0,0.872662,0.550604,0.331118,0.090909,0.295775,0.310127,0.180723,0.738641,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


Training the model

In [35]:
x_train, x_test, y_train, y_test = train_test_split(independent_variables, target, test_size=0.3,random_state=6789,shuffle=True)
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Predicting values

In [36]:
y_pred = logisticRegr.predict(x_test)
y_pred

array([0., 0., 0., ..., 1., 1., 1.])

Checking accuracy of model

In [37]:
accuracy = accuracy_score(y_test,y_pred)
confusion_mat = confusion_matrix(y_test,y_pred)
crep = classification_report(y_test,y_pred)

print("Accuracy: " + str(accuracy))
print("Confusion Matrix: ")
print(confusion_mat)
print("Classification Report: ")
print(crep)

Accuracy: 0.832005792903693
Confusion Matrix: 
[[349 112]
 [120 800]]
Classification Report: 
              precision    recall  f1-score   support

         0.0       0.74      0.76      0.75       461
         1.0       0.88      0.87      0.87       920

    accuracy                           0.83      1381
   macro avg       0.81      0.81      0.81      1381
weighted avg       0.83      0.83      0.83      1381

