Import Libraries

In [52]:
import pandas as pd
import numpy as np

from scipy.sparse import issparse
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,RandomizedSearchCV

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score


# Data gathering

In [33]:
def dataLoader(path):
    ds = pd.read_csv(path)
    return ds 


In [34]:
ds = dataLoader('../Data/e_commerce.csv')
ds.shape

(3333, 20)

In [35]:
# split data 

x = ds.drop(['churn'],axis=1)
y = ds['churn']

x


Unnamed: 0,account length,location code,user id,credit card info save,push status,add to wishlist,desktop sessions,app sessions,desktop transactions,total product detail views,session duration,promotion clicks,avg order value,sale product views,discount rate per visited products,product detail view per app session,app transactions,add to cart per session,customer service calls
0,128,415,3824657,no,yes,25,265,45,17,110.0,197,87,244.7,91,11.01,10.0,3,2.70,1.0
1,107,415,3717191,no,yes,26,162,27,17,123.0,196,103,254.4,103,11.45,13.7,3,3.70,1.0
2,137,415,3581921,no,no,0,243,41,10,114.0,121,110,162.6,104,7.32,12.2,5,3.29,0.0
3,84,408,3759999,yes,no,0,299,51,5,71.0,62,88,196.9,89,8.86,6.6,7,1.78,2.0
4,75,415,3306626,yes,no,0,167,28,13,113.0,148,122,186.9,121,8.41,10.1,3,2.73,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,192,415,4144276,no,yes,36,156,27,18,77.0,216,126,279.1,83,12.56,9.9,6,2.67,2.0
3329,68,415,3703271,no,no,0,231,39,13,,153,55,191.3,123,8.61,9.6,4,2.59,3.0
3330,28,510,3288230,no,no,0,181,31,25,109.0,289,58,191.9,91,8.64,14.1,6,3.81,2.0
3331,184,510,3646381,yes,no,0,214,36,14,105.0,160,84,139.2,137,6.26,5.0,10,1.35,2.0


In [74]:

cat = x.select_dtypes(include='object').columns
num = x.select_dtypes(exclude='object').columns

cat_pipe = Pipeline(steps=([('impute',SimpleImputer(strategy='most_frequent')),('Encoding',OrdinalEncoder())]))
num_pipe = Pipeline(steps=([('impute',SimpleImputer(strategy='mean')),('scaling',StandardScaler())]))

pipeline = ColumnTransformer([('catPipe',cat_pipe,cat),('numPipe',num_pipe,num)])

x1 = pipeline.fit_transform(x)

if issparse(x1):
    x1 = x1.toarray()
x2 = pd.DataFrame(x1,columns=pipeline.get_feature_names_out())

x2

Unnamed: 0,catPipe__credit card info save,catPipe__push status,numPipe__account length,numPipe__location code,numPipe__user id,numPipe__add to wishlist,numPipe__desktop sessions,numPipe__app sessions,numPipe__desktop transactions,numPipe__total product detail views,numPipe__session duration,numPipe__promotion clicks,numPipe__avg order value,numPipe__sale product views,numPipe__discount rate per visited products,numPipe__product detail view per app session,numPipe__app transactions,numPipe__add to cart per session,numPipe__customer service calls
0,0.0,1.0,0.676489,-0.523603,0.285359,1.234883,1.564550,1.557193,-0.020265,4.750568e-01,-0.079666,-0.658138,0.866743,-0.465494,0.866029,-0.088063,-0.601195,-0.085690,-0.430100
1,0.0,1.0,0.149065,-0.523603,-0.105965,1.307948,-0.327130,-0.384977,-0.020265,1.129111e+00,-0.099387,0.145038,1.058571,0.147825,1.059390,1.249269,-0.601195,1.241169,-0.430100
2,0.0,0.0,0.902529,-0.523603,-0.598534,-0.591760,1.160502,1.125600,-1.639456,6.763043e-01,-1.578480,0.496427,-0.756869,0.198935,-0.755571,0.707108,0.211534,0.697156,-1.196647
3,1.0,0.0,-0.428590,-0.688834,0.049915,-0.591760,2.188988,2.204583,-2.796021,-1.487106e+00,-2.742033,-0.607939,-0.078551,-0.567714,-0.078806,-1.316962,1.024263,-1.306401,0.336447
4,1.0,0.0,-0.654629,-0.523603,-1.600987,-0.591760,-0.235301,-0.277078,-0.945517,6.259924e-01,-1.046007,1.098809,-0.276311,1.067803,-0.276562,-0.051918,-0.601195,-0.045885,1.102994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,0.0,1.0,2.283878,-0.523603,1.449212,2.038605,-0.437325,-0.384977,0.211048,-1.185235e+00,0.295038,1.299603,1.547039,-0.874374,1.547188,-0.124207,0.617898,-0.125496,0.336447
3329,0.0,0.0,-0.830437,-0.523603,-0.156653,-0.591760,0.940112,0.909803,-0.945517,7.149746e-16,-0.947401,-2.264489,-0.189297,1.170023,-0.188670,-0.232639,-0.194831,-0.231645,1.102994
3330,0.0,0.0,-1.835055,1.718817,-1.667974,-0.591760,0.021821,0.046617,1.830239,4.247450e-01,1.734688,-2.113894,-0.177431,-0.465494,-0.175486,1.393846,0.617898,1.387123,0.336447
3331,1.0,0.0,2.082955,1.718817,-0.363811,-0.591760,0.627893,0.586108,-0.714204,2.234975e-01,-0.809352,-0.808733,-1.219628,1.885562,-1.221396,-1.895268,2.243356,-1.876950,0.336447


# Model training 

In [75]:
x_train,x_test,y_train,y_test = train_test_split(x2,y,test_size=0.2,random_state=32)

In [76]:
adc = AdaBoostClassifier()
svc =  SVC()
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()


In [77]:
models = [adc,svc,knc,dtc,rfc]

for i in models:
    i.fit(x_train,y_train)
    
    y_pred_train = i.predict(x_train)
    y_pred = i.predict(x_test)
    
    tr=round(f1_score(y_pred_train,y_train),2)
    ts=round(f1_score(y_pred,y_test),2)
    
    print('*'*50)
    print(i)
    print('Trining F1 score:',tr)
    print('Testing F1 score:',ts)
  

**************************************************
AdaBoostClassifier()
Trining F1 score: 0.5
Testing F1 score: 0.48
**************************************************
SVC()
Trining F1 score: 0.73
Testing F1 score: 0.53
**************************************************
KNeighborsClassifier()
Trining F1 score: 0.52
Testing F1 score: 0.33
**************************************************
DecisionTreeClassifier()
Trining F1 score: 1.0
Testing F1 score: 0.69
**************************************************
RandomForestClassifier()
Trining F1 score: 1.0
Testing F1 score: 0.8


In [78]:
# here the best algorithm for classification come out to be  Adaboost classifier

## Hyperparameter tuning for best algorithm

In [79]:
grid = {
    'n_estimators':range(2,250)
}

In [80]:
rs = RandomizedSearchCV(adc,param_distributions=grid,cv=5)

rs.fit(x_train,y_train)

In [81]:
adc1 = rs.best_estimator_

In [82]:
adc1

In [83]:
# Evaluation

def Evaluation(actual,pred):
    accu = accuracy_score(actual,pred)
    cfm = confusion_matrix(actual,pred)
    ftr = round(f1_score(actual,pred),2)
    fts = round(f1_score(actual,pred),2)
    print('_'*30)
    print('Accuracy:',accu)
    print('matrix\n',cfm)
    print('_'*30)
    print('Train-F1:',ftr)
    print('Test-F1:',fts)

In [84]:
y_pred1 = adc1.predict(x_train)
y_pred2 = adc1.predict(x_test)

Evaluation(y_train,y_pred1)
Evaluation(y_test,y_pred2)

______________________________
Accuracy: 0.8889722430607652
matrix
 [[2221   60]
 [ 236  149]]
______________________________
Train-F1: 0.5
Test-F1: 0.5
______________________________
Accuracy: 0.8710644677661169
matrix
 [[548  21]
 [ 65  33]]
______________________________
Train-F1: 0.43
Test-F1: 0.43


In [92]:
df = dataLoader('../Data/testing_ecommerce.csv')
df.shape

(1000, 19)

In [101]:
def Prediction(path,pipe,model):
    
    data = pd.read_csv(path)
    # x = data.drop(['churn'],axis=1)
    # y = data['churn']
    # Preprocess the data
    x1 = pd.DataFrame(pipe.transform(data),columns=pipe.get_feature_names_out())

    pred = model.predict(x1)

    return pred

In [102]:
Pred = Prediction('../Data/testing_ecommerce.csv',pipeline,adc1)
Pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

In [103]:
result = df[['user id']]
result

Unnamed: 0,user id
0,3802929
1,3895988
2,4067737
3,4218141
4,4042871
...,...
995,3312144
996,3573187
997,4205990
998,3738900


In [105]:
result['Pred Churn'] = Pred
result


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Pred Churn'] = Pred


Unnamed: 0,user id,Pred Churn
0,3802929,0
1,3895988,0
2,4067737,0
3,4218141,0
4,4042871,0
...,...,...
995,3312144,0
996,3573187,0
997,4205990,0
998,3738900,0


In [106]:
result.to_csv('Prediction.csv')