In [468]:
%matplotlib inline

In [1]:
!pip install kmodes

Collecting kmodes
  Downloading kmodes-0.12.2-py2.py3-none-any.whl (20 kB)
Installing collected packages: kmodes
Successfully installed kmodes-0.12.2



[notice] A new release of pip is available: 23.0.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

np.random.seed(1354)

# 1. Loading Data

In [3]:
df = pd.read_csv('./Customer-Value-Analysis.csv')

In [4]:
df.shape

(9134, 24)

In [5]:
df.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [9]:
df['Engaged'] = df['Response'].apply(lambda x: 0 if x == 'No' else 1)
df['MaritalFactorized'] = df['Marital Status'].apply(lambda x: 0 if x == 'Single' else 1)
df['GenderFactorized'] = df['Gender'].apply(lambda x: 0 if x == 'F' else 1)
df['EducationFactorized'] = pd.factorize(df['Education'])[0]
df['Income'] = pd.to_numeric(df['Income'])

In [10]:
df.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Engaged,MaritalFactorized,GenderFactorized,EducationFactorized
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize,0,1,0,0
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize,0,0,0,0
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize,0,1,0,0
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize,0,1,1,0
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize,0,0,1,0


## LR with X

In [44]:
y = df['Engaged'] 
X = df[['Monthly Premium Auto',
        'Months Since Last Claim',
        'Months Since Policy Inception',
        'Number of Open Complaints',
        'Number of Policies',
        'Total Claim Amount',
    ]]
Z = df[['Customer Lifetime Value',
        'Income',
        'MaritalFactorized',
        'GenderFactorized',
        'EducationFactorized'
    ]]

In [19]:
from sklearn import model_selection
X_tran,X_test,y_tran,y_test=model_selection.train_test_split(X,y,test_size=0.1)
print(X_test.shape)  # traning and test set

(914, 6)


In [20]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(multi_class='ovr',solver='lbfgs',class_weight='balanced')
lr.fit(X_tran,y_tran)
score=lr.score(X_tran,y_tran)
print(score) ## best is 1

0.5273722627737226


In [22]:
from sklearn.metrics import accuracy_score
train_score=accuracy_score(y_tran,lr.predict(X_tran))
test_score=lr.score(X_test,y_test)
print('training set acurrcy rate:',train_score)
print('test set acurracy rate:',test_score)

training set acurrcy rate: 0.5273722627737226
test set acurracy rate: 0.5010940919037199


In [23]:
from sklearn.metrics import recall_score
train_recall=recall_score(y_tran,lr.predict(X_tran),average='macro')
test_recall=recall_score(y_test,lr.predict(X_test),average='macro')
print('training set recall rate:',train_recall)
print('test set recall rate:',test_recall)

training set recall rate: 0.5440473518790161
test set recall rate: 0.495778403199526


## LR with X_new

In [39]:
from kmodes.kmodes import KModes
km = KModes(n_clusters=2, init='Huang', n_init=10, verbose=0)
new_var = km.fit_predict(Z)

In [46]:
import numpy as np

X_new = X.mul(new_var, axis=0)
X_tr,X_te,y_tr,y_te=model_selection.train_test_split(X_new, y, test_size=0.1)

lr=LogisticRegression(multi_class='ovr', solver='lbfgs', class_weight='balanced')
lr.fit(X_tr, y_tr)
train_score=accuracy_score(y_tr, lr.predict(X_tr))
test_score=lr.score(X_te, y_te)
print('training set acurrcy rate:', train_score)
print('test set acurracy rate:', test_score)

train_recall=recall_score(y_tr, lr.predict(X_tr),average='macro')
test_recall=recall_score(y_te, lr.predict(X_te),average='macro')
print('training set recall rate:',train_recall)
print('test set recall rate:',test_recall)

training set acurrcy rate: 0.7403892944038929
test set acurracy rate: 0.7527352297592997
training set recall rate: 0.52743135321714
test set recall rate: 0.5362070920913352


#### The accuracy is higher when training with X_new, while the recall remains almost unchanged. This is because X_new contains more informative features or features that are more important for the model, which can lead to better prediction of sample categories. As a result, the accuracy increases. The recall remains almost unchanged because the model is still able to identify positive samples, but there is no significant improvement in its ability to discriminate negative samples.