In [4]:
import pandas as pd
import numpy as np

path = 'WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv'

# Read the csv data
df = pd.read_csv(path)
df.columns = ['Customer','State','round_Customer Lifetime Value','Response','Coverage','Education','Effective To Date','EmploymentStatus',
              'Gender','Income','Location Code','Marital Status','Monthly Premium Auto','Months Since Last Claim',
              'Months Since Policy Inception','Number of Open Complaints','Number of Policies','Policy Type','Policy','Renew Offer Type',
              'Sales Channel','Total Claim Amount','Vehicle Class','Vehicle Size']


In [5]:
#Print the unique value for each feature
for feature in df.columns:
    print(feature, np.unique(df[feature]))
print()
selectedFeature = ['Income', 'Coverage', 'Education', 'EmploymentStatus', 'Monthly Premium Auto', 'Months Since Last Claim', 
                   'Number of Open Complaints','Total Claim Amount','State','Gender','Marital Status','Policy','Vehicle Class',
                   'Vehicle Size']

Customer ['AA10041' 'AA11235' 'AA16582' ... 'ZZ89380' 'ZZ91716' 'ZZ97035']
State ['Arizona' 'California' 'Nevada' 'Oregon' 'Washington']
round_Customer Lifetime Value [ 1898.007675  1898.683686  1904.000852 ... 73225.95652  74228.51604
 83325.38119 ]
Response ['No' 'Yes']
Coverage ['Basic' 'Extended' 'Premium']
Education ['Bachelor' 'College' 'Doctor' 'High School or Below' 'Master']
Effective To Date ['1/1/11' '1/10/11' '1/11/11' '1/12/11' '1/13/11' '1/14/11' '1/15/11'
 '1/16/11' '1/17/11' '1/18/11' '1/19/11' '1/2/11' '1/20/11' '1/21/11'
 '1/22/11' '1/23/11' '1/24/11' '1/25/11' '1/26/11' '1/27/11' '1/28/11'
 '1/29/11' '1/3/11' '1/30/11' '1/31/11' '1/4/11' '1/5/11' '1/6/11'
 '1/7/11' '1/8/11' '1/9/11' '2/1/11' '2/10/11' '2/11/11' '2/12/11'
 '2/13/11' '2/14/11' '2/15/11' '2/16/11' '2/17/11' '2/18/11' '2/19/11'
 '2/2/11' '2/20/11' '2/21/11' '2/22/11' '2/23/11' '2/24/11' '2/25/11'
 '2/26/11' '2/27/11' '2/28/11' '2/3/11' '2/4/11' '2/5/11' '2/6/11'
 '2/7/11' '2/8/11' '2/9/11']
EmploymentSta

In [6]:
factorizeFeatures = ['Coverage', 'Education','EmploymentStatus','State','Gender','Marital Status','Policy','Vehicle Class','Vehicle Size']
for feature in factorizeFeatures:
    labels, uniques = pd.factorize(df[feature].values)
    print(labels, uniques)
    df[feature] = labels

[0 1 2 ... 1 1 1] ['Basic' 'Extended' 'Premium']
[0 0 0 ... 0 1 1] ['Bachelor' 'College' 'Master' 'High School or Below' 'Doctor']
[0 1 0 ... 1 0 1] ['Employed' 'Unemployed' 'Medical Leave' 'Disabled' 'Retired']
[0 1 2 ... 3 3 3] ['Washington' 'Arizona' 'Nevada' 'California' 'Oregon']
[0 0 0 ... 1 1 1] ['F' 'M']
[0 1 0 ... 1 0 1] ['Married' 'Single' 'Divorced']
[0 1 1 ... 2 6 0] ['Corporate L3' 'Personal L3' 'Corporate L2' 'Personal L1' 'Special L2'
 'Corporate L1' 'Personal L2' 'Special L1' 'Special L3']
[0 1 0 ... 1 1 0] ['Two-Door Car' 'Four-Door Car' 'SUV' 'Luxury SUV' 'Sports Car'
 'Luxury Car']
[0 0 0 ... 0 2 0] ['Medsize' 'Small' 'Large']


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
x = df.loc[:, selectedFeature].values
y = df.loc[:, 'round_Customer Lifetime Value'].values

print(x)

[[5.6274e+04 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 1.0000e+00 0.0000e+00 ... 1.0000e+00 1.0000e+00 0.0000e+00]
 [4.8767e+04 2.0000e+00 0.0000e+00 ... 1.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [0.0000e+00 1.0000e+00 0.0000e+00 ... 2.0000e+00 1.0000e+00 0.0000e+00]
 [2.1941e+04 1.0000e+00 1.0000e+00 ... 6.0000e+00 1.0000e+00 2.0000e+00]
 [0.0000e+00 1.0000e+00 1.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]]


In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

In [12]:
forest = RandomForestClassifier(n_estimators=300, max_depth=2,random_state=0, n_jobs=-1)
forest.fit(x_train, y_train.astype('int'))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [13]:
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(x_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, selectedFeature[indices[f]], importances[indices[f]]))

 1) Monthly Premium Auto           0.179689
 2) EmploymentStatus               0.170628
 3) Income                         0.138377
 4) Total Claim Amount             0.109072
 5) Months Since Last Claim        0.079555
 6) Marital Status                 0.079249
 7) Vehicle Class                  0.052277
 8) Number of Open Complaints      0.051194
 9) Education                      0.048650
10) Coverage                       0.039699
11) Vehicle Size                   0.032646
12) Gender                         0.009970
13) Policy                         0.005075
14) State                          0.003920
