In [465]:
#import modules and set parameters
import pandas as pd
import numpy as np
from sklearn import decomposition
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score

In [466]:
#import dataset
train = pd.read_csv(r'C:\Users\tolagu\Documents\train_users_2.csv')

In [467]:
#check shape of my raw data
train.shape

(213451, 16)

In [470]:
#quick data check
train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [471]:
#obtain summary statistics 
train.describe()

Unnamed: 0,timestamp_first_active,age,signup_flow
count,213451.0,125461.0,213451.0
mean,20130850000000.0,49.668335,3.267387
std,9253717000.0,155.666612,7.637707
min,20090320000000.0,1.0,0.0
25%,20121230000000.0,28.0,0.0
50%,20130910000000.0,34.0,0.0
75%,20140310000000.0,43.0,0.0
max,20140630000000.0,2014.0,25.0


In [472]:
#find null in my data
train.isnull().sum()

id                              0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                          0
age                         87990
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6065
signup_app                      0
first_device_type               0
first_browser                   0
country_destination             0
dtype: int64

In [123]:
#remove rows where age is less than 18
train_remove_age_4 = train.ix[train['age'] >= 18]

In [430]:
#remove rows where age is greater than 100
train_remove_age_5 = train_remove_age_4.ix[train['age'] <= 100]

In [431]:
#check result after rows with age < 18 is removed from dataset
train_remove_age_4.groupby('age').size()

age
18.0       669
19.0      1102
20.0       540
21.0       982
22.0      1702
23.0      2462
24.0      3220
25.0      4459
26.0      5044
27.0      5738
28.0      5939
29.0      5963
30.0      6124
31.0      6016
32.0      5855
33.0      5527
34.0      5029
35.0      4860
36.0      4083
37.0      3694
38.0      3384
39.0      2998
40.0      2766
41.0      2538
42.0      2243
43.0      2056
44.0      2137
45.0      2149
46.0      1875
47.0      1646
          ... 
108.0       15
109.0       31
110.0      196
111.0        2
112.0        1
113.0        4
115.0       13
132.0        1
150.0        1
1924.0       2
1925.0       1
1926.0       1
1927.0       1
1928.0       2
1929.0       2
1931.0       3
1932.0       3
1933.0       1
1935.0       1
1936.0       2
1938.0       1
1942.0       1
1947.0       2
1949.0       3
1952.0       1
1953.0       1
1995.0       1
2008.0       1
2013.0      39
2014.0     710
dtype: int64

In [432]:
#check result after rows with age > 100 is removed from dataset
train_remove_age_5.groupby('age').size()

age
18.0      669
19.0     1102
20.0      540
21.0      982
22.0     1702
23.0     2462
24.0     3220
25.0     4459
26.0     5044
27.0     5738
28.0     5939
29.0     5963
30.0     6124
31.0     6016
32.0     5855
33.0     5527
34.0     5029
35.0     4860
36.0     4083
37.0     3694
38.0     3384
39.0     2998
40.0     2766
41.0     2538
42.0     2243
43.0     2056
44.0     2137
45.0     2149
46.0     1875
47.0     1646
         ... 
71.0      180
72.0      189
73.0      149
74.0      123
75.0       94
76.0       68
77.0       72
78.0       50
79.0       50
80.0       46
81.0       30
82.0       26
83.0       25
84.0       20
85.0       26
86.0       27
87.0       31
88.0       12
89.0       13
90.0       18
91.0       12
92.0       14
93.0       18
94.0       12
95.0       49
96.0       25
97.0       10
98.0       15
99.0       17
100.0      26
dtype: int64

In [128]:
#check for null after removing rows
train_remove_age_5.isnull().sum()

id                             0
date_account_created           0
timestamp_first_active         0
date_first_booking         55769
gender                         0
age                            0
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked     1960
signup_app                     0
first_device_type              0
first_browser                  0
country_destination            0
dtype: int64

In [134]:
#check count of total number of rows
train_remove_age_5.id.value_counts().sum()

122958

In [131]:
#check shape of data after initial transformations
train_remove_age_5.shape

(122958, 16)

In [201]:
#drop date_first_booking from dataset
train_remove_age_6 = train_remove_age_5.drop('date_first_booking', axis=1)

In [202]:
#drop first_affiliated_tracked from dataset
train_remove_age_7 = train_remove_age_6.drop('first_affiliate_tracked', axis=1)

In [203]:
#drop date_account_created from dataset
train_remove_age_8 = train_remove_age_7.drop('date_account_created', axis=1)

In [204]:
#drop timestamp_first_active from dataset
train_remove_age_9 = train_remove_age_8.drop('timestamp_first_active', axis=1)

In [234]:
#drop sign_up flow from dataset
train_remove_age_10 = train_remove_age_9.drop('signup_flow', axis=1)

In [440]:
#check the header of dataset after removing columns
train_remove_age_10.head(0)

Unnamed: 0,id,gender,age,signup_method,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,first_browser,country_destination


In [235]:
#Get dummies for the gender variable
train_code_1 = pd.get_dummies(train_remove_age_10['gender'])

In [281]:
train_code_1.head()

Unnamed: 0,-unknown-,FEMALE,MALE,OTHER
1,0,0,1,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0
6,0,1,0,0


In [282]:
#Get dummies for the signup_method variable
train_code_2 = pd.get_dummies(train_remove_age_10['signup_method'])

In [283]:
#Get dummies for the language variable
train_code_3 = pd.get_dummies(train_remove_age_10['language'])

In [264]:
#Get dummies for the affiliate_channel variable
train_code_4 = pd.get_dummies(train_remove_age_10['affiliate_channel'])

In [265]:
#Get dummies for the affiliate_provider variable
train_code_5 = pd.get_dummies(train_remove_age_10['affiliate_provider'])

In [266]:
#Get dummies for the signup_app variable
train_code_6 = pd.get_dummies(train_remove_age_10['signup_app'])

In [267]:
#Get dummies for the first_device_type variable
train_code_7 = pd.get_dummies(train_remove_age_10['first_device_type'])

In [268]:
#Get dummies for the first_browser variable
train_code_8 = pd.get_dummies(train_remove_age_10['first_browser'])

In [269]:
#Get dummies for the country_destination variable
train_code_9 = pd.get_dummies(train_remove_age_10['country_destination'])

In [270]:
#joind data
test_data_1 = train_code_9.join(train_code_8)

In [294]:
#join data
test_data_2 = test_data_1.join(train_code_7)

In [295]:
#join data
test_data_3 = test_data_2.join(train_code_6)

In [297]:
#check null values for data
test_data_3.isnull().sum()

AU                    0
CA                    0
DE                    0
ES                    0
FR                    0
GB                    0
IT                    0
NDF                   0
NL                    0
PT                    0
US                    0
other                 0
-unknown-             0
AOL Explorer          0
Android Browser       0
Apple Mail            0
Avant Browser         0
BlackBerry Browser    0
Camino                0
Chrome                0
Chrome Mobile         0
Chromium              0
CometBird             0
Comodo Dragon         0
CoolNovo              0
Firefox               0
IE                    0
IE Mobile             0
IceWeasel             0
Iron                  0
                     ..
Opera                 0
Opera Mini            0
Opera Mobile          0
PS Vita browser       0
Pale Moon             0
RockMelt              0
Safari                0
SeaMonkey             0
Silk                  0
SiteKiosk             0
SlimBrowser     

In [299]:
#join data
test_data_4 = test_data_3.join(train_code_5)

In [300]:
#join data
test_data_5 = test_data_4.join(train_code_4)

In [301]:
#join data
test_data_6 = test_data_5.join(train_code_3)

In [303]:
#check null values in data
test_data_6.isnull().sum()

AU                    0
CA                    0
DE                    0
ES                    0
FR                    0
GB                    0
IT                    0
NDF                   0
NL                    0
PT                    0
US                    0
other                 0
-unknown-             0
AOL Explorer          0
Android Browser       0
Apple Mail            0
Avant Browser         0
BlackBerry Browser    0
Camino                0
Chrome                0
Chrome Mobile         0
Chromium              0
CometBird             0
Comodo Dragon         0
CoolNovo              0
Firefox               0
IE                    0
IE Mobile             0
IceWeasel             0
Iron                  0
                     ..
other_ch              0
remarketing           0
sem-brand             0
sem-non-brand         0
seo                   0
ca                    0
cs                    0
da                    0
de                    0
el                    0
en              

In [304]:
#rename column
test_data_6 = test_data_6.rename(columns={'other': 'other_we'})

In [307]:
#rename column
train_code_2 = train_code_2.rename(columns={'basic': 'basic_signup', 'facebook': 'fb_signup', 'google': 'google_signup'})

In [309]:
#join data
test_data_7 = test_data_6.join(train_code_2)

In [312]:
#rename column
train_code_1 = train_code_1.rename(columns={'-unknown-': 'unknown_gender', 'FEMALE': 'F_gender', 'MALE': 'M_gender', 'OTHER': 'other_gender'})

In [313]:
#join data
test_data_8 = test_data_7.join(train_code_1)

In [314]:
#check shape of data
train_remove_age_10.head()

Unnamed: 0,id,gender,age,signup_method,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,first_browser,country_destination
1,820tgsjxq7,MALE,38.0,facebook,en,seo,google,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,FEMALE,56.0,basic,en,direct,direct,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,FEMALE,42.0,facebook,en,direct,direct,Web,Mac Desktop,Firefox,other
4,87mebub9p4,-unknown-,41.0,basic,en,direct,direct,Web,Mac Desktop,Chrome,US
6,lsw9q7uk0j,FEMALE,46.0,basic,en,other,craigslist,Web,Mac Desktop,Safari,US


In [315]:
#check shape of test data
test_data_8.head()

Unnamed: 0,AU,CA,DE,ES,FR,GB,IT,NDF,NL,PT,...,th,tr,zh,basic_signup,fb_signup,google_signup,unknown_gender,F_gender,M_gender,other_gender
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [291]:
#check null in test data
train_code_5.isnull().sum()

baidu                  0
bing                   0
craigslist             0
direct                 0
email-marketing        0
facebook               0
facebook-open-graph    0
google                 0
gsp                    0
meetup                 0
naver                  0
other                  0
padmapper              0
vast                   0
wayn                   0
yahoo                  0
yandex                 0
dtype: int64

In [293]:
#rename columns
train_code_4 = train_code_4.rename(columns={'direct': 'direct_ch', 'other': 'other_ch'})

In [298]:
#rename columns
train_code_5 = train_code_5.rename(columns={'other': 'other_me'})

In [318]:
#check shape of data
test_data_8.shape

(122958, 123)

In [459]:
#determine the y and x variables for the model
y = test_data_8.iloc[:,0:12]
X = test_data_8.iloc[:,13:122]

In [457]:
#test train split of the data 
trainX,testX,train_y,test_y = train_test_split(X, y, test_size=0.25)

In [458]:
#import module amd fit model
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier()
model = nn.fit(trainX, train_y)

In [460]:
#print model score
model.score(testX, test_y)

0.31340273259596618

In [461]:
#import module and fit model
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
model_dt = nn.fit(trainX, train_y)

In [462]:
#print model score
model_dt.score(testX, test_y)

0.3592387768379961

In [463]:
#import module and fit model
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier(n_neighbors=7)
model_kn = kn.fit(trainX, train_y)

In [464]:
#print model score
model_kn.score(testX, test_y)

0.38819128171763173