In [3]:
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession \
    .builder \
    .appName('Predicting where will a new guest book their first travel experience') \
    .getOrCreate()

<h1>Phase 1: Data preprocessing</h1>

<h3>1) Countries dataset</h3>

<h5>Load and display the data</h5>

In [5]:
countries = spark.read\
            .format('csv')\
            .option('header', 'true')\
            .option('ignoreLeadingWhiteSpace', 'true')\
            .load('./datasets/countries.csv')
countries

DataFrame[country_destination: string, lat_destination: string, lng_destination: string, distance_km: string, destination_km2: string, destination_language : string, language_levenshtein_distance: string]

In [6]:
countries.count()

10

In [7]:
countries.toPandas().head()

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
1,CA,62.393303,-96.818146,2828.1333,9984670.0,eng,0.0
2,DE,51.165707,10.452764,7879.568,357022.0,deu,72.61
3,ES,39.896027,-2.4876945,7730.724,505370.0,spa,92.25
4,FR,46.232193,2.209667,7682.945,643801.0,fra,92.06


<h5>Transform the data type of the columns</h5>
<p>* lat_destination ---> float tyeb</p>
<p>* lng_destination ---> float tyeb</p>
<p>* destination_km ---> float tyeb</p>
<p>* destination_km2 ---> float tyeb</p>
<p>* language_levenshtein_distance ---> float tyeb</p>

In [8]:
from pyspark.sql.types import IntegerType, DateType, FloatType
countries = countries.withColumn("lat_destination", countries["lat_destination"].cast(FloatType()))\
    .withColumn("lng_destination",countries["lng_destination"].cast(FloatType()))\
    .withColumn("distance_km", countries["distance_km"].cast(FloatType()))\
    .withColumn("destination_km2", countries["destination_km2"].cast(FloatType()))\
    .withColumn("language_levenshtein_distance", countries["language_levenshtein_distance"].cast(FloatType()))
countries.printSchema()

root
 |-- country_destination: string (nullable = true)
 |-- lat_destination: float (nullable = true)
 |-- lng_destination: float (nullable = true)
 |-- distance_km: float (nullable = true)
 |-- destination_km2: float (nullable = true)
 |-- destination_language : string (nullable = true)
 |-- language_levenshtein_distance: float (nullable = true)



In [9]:
countries.toPandas().head()

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.275162,15297.744141,7741220.0,eng,0.0
1,CA,62.393303,-96.818146,2828.133301,9984670.0,eng,0.0
2,DE,51.165707,10.452764,7879.567871,357022.0,deu,72.610001
3,ES,39.896027,-2.487695,7730.724121,505370.0,spa,92.25
4,FR,46.232193,2.209667,7682.944824,643801.0,fra,92.059998


<h3>2) Session dataset</h3>

<h5>Load and display the data</h5>

In [10]:
sessions = spark.read\
            .format('csv')\
            .option('header', 'true')\
            .option('ignoreLeadingWhiteSpace', 'true')\
            .load('./datasets/sessions.csv')
sessions

DataFrame[user_id: string, action: string, action_type: string, action_detail: string, device_type: string, secs_elapsed: string]

In [11]:
sessions.count()

10567737

In [12]:
sessions.show(5)

+----------+--------------+-----------+-------------------+---------------+------------+
|   user_id|        action|action_type|      action_detail|    device_type|secs_elapsed|
+----------+--------------+-----------+-------------------+---------------+------------+
|d1mm9tcy42|        lookup|       null|               null|Windows Desktop|       319.0|
|d1mm9tcy42|search_results|      click|view_search_results|Windows Desktop|     67753.0|
|d1mm9tcy42|        lookup|       null|               null|Windows Desktop|       301.0|
|d1mm9tcy42|search_results|      click|view_search_results|Windows Desktop|     22141.0|
|d1mm9tcy42|        lookup|       null|               null|Windows Desktop|       435.0|
+----------+--------------+-----------+-------------------+---------------+------------+
only showing top 5 rows



<h5>Transform the data type of the columns</h5>
<p>* action_type ---> date tyeb</p>
<p>* secs_elapsed ---> float tyeb</p>

In [13]:
sessions = sessions.withColumn("action_type", sessions["action_type"].cast(DateType()))\
    .withColumn("secs_elapsed",sessions["secs_elapsed"].cast(FloatType()))
sessions.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- action: string (nullable = true)
 |-- action_type: date (nullable = true)
 |-- action_detail: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- secs_elapsed: float (nullable = true)



In [14]:
sessions

DataFrame[user_id: string, action: string, action_type: date, action_detail: string, device_type: string, secs_elapsed: float]

In [15]:
sessions.show(5)

+----------+--------------+-----------+-------------------+---------------+------------+
|   user_id|        action|action_type|      action_detail|    device_type|secs_elapsed|
+----------+--------------+-----------+-------------------+---------------+------------+
|d1mm9tcy42|        lookup|       null|               null|Windows Desktop|       319.0|
|d1mm9tcy42|search_results|       null|view_search_results|Windows Desktop|     67753.0|
|d1mm9tcy42|        lookup|       null|               null|Windows Desktop|       301.0|
|d1mm9tcy42|search_results|       null|view_search_results|Windows Desktop|     22141.0|
|d1mm9tcy42|        lookup|       null|               null|Windows Desktop|       435.0|
+----------+--------------+-----------+-------------------+---------------+------------+
only showing top 5 rows



<h5>Calculating the total session time for each user</h5>
<p>The session time counted in secs_elapsed column for each user, so we will count the total session time by user id.<p>

In [16]:
from pyspark.sql.functions import sum as _sum
total_session_time = sessions.groupby("user_id").agg(_sum('secs_elapsed').alias('sum_secs_elapsed'))

In [17]:
total_session_time.toPandas().head()

Unnamed: 0,user_id,sum_secs_elapsed
0,de3scomvop,1051.0
1,9nut71te0s,1659715.0
2,zlv8f1qg2g,1155388.0
3,srykgkylee,246.0
4,funlgmcmr3,54747.0


<h3>3) Training dataset</h3>

<h5>Load and display the data</h5>

In [18]:
train_users = spark.read\
            .format('csv')\
            .option('header', 'true')\
            .option('ignoreLeadingWhiteSpace', 'true')\
            .load('./datasets/train_users.csv')

In [19]:
train_users

DataFrame[id: string, date_account_created: string, timestamp_first_active: string, date_first_booking: string, gender: string, age: string, signup_method: string, signup_flow: string, language: string, affiliate_channel: string, affiliate_provider: string, first_affiliate_tracked: string, signup_app: string, first_device_type: string, first_browser: string, country_destination: string]

In [20]:
train_users.count()

213451

In [21]:
train_users.toPandas().head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


<h5>Transform the data type of the columns</h5>
<p>* date_account_created ---> date tyeb</p>
<p>* date_first_booking ---> date tyeb</p>
<p>* age ---> float tyeb</p>
<p>* signup_flow ---> float tyeb</p>

In [22]:
from pyspark.sql.types import IntegerType, DateType, FloatType

train_users = train_users.withColumn("date_account_created", train_users["date_account_created"].cast(DateType()))\
    .withColumn("date_first_booking",train_users["date_first_booking"].cast(DateType()))\
    .withColumn("age",train_users["age"].cast(FloatType()))\
    .withColumn('signup_flow', train_users['signup_flow'].cast(FloatType()))
train_users.printSchema()

root
 |-- id: string (nullable = true)
 |-- date_account_created: date (nullable = true)
 |-- timestamp_first_active: string (nullable = true)
 |-- date_first_booking: date (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: float (nullable = true)
 |-- signup_method: string (nullable = true)
 |-- signup_flow: float (nullable = true)
 |-- language: string (nullable = true)
 |-- affiliate_channel: string (nullable = true)
 |-- affiliate_provider: string (nullable = true)
 |-- first_affiliate_tracked: string (nullable = true)
 |-- signup_app: string (nullable = true)
 |-- first_device_type: string (nullable = true)
 |-- first_browser: string (nullable = true)
 |-- country_destination: string (nullable = true)



In [23]:
train_users.describe()

DataFrame[summary: string, id: string, timestamp_first_active: string, gender: string, age: string, signup_method: string, signup_flow: string, language: string, affiliate_channel: string, affiliate_provider: string, first_affiliate_tracked: string, signup_app: string, first_device_type: string, first_browser: string, country_destination: string]

In [24]:
train_users.toPandas().head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0.0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0.0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3.0,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0.0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0.0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


<h5>Convert gender from uppercase letter to lower case letter</h5>

In [25]:
from pyspark.sql.functions import year, month, lower, col
train_users = train_users.withColumn("gender", lower(col('gender')))

<h5>Droping unneeded columns</h5>

In [26]:
drop = ['date_first_booking', 'first_affiliate_tracked', 'first_browser']

train_users = train_users.select([column for column in train_users.columns if column not in drop])

In [27]:
train_users.toPandas().head()

Unnamed: 0,id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,-unknown-,,facebook,0.0,en,direct,direct,Web,Mac Desktop,NDF
1,820tgsjxq7,2011-05-25,20090523174809,male,38.0,facebook,0.0,en,seo,google,Web,Mac Desktop,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,female,56.0,basic,3.0,en,direct,direct,Web,Windows Desktop,US
3,bjjt8pjhuk,2011-12-05,20091031060129,female,42.0,facebook,0.0,en,direct,direct,Web,Mac Desktop,other
4,87mebub9p4,2010-09-14,20091208061105,-unknown-,41.0,basic,0.0,en,direct,direct,Web,Mac Desktop,US


<h5>Adding the total Session time we calculated above to the training dataset</h5>

In [28]:
train_users = train_users.join(total_session_time, train_users["id"] == total_session_time["user_id"],how='left_outer').select(train_users["*"],total_session_time["sum_secs_elapsed"])


In [29]:
train_users.toPandas().head()

Unnamed: 0,id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,country_destination,sum_secs_elapsed
0,01r3iatdvv,2014-02-11,20140211202128,-unknown-,,basic,0.0,en,direct,direct,Web,Mac Desktop,US,813485.0
1,02sgboyndc,2014-04-29,20140429213851,-unknown-,,basic,0.0,en,sem-brand,google,Web,Mac Desktop,NDF,758902.0
2,03c7ihv5r8,2014-05-07,20140507223046,male,47.0,facebook,0.0,en,sem-brand,google,Web,Windows Desktop,NDF,3003287.0
3,05xkkfxs5v,2014-04-06,20140406183244,male,26.0,facebook,0.0,en,remarketing,google,Web,Mac Desktop,NDF,106376.0
4,08bys9zpkj,2013-07-04,20130704223816,female,50.0,facebook,0.0,en,sem-non-brand,google,Web,Mac Desktop,NDF,


<h5>Calculate month of account creation</h5>

In [30]:
train_users = train_users.withColumn("month_of_creation", month("date_account_created"))

In [31]:
train_users.toPandas().head()

Unnamed: 0,id,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,country_destination,sum_secs_elapsed,month_of_creation
0,01r3iatdvv,2014-02-11,20140211202128,-unknown-,,basic,0.0,en,direct,direct,Web,Mac Desktop,US,813485.0,2
1,02sgboyndc,2014-04-29,20140429213851,-unknown-,,basic,0.0,en,sem-brand,google,Web,Mac Desktop,NDF,758902.0,4
2,03c7ihv5r8,2014-05-07,20140507223046,male,47.0,facebook,0.0,en,sem-brand,google,Web,Windows Desktop,NDF,3003287.0,5
3,05xkkfxs5v,2014-04-06,20140406183244,male,26.0,facebook,0.0,en,remarketing,google,Web,Mac Desktop,NDF,106376.0,4
4,08bys9zpkj,2013-07-04,20130704223816,female,50.0,facebook,0.0,en,sem-non-brand,google,Web,Mac Desktop,NDF,,7


<h5>Drop date columns</h5>

In [32]:
drop = ['date_account_created', 'timestamp_first_active']

train_users = train_users.select([column for column in train_users.columns if column not in drop])

In [33]:
train_users.toPandas().head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,country_destination,sum_secs_elapsed,month_of_creation
0,01r3iatdvv,-unknown-,,basic,0.0,en,direct,direct,Web,Mac Desktop,US,813485.0,2
1,02sgboyndc,-unknown-,,basic,0.0,en,sem-brand,google,Web,Mac Desktop,NDF,758902.0,4
2,03c7ihv5r8,male,47.0,facebook,0.0,en,sem-brand,google,Web,Windows Desktop,NDF,3003287.0,5
3,05xkkfxs5v,male,26.0,facebook,0.0,en,remarketing,google,Web,Mac Desktop,NDF,106376.0,4
4,08bys9zpkj,female,50.0,facebook,0.0,en,sem-non-brand,google,Web,Mac Desktop,NDF,,7


<h5>Find categoral columns</h5>

In [34]:
cat_cols = [item[0] for item in train_users.dtypes if item[1].startswith('string')] 
cat_cols

['id',
 'gender',
 'signup_method',
 'language',
 'affiliate_channel',
 'affiliate_provider',
 'signup_app',
 'first_device_type',
 'country_destination']

<p>Excluding 'country_destination' ----> our labels</p>

In [35]:
cat_cols.pop(-1)
cat_cols

['id',
 'gender',
 'signup_method',
 'language',
 'affiliate_channel',
 'affiliate_provider',
 'signup_app',
 'first_device_type']

<h5>Find numerical columns</h5>

In [36]:
num_cols = [item[0] for item in train_users.dtypes if item[1].startswith('int') | item[1].startswith('double') | item[1].startswith('float')] 
num_cols

['age', 'signup_flow', 'sum_secs_elapsed', 'month_of_creation']

<h5>Find which columns contain null or unknown values</h5>

In [37]:
cat_null_cols = [column for column in cat_cols if train_users.where(col(column).isNull()).count() > 0]
cat_null_cols

[]

In [38]:
from pyspark.sql.functions import lit
num_null_cols = [column for column in num_cols if train_users.filter(col(column).isNull() | col(column).eqNullSafe(0)).count() > 0]
num_null_cols

['age', 'signup_flow', 'sum_secs_elapsed']

In [39]:
train_users = train_users.fillna(0, subset=['age'])
train_users =  train_users.fillna(0, subset=['sum_secs_elapsed'])
train_users = train_users.fillna(0, subset=['signup_flow'])

In [40]:
train_users.toPandas().head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,country_destination,sum_secs_elapsed,month_of_creation
0,01r3iatdvv,-unknown-,0.0,basic,0.0,en,direct,direct,Web,Mac Desktop,US,813485.0,2
1,02sgboyndc,-unknown-,0.0,basic,0.0,en,sem-brand,google,Web,Mac Desktop,NDF,758902.0,4
2,03c7ihv5r8,male,47.0,facebook,0.0,en,sem-brand,google,Web,Windows Desktop,NDF,3003287.0,5
3,05xkkfxs5v,male,26.0,facebook,0.0,en,remarketing,google,Web,Mac Desktop,NDF,106376.0,4
4,08bys9zpkj,female,50.0,facebook,0.0,en,sem-non-brand,google,Web,Mac Desktop,NDF,0.0,7


<h5>Transform categorical fields</h5>
<p>First use StringIndexer to convert categorical values to indices</p>

In [41]:
from pyspark.ml.feature import StringIndexer
indexers = [StringIndexer(inputCol=column,
                          outputCol=column + '_indexed', 
                          handleInvalid='keep') for column in cat_cols]
type(indexers)

list

<h5>OneHotEncoding</h5>

In [42]:
'''from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(inputCols=[column+'_indexed' for column in cat_cols], 
                                 outputCols=[column+'_encoded' for column in cat_cols])
model = encoder.fit(indexers)
encoded = model.transform(indexers)'''

"from pyspark.ml.feature import OneHotEncoder\n\nencoder = OneHotEncoder(inputCols=[column+'_indexed' for column in cat_cols], \n                                 outputCols=[column+'_encoded' for column in cat_cols])\nmodel = encoder.fit(indexers)\nencoded = model.transform(indexers)"

In [43]:
from pyspark.ml.feature import OneHotEncoder

encoders = [OneHotEncoder(
    inputCol= column + '_index', 
    outputCol= column + '_encoded') for column in cat_cols if column not in ['Label']]

In [44]:
type(encoders)

list

<h5>Create a pipeline</h5>

In [45]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=indexers + [encoders])

<h5>View the result of the transformations performed by this pipeline</h5>

In [46]:
'''transformedDF = pipeline.fit(train_users).transform(train_users)'''

#train_users = pipeline.transform(train_users)

'transformedDF = pipeline.fit(train_users).transform(train_users)'

In [47]:
'''transformedDF.toPandas().head()

train_users = train_users.drop(*[column+'_indexed' for column in cat_cols])
train_users.fillna(0.0)

train_users.toPandas().head()'''

"transformedDF.toPandas().head()\n\ntrain_users = train_users.drop(*[column+'_indexed' for column in cat_cols])\ntrain_users.fillna(0.0)\n\ntrain_users.toPandas().head()"

<h5>Take a sample of data</h5>

In [48]:
#(sample, data) = train_users.randomSplit([0.2,0.7])
(sample, data) = train_users.randomSplit([0.01,0.99])

In [49]:
print(sample.count())
print(train_users.count())

2137
213451


In [50]:
sample.toPandas().head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,country_destination,sum_secs_elapsed,month_of_creation
0,1iaydx07yy,female,29.0,facebook,12.0,en,api,other,Moweb,iPhone,US,0.0,2
1,fde35io9q4,-unknown-,0.0,basic,0.0,en,seo,google,Web,Mac Desktop,NDF,2337422.0,6
2,mom7gx8cht,-unknown-,0.0,basic,12.0,en,api,other,Web,Windows Desktop,NDF,0.0,7
3,mufb3oqq1u,-unknown-,0.0,basic,0.0,en,other,padmapper,Web,Mac Desktop,NDF,0.0,10
4,pwweotgcx3,male,25.0,facebook,12.0,en,api,other,iOS,iPhone,NDF,291.0,1


<h5>Bulding a pipeline for sampled data</h5>

In [51]:
from pyspark.ml.feature import StringIndexer

indexers = [StringIndexer(
    inputCol=column,
    outputCol=column + '_indexed', 
    handleInvalid='keep') for column in cat_cols]

In [52]:
indexers

[StringIndexer_cdd25ef412bf,
 StringIndexer_9a9d7e61754d,
 StringIndexer_95c857d199d8,
 StringIndexer_8774a99b9397,
 StringIndexer_47db257ef767,
 StringIndexer_5fedefe4365b,
 StringIndexer_07fe715a01b0,
 StringIndexer_f073c103dc62]

In [53]:
from pyspark.ml.feature import OneHotEncoder

'''encoder = OneHotEncoder(
    inputCols=[column+'_indexed' for column in cat_cols], 
    outputCols=[column+'_encoded' for column in cat_cols])transform(indexedDF)'''

'''encoders = [OneHotEncoder(
    inputCol=column + '_indexed', 
    outputCol= column + '_encoded') for column in cat_cols if column not in ['Label']]'''




encoders = OneHotEncoder(
    inputCols=[column+'_indexed' for column in cat_cols], 
    outputCols=[column+'_encoded' for column in cat_cols])

In [54]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages = indexers + [encoders])
'''
stages = indexers + [encoders]
stages.append(encoders)
#stages.append(dtc_model)
pipeline = Pipeline().setStages(stages)'''



'\nstages = indexers + [encoders]\nstages.append(encoders)\n#stages.append(dtc_model)\npipeline = Pipeline().setStages(stages)'

In [55]:
#transformedDF = pipeline.fit(sample).transform(sample)

pipeline = pipeline.fit(sample)
sample = pipeline.transform(sample)

In [56]:
#sample.toPandas().head()

In [57]:
sample = sample.drop(*[column+'_indexed' for column in cat_cols])
sample.fillna(0.0)

DataFrame[id: string, gender: string, age: float, signup_method: string, signup_flow: float, language: string, affiliate_channel: string, affiliate_provider: string, signup_app: string, first_device_type: string, country_destination: string, sum_secs_elapsed: double, month_of_creation: int, affiliate_channel_encoded: vector, gender_encoded: vector, id_encoded: vector, signup_method_encoded: vector, affiliate_provider_encoded: vector, first_device_type_encoded: vector, signup_app_encoded: vector, language_encoded: vector]

In [58]:
##sample.toPandas().head()
sample.printSchema()

root
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: float (nullable = false)
 |-- signup_method: string (nullable = true)
 |-- signup_flow: float (nullable = false)
 |-- language: string (nullable = true)
 |-- affiliate_channel: string (nullable = true)
 |-- affiliate_provider: string (nullable = true)
 |-- signup_app: string (nullable = true)
 |-- first_device_type: string (nullable = true)
 |-- country_destination: string (nullable = true)
 |-- sum_secs_elapsed: double (nullable = false)
 |-- month_of_creation: integer (nullable = true)
 |-- affiliate_channel_encoded: vector (nullable = true)
 |-- gender_encoded: vector (nullable = true)
 |-- id_encoded: vector (nullable = true)
 |-- signup_method_encoded: vector (nullable = true)
 |-- affiliate_provider_encoded: vector (nullable = true)
 |-- first_device_type_encoded: vector (nullable = true)
 |-- signup_app_encoded: vector (nullable = true)
 |-- language_encoded: vector (nullable = true)



<h5>VectorAssembler</h5>
<p>Combines a given list of columns into a single vector column.</p>

In [59]:
from pyspark.ml.feature import VectorAssembler

#sample = sample.withColumn("age", sample["age"].cast(FloatType()))

assembler = VectorAssembler(inputCols=[i for i in sample.columns if (i !='country_destination') and (i not in cat_cols)], outputCol='features')
sample = assembler.transform(sample)

In [60]:
#sample.toPandas().head()
#sample.printSchema()

In [61]:
sample.select('features').toPandas().head()

Unnamed: 0,features
0,"(29.0, 12.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
1,"(0.0, 0.0, 2337422.0, 6.0, 0.0, 0.0, 0.0, 0.0,..."
2,"(0.0, 12.0, 0.0, 7.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,"(0.0, 0.0, 0.0, 10.0, 0.0, 0.0, 0.0, 1.0, 0.0,..."
4,"(25.0, 12.0, 291.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0..."


<h5>Make a labeld dataframe</h5>

In [62]:
sample.printSchema()

root
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: float (nullable = false)
 |-- signup_method: string (nullable = true)
 |-- signup_flow: float (nullable = false)
 |-- language: string (nullable = true)
 |-- affiliate_channel: string (nullable = true)
 |-- affiliate_provider: string (nullable = true)
 |-- signup_app: string (nullable = true)
 |-- first_device_type: string (nullable = true)
 |-- country_destination: string (nullable = true)
 |-- sum_secs_elapsed: double (nullable = false)
 |-- month_of_creation: integer (nullable = true)
 |-- affiliate_channel_encoded: vector (nullable = true)
 |-- gender_encoded: vector (nullable = true)
 |-- id_encoded: vector (nullable = true)
 |-- signup_method_encoded: vector (nullable = true)
 |-- affiliate_provider_encoded: vector (nullable = true)
 |-- first_device_type_encoded: vector (nullable = true)
 |-- signup_app_encoded: vector (nullable = true)
 |-- language_encoded: vector (nullable = true)
 |-- feat

In [63]:
indexer = StringIndexer(inputCol='country_destination', outputCol='label')
sample = indexer.fit(sample).transform(sample)

'''indexer = StringIndexer(inputCol='country_destination', outputCol='label').fit(sample).transform(sample)
'''
labels_col = sorted(set([(i[0], i[1]) for i in sample.select(sample.country_destination, sample.label).collect()]), key=lambda x: x[0])
labels_col = {int(i[1]):i[0] for i in labels_col}

In [64]:
labels_col

{11: 'AU',
 7: 'CA',
 8: 'DE',
 4: 'ES',
 3: 'FR',
 6: 'GB',
 5: 'IT',
 0: 'NDF',
 9: 'NL',
 10: 'PT',
 1: 'US',
 2: 'other'}

In [65]:
#sample.select('features', 'label').toPandas().head()

In [66]:
sample.select('features', 'label').show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(2197,[0,1,3,9,13...|  1.0|
|(2197,[2,3,8,12,9...|  0.0|
|(2197,[1,3,9,12,1...|  0.0|
|(2197,[3,7,12,136...|  0.0|
|(2197,[0,1,2,3,9,...|  0.0|
|(2197,[0,2,3,8,13...|  0.0|
|(2197,[0,3,4,14,2...|  1.0|
|(2197,[1,3,10,12,...|  3.0|
|(2197,[0,3,6,13,4...|  0.0|
|(2197,[0,3,4,12,2...|  1.0|
|(2197,[0,1,3,4,14...|  0.0|
|(2197,[0,3,7,13,4...|  1.0|
|(2197,[0,3,6,13,5...|  0.0|
|(2197,[0,3,4,13,8...|  1.0|
|(2197,[0,2,3,6,13...|  1.0|
|(2197,[0,3,4,13,1...|  1.0|
|(2197,[0,2,3,5,14...|  1.0|
|(2197,[0,2,3,8,12...|  1.0|
|(2197,[0,1,3,7,13...|  0.0|
|(2197,[1,2,3,4,12...|  0.0|
+--------------------+-----+
only showing top 20 rows



<h3>4) Test dataset</h3>
<p>Repeating the same processes done with training dataset</p>

In [96]:
test_users = spark.read\
                .format('csv')\
                .option('header', 'true')\
                .option('ignoreLeadingWhiteSpace', 'true')\
                .load('./datasets/test_users.csv')

In [97]:
test_users

DataFrame[id: string, date_account_created: string, timestamp_first_active: string, date_first_booking: string, gender: string, age: string, signup_method: string, signup_flow: string, language: string, affiliate_channel: string, affiliate_provider: string, first_affiliate_tracked: string, signup_app: string, first_device_type: string, first_browser: string]

In [98]:
test_users.toPandas().head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


In [99]:
test_users.count()

62096

<h5>Take a sample from test dataset</h5>

In [100]:
test_users, test2 = test_users.randomSplit([0.01, 0.99])

In [101]:
test_users.toPandas().head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,00ycgxqszj,2014-07-11,20140711061100,,-unknown-,,basic,0,en,direct,direct,linked,Moweb,iPhone,Mobile Safari
1,05fqwcqyte,2014-07-29,20140729213239,,FEMALE,39.0,facebook,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari
2,07fqufoai5,2014-07-02,20140702041329,,-unknown-,,basic,0,en,seo,facebook,linked,Web,Windows Desktop,Chrome
3,0acd983g4u,2014-07-28,20140728025754,,-unknown-,39.0,basic,0,en,direct,direct,omg,Moweb,Mac Desktop,Safari
4,0bxts5vxg3,2014-08-22,20140822161233,,-unknown-,,basic,0,en,seo,google,linked,Web,Windows Desktop,Chrome


In [102]:
test_users.count()

622

In [103]:
test_users = test_users.withColumn("date_account_created", test_users["date_account_created"].cast(DateType()))\
    .withColumn("date_first_booking",test_users["date_first_booking"].cast(DateType()))\
    .withColumn("age",test_users["age"].cast(FloatType()))\
    .withColumn('signup_flow', test_users['signup_flow'].cast(FloatType()))
test_users.printSchema()

root
 |-- id: string (nullable = true)
 |-- date_account_created: date (nullable = true)
 |-- timestamp_first_active: string (nullable = true)
 |-- date_first_booking: date (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: float (nullable = true)
 |-- signup_method: string (nullable = true)
 |-- signup_flow: float (nullable = true)
 |-- language: string (nullable = true)
 |-- affiliate_channel: string (nullable = true)
 |-- affiliate_provider: string (nullable = true)
 |-- first_affiliate_tracked: string (nullable = true)
 |-- signup_app: string (nullable = true)
 |-- first_device_type: string (nullable = true)
 |-- first_browser: string (nullable = true)



In [104]:
test_users.toPandas().head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,00ycgxqszj,2014-07-11,20140711061100,,-unknown-,,basic,0.0,en,direct,direct,linked,Moweb,iPhone,Mobile Safari
1,05fqwcqyte,2014-07-29,20140729213239,,FEMALE,39.0,facebook,0.0,en,sem-brand,google,omg,Web,Mac Desktop,Safari
2,07fqufoai5,2014-07-02,20140702041329,,-unknown-,,basic,0.0,en,seo,facebook,linked,Web,Windows Desktop,Chrome
3,0acd983g4u,2014-07-28,20140728025754,,-unknown-,39.0,basic,0.0,en,direct,direct,omg,Moweb,Mac Desktop,Safari
4,0bxts5vxg3,2014-08-22,20140822161233,,-unknown-,,basic,0.0,en,seo,google,linked,Web,Windows Desktop,Chrome


<h5>Calculating total session time and adding it to  test_users dataset. Then converting gender column data to lower case</h5>

In [105]:
test_users = test_users.join(total_session_time, test_users["id"] == total_session_time["user_id"],how='left_outer').select(test_users["*"],total_session_time["sum_secs_elapsed"])
test_users = test_users.withColumn("month_account_created", month("date_account_created"))
test_users = test_users.withColumn("gender", lower(col('gender')))

In [106]:
test_users.toPandas().head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,sum_secs_elapsed,month_account_created
0,mmuf5hdp18,2014-07-31,20140731043546,,-unknown-,,basic,0.0,en,sem-brand,google,omg,Web,Windows Desktop,IE,563512.0,7
1,z4s8hpuzhk,2014-09-23,20140923030808,,-unknown-,,basic,25.0,en,direct,direct,untracked,iOS,iPhone,-unknown-,845338.0,9
2,4kltl2mys3,2014-07-06,20140706202540,,male,47.0,facebook,0.0,en,seo,facebook,untracked,Moweb,iPhone,Mobile Safari,91111.0,7
3,urxh9pscd4,2014-07-02,20140702185608,,-unknown-,,basic,25.0,en,sem-brand,bing,omg,iOS,iPhone,Mobile Safari,9163372.0,7
4,07fqufoai5,2014-07-02,20140702041329,,-unknown-,,basic,0.0,en,seo,facebook,linked,Web,Windows Desktop,Chrome,415133.0,7


<h5>Drop unneeded columns</h5>

In [107]:
drop = ['date_account_created', 'timestamp_first_active', 'date_first_booking', 'first_affiliate_tracked', 'first_browser']
test_users = test_users.select([column for column in test_users.columns if column not in drop])

#test_users = test_users.select([column for column in test_users.columns if column not in drop])

<h5>Fill null values</h5>

In [108]:
test_users = test_users.fillna(0, subset=['age'])
test_users = test_users.fillna(0, subset=['sum_secs_elapsed'])
#train_users = train_users.fillna(0, subset=['signup_flow'])

In [109]:
#test_users.toPandas().head()

<h5>Using created pipeline to tansform test dataset into a format that can be used by the mode</h5>

In [110]:
test_users = pipeline.transform(test_users)

In [111]:
test_users = test_users.drop(*[column+'_indexed' for column in cat_cols])

In [112]:
test_users.fillna(0.0)
#test_users.toPandas().head()

DataFrame[id: string, gender: string, age: float, signup_method: string, signup_flow: float, language: string, affiliate_channel: string, affiliate_provider: string, signup_app: string, first_device_type: string, sum_secs_elapsed: double, month_account_created: int, affiliate_channel_encoded: vector, gender_encoded: vector, id_encoded: vector, signup_method_encoded: vector, affiliate_provider_encoded: vector, first_device_type_encoded: vector, signup_app_encoded: vector, language_encoded: vector]

In [113]:
test_users.na.fill(0)

DataFrame[id: string, gender: string, age: float, signup_method: string, signup_flow: float, language: string, affiliate_channel: string, affiliate_provider: string, signup_app: string, first_device_type: string, sum_secs_elapsed: double, month_account_created: int, affiliate_channel_encoded: vector, gender_encoded: vector, id_encoded: vector, signup_method_encoded: vector, affiliate_provider_encoded: vector, first_device_type_encoded: vector, signup_app_encoded: vector, language_encoded: vector]

In [114]:
#test_users.show(3)

<h5>Using VectorAssembler to combine columns into a single vector column</h5>

In [115]:
'''assembler = VectorAssembler(inputCols=[i for i in test_users.columns if i not in cat_cols], outputCol='features')
test_users = assembler.transform(test_users)'''


assembler = VectorAssembler(inputCols=[i for i in test_users.columns if (i !='country_destination') and (i not in cat_cols)], outputCol='features')
test_users = assembler.transform(test_users)

In [116]:
'''test_users.select('features').show()'''

test_users.select('features').toPandas().head()

Unnamed: 0,features
0,"(0.0, 0.0, 563512.0, 7.0, 0.0, 1.0, 0.0, 0.0, ..."
1,"(0.0, 25.0, 845338.0, 9.0, 1.0, 0.0, 0.0, 0.0,..."
2,"(47.0, 0.0, 91111.0, 7.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"(0.0, 25.0, 9163372.0, 7.0, 0.0, 1.0, 0.0, 0.0..."
4,"(0.0, 0.0, 415133.0, 7.0, 0.0, 0.0, 0.0, 0.0, ..."


In [117]:
test_users.printSchema()

root
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: float (nullable = false)
 |-- signup_method: string (nullable = true)
 |-- signup_flow: float (nullable = true)
 |-- language: string (nullable = true)
 |-- affiliate_channel: string (nullable = true)
 |-- affiliate_provider: string (nullable = true)
 |-- signup_app: string (nullable = true)
 |-- first_device_type: string (nullable = true)
 |-- sum_secs_elapsed: double (nullable = false)
 |-- month_account_created: integer (nullable = true)
 |-- affiliate_channel_encoded: vector (nullable = true)
 |-- gender_encoded: vector (nullable = true)
 |-- id_encoded: vector (nullable = true)
 |-- signup_method_encoded: vector (nullable = true)
 |-- affiliate_provider_encoded: vector (nullable = true)
 |-- first_device_type_encoded: vector (nullable = true)
 |-- signup_app_encoded: vector (nullable = true)
 |-- language_encoded: vector (nullable = true)
 |-- features: vector (nullable = true)



In [120]:
test_users.select('features').show()

+--------------------+
|            features|
+--------------------+
|(2197,[2,3,5,12,2...|
|(2197,[1,2,3,4,12...|
|(2197,[0,2,3,8,14...|
|(2197,[1,2,3,5,12...|
|(2197,[2,3,8,12,2...|
|(2197,[0,2,3,5,12...|
|(2197,[2,3,4,12,2...|
|(2197,[0,2,3,4,12...|
|(2197,[0,2,3,5,13...|
|(2197,[0,1,2,3,4,...|
|(2197,[0,2,3,4,13...|
|(2197,[2,3,5,12,2...|
|(2197,[1,2,3,4,12...|
|(2197,[0,2,3,4,14...|
|(2197,[1,2,3,4,12...|
|(2197,[2,3,8,12,2...|
|(2197,[2,3,4,12,2...|
|(2197,[0,1,2,3,4,...|
|(2197,[0,2,3,8,12...|
|(2197,[2,3,8,12,2...|
+--------------------+
only showing top 20 rows



<h1>Phase 2: ML model training</h1>
<p>Using Dession tree classifier</p>

<h5>Split sample dataset into training data and validation data</h5>

In [145]:
(trainingData, validationData) = sample.randomSplit([0.8,0.2])
'''trainingData = sample
validationData = test_users'''

'trainingData = sample\nvalidationData = test_users'

<h5>ML model creation</h5>

In [146]:
from pyspark.ml.classification import DecisionTreeClassifier

model = DecisionTreeClassifier(labelCol="label", featuresCol="features")
model = model.fit(trainingData)

In [147]:
predictions = model.transform(validationData)

In [148]:
predictionsDF = predictions.toPandas()

In [149]:
#predictions.select('prediction', 'probability').show(5)
predictionsDF.head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,...,signup_method_encoded,affiliate_provider_encoded,first_device_type_encoded,signup_app_encoded,language_encoded,features,label,rawPrediction,probability,prediction
0,mom7gx8cht,-unknown-,0.0,basic,12.0,en,api,other,Web,Windows Desktop,...,"(1.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 12.0, 0.0, 7.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0,"[21.0, 11.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.6, 0.3142857142857143, 0.08571428571428572,...",0.0
1,zef7h0tdip,-unknown-,0.0,basic,3.0,en,content,google,Web,Other/Unknown,...,"(1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 3.0, 0.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.0,"[21.0, 11.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.6, 0.3142857142857143, 0.08571428571428572,...",0.0
2,70xly0407d,male,42.0,basic,1.0,en,direct,direct,Web,Mac Desktop,...,"(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(42.0, 1.0, 0.0, 8.0, 1.0, 0.0, 0.0, 0.0, 0.0,...",0.0,"[63.0, 52.0, 9.0, 6.0, 2.0, 2.0, 3.0, 2.0, 1.0...","[0.44366197183098594, 0.36619718309859156, 0.0...",0.0
3,nlp8wvf631,female,25.0,basic,0.0,en,direct,direct,Web,Mac Desktop,...,"(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(25.0, 0.0, 0.0, 7.0, 1.0, 0.0, 0.0, 0.0, 0.0,...",1.0,"[39.0, 35.0, 1.0, 4.0, 2.0, 0.0, 1.0, 2.0, 0.0...","[0.4588235294117647, 0.4117647058823529, 0.011...",0.0
4,2gnj4zkcta,-unknown-,0.0,basic,0.0,en,direct,direct,Web,iPad,...,"(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 12.0, 1.0, 0.0, 0.0, 0.0, 0.0,...",0.0,"[212.0, 52.0, 20.0, 11.0, 3.0, 3.0, 1.0, 2.0, ...","[0.6773162939297125, 0.16613418530351437, 0.06...",0.0


In [150]:
#predictions.head()

<h5>Predicted y and real y</h5>

In [151]:
import numpy as np

y_predict = predictions.toPandas().probability.apply(lambda x : np.array(x.toArray())).to_numpy()
y_true = [int(row.label) for row in trainingData.select('label').collect()]

<h5>Create an evaluator for our model<h5/>

In [152]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol='label', 
    predictionCol='prediction', 
    metricName='accuracy')

<h5>Model Accuracy </h5>

In [153]:
accuracy = evaluator.evaluate(predictions)
print('Test Accuracy = ', accuracy)

Test Accuracy =  0.6121951219512195


In [155]:
predictionsDF.loc[
    predictionsDF['label'] != predictionsDF['prediction']
]

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,...,signup_method_encoded,affiliate_provider_encoded,first_device_type_encoded,signup_app_encoded,language_encoded,features,label,rawPrediction,probability,prediction
1,zef7h0tdip,-unknown-,0.0,basic,3.0,en,content,google,Web,Other/Unknown,...,"(1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 3.0, 0.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.0,"[21.0, 11.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.6, 0.3142857142857143, 0.08571428571428572,...",0.0
3,nlp8wvf631,female,25.0,basic,0.0,en,direct,direct,Web,Mac Desktop,...,"(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(25.0, 0.0, 0.0, 7.0, 1.0, 0.0, 0.0, 0.0, 0.0,...",1.0,"[39.0, 35.0, 1.0, 4.0, 2.0, 0.0, 1.0, 2.0, 0.0...","[0.4588235294117647, 0.4117647058823529, 0.011...",0.0
5,46ipvnmlrt,-unknown-,27.0,basic,0.0,en,sem-non-brand,bing,Web,Windows Desktop,...,"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(27.0, 0.0, 0.0, 10.0, 0.0, 0.0, 1.0, 0.0, 0.0...",2.0,"[52.0, 105.0, 9.0, 1.0, 6.0, 3.0, 2.0, 2.0, 0....","[0.2857142857142857, 0.5769230769230769, 0.049...",1.0
8,nob3onuzbx,-unknown-,0.0,basic,0.0,en,sem-brand,google,Web,Mac Desktop,...,"(1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 5693565.0, 3.0, 0.0, 1.0, 0.0, 0.0,...",1.0,"[160.0, 25.0, 4.0, 3.0, 2.0, 0.0, 0.0, 1.0, 0....","[0.8205128205128205, 0.1282051282051282, 0.020...",0.0
9,78v20tnmv0,-unknown-,22.0,basic,0.0,en,seo,facebook,Web,Windows Desktop,...,"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(22.0, 0.0, 214141.0, 2.0, 0.0, 0.0, 0.0, 0.0,...",5.0,"[39.0, 35.0, 1.0, 4.0, 2.0, 0.0, 1.0, 2.0, 0.0...","[0.4588235294117647, 0.4117647058823529, 0.011...",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,xclxubnoyl,-unknown-,0.0,basic,0.0,en,direct,direct,Web,Mac Desktop,...,"(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 175181.0, 5.0, 1.0, 0.0, 0.0, 0.0, ...",1.0,"[212.0, 52.0, 20.0, 11.0, 3.0, 3.0, 1.0, 2.0, ...","[0.6773162939297125, 0.16613418530351437, 0.06...",0.0
401,j5xtww69sm,-unknown-,24.0,basic,25.0,en,direct,direct,iOS,Mac Desktop,...,"(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(24.0, 25.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0...",1.0,"[39.0, 35.0, 1.0, 4.0, 2.0, 0.0, 1.0, 2.0, 0.0...","[0.4588235294117647, 0.4117647058823529, 0.011...",0.0
404,aoqp5jexio,male,30.0,facebook,0.0,en,direct,direct,Web,Mac Desktop,...,"(0.0, 1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(30.0, 0.0, 0.0, 11.0, 1.0, 0.0, 0.0, 0.0, 0.0...",1.0,"[186.0, 100.0, 16.0, 7.0, 2.0, 3.0, 1.0, 2.0, ...","[0.577639751552795, 0.3105590062111801, 0.0496...",0.0
407,iz21xq5i1e,-unknown-,63.0,basic,0.0,en,sem-non-brand,google,Web,Windows Desktop,...,"(1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(63.0, 0.0, 0.0, 12.0, 0.0, 0.0, 1.0, 0.0, 0.0...",2.0,"[41.0, 64.0, 6.0, 5.0, 1.0, 3.0, 3.0, 1.0, 1.0...","[0.3228346456692913, 0.5039370078740157, 0.047...",1.0


<h5>Final prediction</h5>

In [156]:
predictionsDF.head()

Unnamed: 0,id,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,signup_app,first_device_type,...,signup_method_encoded,affiliate_provider_encoded,first_device_type_encoded,signup_app_encoded,language_encoded,features,label,rawPrediction,probability,prediction
0,mom7gx8cht,-unknown-,0.0,basic,12.0,en,api,other,Web,Windows Desktop,...,"(1.0, 0.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 12.0, 0.0, 7.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0,"[21.0, 11.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.6, 0.3142857142857143, 0.08571428571428572,...",0.0
1,zef7h0tdip,-unknown-,0.0,basic,3.0,en,content,google,Web,Other/Unknown,...,"(1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 3.0, 0.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3.0,"[21.0, 11.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.6, 0.3142857142857143, 0.08571428571428572,...",0.0
2,70xly0407d,male,42.0,basic,1.0,en,direct,direct,Web,Mac Desktop,...,"(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(42.0, 1.0, 0.0, 8.0, 1.0, 0.0, 0.0, 0.0, 0.0,...",0.0,"[63.0, 52.0, 9.0, 6.0, 2.0, 2.0, 3.0, 2.0, 1.0...","[0.44366197183098594, 0.36619718309859156, 0.0...",0.0
3,nlp8wvf631,female,25.0,basic,0.0,en,direct,direct,Web,Mac Desktop,...,"(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(25.0, 0.0, 0.0, 7.0, 1.0, 0.0, 0.0, 0.0, 0.0,...",1.0,"[39.0, 35.0, 1.0, 4.0, 2.0, 0.0, 1.0, 2.0, 0.0...","[0.4588235294117647, 0.4117647058823529, 0.011...",0.0
4,2gnj4zkcta,-unknown-,0.0,basic,0.0,en,direct,direct,Web,iPad,...,"(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 12.0, 1.0, 0.0, 0.0, 0.0, 0.0,...",0.0,"[212.0, 52.0, 20.0, 11.0, 3.0, 3.0, 1.0, 2.0, ...","[0.6773162939297125, 0.16613418530351437, 0.06...",0.0
