In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session for this notebook; the application is named "kickstarter".
session_builder = SparkSession.builder.appName("kickstarter")
ss = session_builder.getOrCreate()

In [3]:
# Read the Kickstarter dataset from its parquet file into a Spark DataFrame.
kickstarter_path = "../data/kickstarter.parquet"
df = ss.read.parquet(kickstarter_path)

Getting useful columns

In [4]:
# Keep the column names in a list and display each name next to its position,
# because the following cells pick columns by index.
columns = df.columns
list(enumerate(columns))

[(0, 'project_id'),
 (1, 'name'),
 (2, 'desc'),
 (3, 'goal'),
 (4, 'keywords'),
 (5, 'final_status'),
 (6, 'country_clean'),
 (7, 'currency_clean'),
 (8, 'deadline_clean'),
 (9, 'created_at_clean'),
 (10, 'launched_at_clean'),
 (11, 'days_campaign'),
 (12, 'hours_prepa')]

In [5]:
# Columns at positions 6 through 12 (inclusive) of the dataset.
clf_columns = [columns[position] for position in range(6, 13)]

In [6]:
# Preview the first five rows, restricted to the columns listed in clf_columns.
clf_preview = df.select(clf_columns)
clf_preview.show(n=5)

+-------------+--------------+--------------+----------------+-----------------+-------------+-----------+
|country_clean|currency_clean|deadline_clean|created_at_clean|launched_at_clean|days_campaign|hours_prepa|
+-------------+--------------+--------------+----------------+-----------------+-------------+-----------+
|           US|           USD|    2010-07-31|      2010-06-14|       2010-06-16|           45|      29.69|
|           US|           USD|    2011-09-07|      2011-08-06|       2011-08-08|           30|      62.02|
|           US|           USD|    2011-09-29|      2011-08-24|       2011-08-30|           30|      143.8|
|           US|           USD|    2011-05-01|      2011-03-22|       2011-03-22|           40|       1.99|
|           US|           USD|    2011-05-10|      2011-02-02|       2011-02-08|           91|     134.81|
+-------------+--------------+--------------+----------------+-----------------+-------------+-----------+
only showing top 5 rows



Feature engineering

In [7]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [8]:
# Fit one StringIndexer per categorical column (columns[6] and columns[7]).
# Each indexer writes its result to a new "<column>_index" column.
country_col = columns[6]
currency_col = columns[7]

country_indexer_stage = StringIndexer(inputCol=country_col, outputCol=country_col + "_index")
currency_indexer_stage = StringIndexer(inputCol=currency_col, outputCol=currency_col + "_index")

indexer_country = country_indexer_stage.fit(df)
indexer_currency = currency_indexer_stage.fit(df)

In [9]:
# Apply both fitted indexers (country first, then currency), which appends the
# two "_index" columns to df.
df = indexer_currency.transform(indexer_country.transform(df))

In [10]:
# One OneHotEncoder per indexed column: reads "<column>_index" and writes
# "<column>_vector".
country_name = columns[6]
currency_name = columns[7]

encoder_country = OneHotEncoder(
    inputCol=country_name + "_index",
    outputCol=country_name + "_vector",
)
encoder_currency = OneHotEncoder(
    inputCol=currency_name + "_index",
    outputCol=currency_name + "_vector",
)

In [11]:
# Add the one-hot "_vector" columns to df (country first, then currency).
#
# In Spark 2.x, OneHotEncoder is a Transformer and can be applied directly.
# From Spark 3.0 on, OneHotEncoder is an Estimator: it has no transform()
# and must be fitted first to obtain a model that does. Fitting only when a
# fit() method exists keeps this cell working on both.
for encoder in (encoder_country, encoder_currency):
    fitted_encoder = encoder.fit(df) if hasattr(encoder, "fit") else encoder
    df = fitted_encoder.transform(df)

In [18]:
# Feature columns: the two one-hot "_vector" columns followed by columns[11]
# and columns[12]; show the first five rows of them.
encoded_names = [columns[position] + "_vector" for position in (6, 7)]
numeric_names = [columns[11], columns[12]]
f_columns = encoded_names + numeric_names
df.select(f_columns).show(5)

+--------------------+---------------------+-------------+-----------+
|country_clean_vector|currency_clean_vector|days_campaign|hours_prepa|
+--------------------+---------------------+-------------+-----------+
|      (10,[0],[1.0])|        (8,[0],[1.0])|           45|      29.69|
|      (10,[0],[1.0])|        (8,[0],[1.0])|           30|      62.02|
|      (10,[0],[1.0])|        (8,[0],[1.0])|           30|      143.8|
|      (10,[0],[1.0])|        (8,[0],[1.0])|           40|       1.99|
|      (10,[0],[1.0])|        (8,[0],[1.0])|           91|     134.81|
+--------------------+---------------------+-------------+-----------+
only showing top 5 rows



Assembling the vectors

In [21]:
# Combine every column listed in f_columns into a single "features" vector column.
df_assembler = VectorAssembler(
    inputCols=f_columns,
    outputCol="features",
)
df = df_assembler.transform(df)

In [25]:
# Show the assembled feature vector next to columns[5] for the first five
# rows, without truncating long cell values.
label_col = columns[5]
df.select("features", label_col).show(n=5, truncate=False)

+---------------------------------------+------------+
|features                               |final_status|
+---------------------------------------+------------+
|(20,[0,10,18,19],[1.0,1.0,45.0,29.69]) |0           |
|(20,[0,10,18,19],[1.0,1.0,30.0,62.02]) |1           |
|(20,[0,10,18,19],[1.0,1.0,30.0,143.8]) |0           |
|(20,[0,10,18,19],[1.0,1.0,40.0,1.99])  |0           |
|(20,[0,10,18,19],[1.0,1.0,91.0,134.81])|0           |
+---------------------------------------+------------+
only showing top 5 rows

