In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from sklearn.metrics import accuracy_score





## LOADING TRANSFORMED DATA 

In [16]:
# Start (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName('KickStarter_ML')
    .getOrCreate()
)

# Read the cleaned CSV: the first row holds the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    'kickstarter_cleaned.csv',
    header=True,
    inferSchema=True,
)
df.printSchema()

# Preview the first five rows (no feature vector has been assembled yet).
df.show(5)



root
 |-- main_category: integer (nullable = true)
 |-- currency: string (nullable = true)
 |-- deadline: date (nullable = true)
 |-- launched: date (nullable = true)
 |-- state: integer (nullable = true)
 |-- backers: integer (nullable = true)
 |-- country: integer (nullable = true)
 |-- usd_pledged_real: integer (nullable = true)
 |-- usd_goal_real: integer (nullable = true)
 |-- duration in days: integer (nullable = true)

+-------------+--------+----------+----------+-----+-------+-------+----------------+-------------+----------------+
|main_category|currency|  deadline|  launched|state|backers|country|usd_pledged_real|usd_goal_real|duration in days|
+-------------+--------+----------+----------+-----+-------+-------+----------------+-------------+----------------+
|            6|     USD|2012-08-10|2012-07-07|    0|     12|     21|             296|         4000|              34|
|            6|     USD|2014-01-05|2013-11-21|    1|    148|     21|           25712|        25000|   

## Prepping our pipeline

In [17]:
# Columns that get packed into the single 'feature_vector' column the models read.
# NOTE(review): 'usd_pledged_real' and 'backers' are only known once a campaign
# has run, and 'state' presumably follows from pledged vs. goal -- possible
# target leakage that would inflate the accuracies; confirm these two columns
# really belong in the feature set.
numericCols = ['main_category','backers','country','usd_pledged_real','usd_goal_real','duration in days']
featurizationPipeline = Pipeline(stages = [VectorAssembler(inputCols=numericCols, outputCol="feature_vector")])
featurizationPipelineModel = featurizationPipeline.fit(df)
# Drop any 'feature_vector' left over from a previous run of this cell before
# transforming: VectorAssembler refuses to write to an output column that
# already exists, so re-running the cell would otherwise fail. drop() is a
# no-op when the column is absent, so the first run is unaffected.
df = featurizationPipelineModel.transform(df.drop("feature_vector"))
# 70/30 train/test split; the fixed seed keeps the split repeatable.
train, test = df.randomSplit([0.7, 0.3], seed = 2018)



## Logistic Regression model

In [18]:
# Logistic regression reading 'feature_vector' and predicting 'state',
# limited to 10 optimizer iterations.
lr = LogisticRegression(
    labelCol='state',
    featuresCol='feature_vector',
    maxIter=10,
)
lrModel = lr.fit(train)

# Score the held-out test split with the fitted model.
predictions = lrModel.transform(test)

### Model Evaluation

In [19]:


# Single-column Spark views, kept under their original names.
true_labels=predictions.select('state')
lr_predictions=predictions.select('prediction')

# Collect label and prediction together in ONE pass: this runs a single Spark
# job instead of two, and each label stays on the same row as its prediction
# instead of relying on two separate collects returning rows in the same order.
lr_scored = predictions.select('state', 'prediction').toPandas()

accuracy = accuracy_score(lr_scored['state'], lr_scored['prediction'])
print("Logistic Regression Accuracy =",accuracy*100,"%")

Logistic Regression Accuracy = 79.6331976807001 %


## Decision tree classifier

In [20]:

# Decision tree (maximum depth 3) reading 'feature_vector' and predicting 'state'.
dt = DecisionTreeClassifier(
    labelCol='state',
    featuresCol='feature_vector',
    maxDepth=3,
)

# Fit on the training split, then score the held-out test split.
dtModel = dt.fit(train)
predictions = dtModel.transform(test)

### Evaluation

In [21]:

# Single-column Spark views, kept under their original names.
true_labels=predictions.select('state')
dt_predictions=predictions.select('prediction')

# Collect label and prediction together in ONE pass: this runs a single Spark
# job instead of two, and each label stays on the same row as its prediction
# instead of relying on two separate collects returning rows in the same order.
dt_scored = predictions.select('state', 'prediction').toPandas()

accuracy = accuracy_score(dt_scored['state'], dt_scored['prediction'])
print("Decision Tree Accuracy =",accuracy*100,"%")



Decision Tree Accuracy = 92.01840314041849 %


## Random Forest Classifier 

In [22]:

# Random forest of 10 trees reading 'feature_vector' and predicting 'state'.
# A random forest is a stochastic learner, so pin the seed (same value as the
# train/test split) to make the fitted model -- and the accuracy reported
# below -- repeatable across kernel restarts.
rf = RandomForestClassifier(
    featuresCol = 'feature_vector',
    labelCol = 'state',
    numTrees=10,
    seed=2018,
)
rfModel = rf.fit(train)
# Score the held-out test split with the fitted forest.
predictions = rfModel.transform(test)



### Evaluation

In [23]:

# Single-column Spark views, kept under their original names.
true_labels=predictions.select('state')
rf_predictions=predictions.select('prediction')

# Collect label and prediction together in ONE pass: this runs a single Spark
# job instead of two, and each label stays on the same row as its prediction
# instead of relying on two separate collects returning rows in the same order.
rf_scored = predictions.select('state', 'prediction').toPandas()

accuracy = accuracy_score(rf_scored['state'], rf_scored['prediction'])
print("Random Forest Accuracy =",accuracy*100,"%")



Random Forest Accuracy = 95.75935462959629 %


## Gradient Boosted tree classifier

In [24]:

# Gradient-boosted trees (10 boosting iterations) reading 'feature_vector'
# and predicting 'state'.
gbt = GBTClassifier(
    labelCol='state',
    featuresCol='feature_vector',
    maxIter=10,
)

# Fit on the training split, then score the held-out test split.
gbtModel = gbt.fit(train)
predictions = gbtModel.transform(test)



### Evaluation

In [25]:

# Single-column Spark views, kept under their original names.
true_labels=predictions.select('state')
gbt_predictions=predictions.select('prediction')

# Collect label and prediction together in ONE pass: this runs a single Spark
# job instead of two, and each label stays on the same row as its prediction
# instead of relying on two separate collects returning rows in the same order.
gbt_scored = predictions.select('state', 'prediction').toPandas()

accuracy = accuracy_score(gbt_scored['state'], gbt_scored['prediction'])
print("Gradient Boosted Tree Accuracy =",accuracy*100,"%")

Gradient Boosted Tree Accuracy = 98.02553390715599 %


### It is understandable that the logistic regression model performed slightly worse than its peers, due to the high number of outliers across our data set; however, 79% is considered acceptable.