### Importing packages and starting spark context

In [1]:
# import packages
import findspark
import os
import pickle
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col , column
from pyspark import SparkContext

In [2]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml import Pipeline, Model

#from pyspark.ml.feature import VectorIndexer

In [3]:
spark = SparkSession.builder \
     .appName("model-data") \
     .getOrCreate()
sc = spark.sparkContext

### Importing data and splitting into train/test sets

In [4]:
data_pandas = pickle.load(open('merged_data.pkl','rb'))
data_pandas.columns = [x.replace('.','_') for x in data_pandas.columns]
#data_pandas = data_pandas[data_pandas['Year'] == 2013]

In [5]:
sqlCtx = pyspark.SQLContext(sc)

In [6]:
data = sqlCtx.createDataFrame(data_pandas).rdd

  An error occurred while calling z:org.apache.spark.sql.api.python.PythonSQLUtils.readArrowStreamFromFile.
: java.lang.IllegalArgumentException
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:547)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.readNextBatch(ArrowConverters.scala:243)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.<init>(ArrowConverters.scala:229)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.getBatchesFromStream(ArrowConverters.scala:228)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anonfun$readArrowStreamFromFile$2.apply(ArrowConverters.scala:216)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anonfun$readArrowStreamFromFile$2.apply(ArrowConverters.scala:214)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2574)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.readArrowStreamFr

In [7]:
data.first()

Row(Country='Albania', Year=1994, Max_Partners=75, GDP_per_unit_CO2=5.542105, PPP_Conv_Rate=26.714, PPP_Share_GDP=0.023, Imports_PC=0.0, Exports_PC=0.0, Govt_Revenue=-6.4239999999999995, gdp_per_cap=1493.7902013673001, agri_perc_gdp=36.4107030664902, agg_empl_agri_perc=54.840999603271506, rural_pop_perc=61.646, pop_tot=3207536.0, mobilesub_per100peeps=0.0, intl_tourist_arrival=1062000.0, total_life_exp=71.992, life_expectancy_fe=75.158, life_exp_male=69.15, trade_perGDP=53.1025847356112, ISO_A3='ALB', cam='F121994', mean_light=0.8159073253210255, mean_light_diff=0.5516382580084997)

In [8]:
# Split the data into training and test sets (30% held out for testing)
df_train, df_test = data.randomSplit([0.75,0.25], 5242)

### Model 1
Linear regression with many predictors

In [9]:
vecAssembler = VectorAssembler(inputCols=["GDP_per_unit_CO2", 
                                          "Imports_PC", 
                                          "Exports_PC", 
                                          "Govt_Revenue",
                                         "agri_perc_gdp",
                                         "mean_light",
                                         "intl_tourist_arrival",
                                         "mobilesub_per100peeps",
                                         "total_life_exp",
                                         "pop_tot",
                                         "Year"], 
                               outputCol="features")

In [10]:
# Train a linear regression model
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, lr])
                            
# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)
                            
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

lrModel = model.stages[1]
print(lrModel)  # summary only

+-----------------+------------------+--------------------+
|       prediction|       gdp_per_cap|            features|
+-----------------+------------------+--------------------+
| 6793.08603833802|1493.7902013673001|[5.542105,0.0,0.0...|
|6649.516431187745|1869.8712552067498|[7.47721,-4.94372...|
|7245.683225839399|  2085.43199967837|[5.217429,38.743,...|
|6125.136547465809|  2244.63109245948|[5.288641,-3.432,...|
|7887.005368274171|  2453.63147595272|[5.476603,23.272,...|
+-----------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 14178.7
LinearRegression_7072f3df54a0


### Model 2
Random Forest with many predictors

In [11]:
vecAssembler = VectorAssembler(inputCols=["GDP_per_unit_CO2", 
                                          "Imports_PC", 
                                          "Exports_PC", 
                                          "Govt_Revenue",
                                         "agri_perc_gdp",
                                         "mean_light",
                                         "intl_tourist_arrival",
                                         "mobilesub_per100peeps",
                                         "total_life_exp",
                                         "pop_tot",
                                         "Year"], 
                               outputCol="features")

In [12]:
# Train a RandomForest model
rf = RandomForestRegressor(labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

+------------------+------------------+--------------------+
|        prediction|       gdp_per_cap|            features|
+------------------+------------------+--------------------+
| 3115.043094636579|1493.7902013673001|[5.542105,0.0,0.0...|
|3032.3601009110816|1869.8712552067498|[7.47721,-4.94372...|
|  3188.33139315203|  2085.43199967837|[5.217429,38.743,...|
|  3188.33139315203|  2244.63109245948|[5.288641,-3.432,...|
|3268.4987165458724|  2453.63147595272|[5.476603,23.272,...|
+------------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 5675.03
RandomForestRegressionModel (uid=RandomForestRegressor_2507e40730a4) with 20 trees


### Model 3
Linear regression with only mean light and year as predictors

In [13]:
vecAssembler = VectorAssembler(inputCols=["mean_light","Year"], 
                               outputCol="features")

In [14]:
# Train a linear regression model
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, lr])
                            
# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)
                            
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

lrModel = model.stages[1]
print(lrModel)  # summary only

+------------------+------------------+--------------------+
|        prediction|       gdp_per_cap|            features|
+------------------+------------------+--------------------+
|10191.756502234435|1493.7902013673001|[0.81590732532102...|
|11178.062952535489|1869.8712552067498|[1.45642736370698...|
|11037.505660530005|  2085.43199967837|[1.02403692363935...|
| 12051.13618129256|  2244.63109245948|[1.79189325046840...|
|12831.834547041944|  2453.63147595272|[2.35943426404057...|
+------------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 18560.3
LinearRegression_4cfc1e5ec982


In [15]:
trainingSummary = model.summary

AttributeError: 'PipelineModel' object has no attribute 'summary'

### Model 4
Random Forest with only mean light and year as predictors

In [16]:
vecAssembler = VectorAssembler(inputCols=["mean_light","Year"], 
                               outputCol="features")

In [17]:
# Train a RandomForest model
rf = RandomForestRegressor(labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

+------------------+------------------+--------------------+
|        prediction|       gdp_per_cap|            features|
+------------------+------------------+--------------------+
|  8556.06500585829|1493.7902013673001|[0.81590732532102...|
|7726.2937849379105|1869.8712552067498|[1.45642736370698...|
| 8311.545749476265|  2085.43199967837|[1.02403692363935...|
|   8146.2158511715|  2244.63109245948|[1.79189325046840...|
| 8320.783025050743|  2453.63147595272|[2.35943426404057...|
+------------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 17651.8
RandomForestRegressionModel (uid=RandomForestRegressor_f0a70994a3f7) with 20 trees


### Model 5
GBM with only mean light and year as predictors

In [18]:
vecAssembler = VectorAssembler(inputCols=["mean_light","Year"], 
                               outputCol="features")

In [19]:
# Train a GBT model
gbt = GBTRegressor(labelCol="gdp_per_cap", featuresCol="features", maxIter=20)

# Chain indexer and GBT in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, gbt])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

gbtModel = model.stages[1]
print(gbtModel)  # summary only

+------------------+------------------+--------------------+
|        prediction|       gdp_per_cap|            features|
+------------------+------------------+--------------------+
| 7176.535236975383|1493.7902013673001|[0.81590732532102...|
| 6400.917916514489|1869.8712552067498|[1.45642736370698...|
|5963.1827261712815|  2085.43199967837|[1.02403692363935...|
| 6869.203020798838|  2244.63109245948|[1.79189325046840...|
| 4606.107522968545|  2453.63147595272|[2.35943426404057...|
+------------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 18022.8
GBTRegressionModel (uid=GBTRegressor_c34580b921f2) with 20 trees
