### Importing packages and starting spark context

In [1]:
# import packages
import findspark
import os
import pickle
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col , column
from pyspark import SparkContext

In [2]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml import Pipeline, Model

#from pyspark.ml.feature import VectorIndexer

In [3]:
spark = SparkSession.builder \
     .appName("model-data") \
     .getOrCreate()
sc = spark.sparkContext

### Importing data and splitting into train/test sets

In [4]:
data_pandas = pickle.load(open('merged_data.pkl','rb'))
data_pandas.columns = [x.replace('.','_') for x in data_pandas.columns]
#data_pandas = data_pandas[data_pandas['Year'] == 2013]

In [5]:
sqlCtx = pyspark.SQLContext(sc)

In [6]:
data = sqlCtx.createDataFrame(data_pandas).rdd

  An error occurred while calling z:org.apache.spark.sql.api.python.PythonSQLUtils.readArrowStreamFromFile.
: java.lang.IllegalArgumentException
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:547)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.readNextBatch(ArrowConverters.scala:243)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.<init>(ArrowConverters.scala:229)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.getBatchesFromStream(ArrowConverters.scala:228)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anonfun$readArrowStreamFromFile$2.apply(ArrowConverters.scala:216)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anonfun$readArrowStreamFromFile$2.apply(ArrowConverters.scala:214)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2574)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.readArrowStreamFr

In [7]:
data.first()

Row(Country='Albania', Year=1993, Max_Partners=75, GDP_per_unit_CO2=5.26484, PPP_Conv_Rate=19.912, PPP_Share_GDP=0.022, Imports_PC=0.0, Exports_PC=0.0, Govt_Revenue=-6.424, gdp_per_cap=1370.8300162470098, agri_perc_gdp=36.4107030664902, agg_empl_agri_perc=55.470001220703104, rural_pop_perc=62.201, pop_tot=3227287.0, mobilesub_per100peeps=0.0, intl_tourist_arrival=1062000.0, total_life_exp=71.86, life_expectancy_fe=75.039, life_exp_male=69.03699999999999, trade_perGDP=80.518332770413, gdp_per_cap_index=100.0, ISO_A3='ALB', cam='F101993', mean_light=0.2642690673125257, mean_light_diff=nan, mean_light_pct_diff=nan, mean_light_index=100.0)

In [8]:
# Split the data into training and test sets (30% held out for testing)
df_train, df_test = data.randomSplit([0.75,0.25], 5242)

### Model 1
Linear regression with many predictors

In [9]:
vecAssembler = VectorAssembler(inputCols=["GDP_per_unit_CO2", 
                                          "Imports_PC", 
                                          "Exports_PC", 
                                          "Govt_Revenue",
                                         "agri_perc_gdp",
                                         "mean_light",
                                         "intl_tourist_arrival",
                                         "mobilesub_per100peeps",
                                         "total_life_exp",
                                         "pop_tot",
                                         "Year"], 
                               outputCol="features")

In [10]:
# Train a linear regression model
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, lr])
                            
# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)
                            
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

lrModel = model.stages[1]
print(lrModel)  # summary only

+------------------+------------------+--------------------+
|        prediction|       gdp_per_cap|            features|
+------------------+------------------+--------------------+
| 3618.190015401924|1370.8300162470098|[5.26484,0.0,0.0,...|
| 4099.429490585346|1703.2867473347305|[6.90542899999999...|
|  6373.74568204931|  1835.65196486823|[7.673067,28.907,...|
|2590.3402672579978|  2085.43199967837|[5.217429,38.743,...|
| 4755.734628685517|  2244.63109245948|[5.288641,-3.432,...|
+------------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 14075.2
LinearRegression_821eb5f6578d


### Model 2
Linear regression with many predictors, indices used instead actual values for mean light and gdp per cap

In [11]:
vecAssembler = VectorAssembler(inputCols=["GDP_per_unit_CO2", 
                                          "Imports_PC", 
                                          "Exports_PC", 
                                          "Govt_Revenue",
                                         "agri_perc_gdp",
                                         "mean_light_index",
                                         "intl_tourist_arrival",
                                         "mobilesub_per100peeps",
                                         "total_life_exp",
                                         "pop_tot",
                                         "Year"], 
                               outputCol="features")

In [12]:
# Train a linear regression model
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol="gdp_per_cap_index", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, lr])
                            
# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap_index", "features").show(5)
                            
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap_index", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

lrModel = model.stages[1]
print(lrModel)  # summary only

+------------------+-----------------+--------------------+
|        prediction|gdp_per_cap_index|            features|
+------------------+-----------------+--------------------+
| 95.20914615334459|            100.0|[5.26484,0.0,0.0,...|
|134.02972715941087| 124.252221438651|[6.90542899999999...|
|129.79109202665404|133.9080661432981|[7.673067,28.907,...|
|155.75952860158122|152.1291462079129|[5.217429,38.743,...|
| 165.2711601517849|163.7424819894679|[5.288641,-3.432,...|
+------------------+-----------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 31.9452
LinearRegression_2cc7afd88618


### Model 3
Random Forest with many predictors

In [13]:
vecAssembler = VectorAssembler(inputCols=["GDP_per_unit_CO2", 
                                          "Imports_PC", 
                                          "Exports_PC", 
                                          "Govt_Revenue",
                                         "agri_perc_gdp",
                                         "mean_light",
                                         "intl_tourist_arrival",
                                         "mobilesub_per100peeps",
                                         "total_life_exp",
                                         "pop_tot",
                                         "Year"], 
                               outputCol="features")

In [14]:
# Train a RandomForest model
rf = RandomForestRegressor(labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

+------------------+------------------+--------------------+
|        prediction|       gdp_per_cap|            features|
+------------------+------------------+--------------------+
|2614.5381855651494|1370.8300162470098|[5.26484,0.0,0.0,...|
| 3065.420300651952|1703.2867473347305|[6.90542899999999...|
| 2863.158558048729|  1835.65196486823|[7.673067,28.907,...|
|3215.9602658428526|  2085.43199967837|[5.217429,38.743,...|
|3215.9602658428526|  2244.63109245948|[5.288641,-3.432,...|
+------------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 6220.35
RandomForestRegressionModel (uid=RandomForestRegressor_f99d57853c9a) with 20 trees


### Model 4
Random Forest with many predictors, indices used instead actual values for mean light and gdp per cap

In [15]:
vecAssembler = VectorAssembler(inputCols=["GDP_per_unit_CO2", 
                                          "Imports_PC", 
                                          "Exports_PC", 
                                          "Govt_Revenue",
                                         "agri_perc_gdp",
                                         "mean_light_index",
                                         "intl_tourist_arrival",
                                         "mobilesub_per100peeps",
                                         "total_life_exp",
                                         "pop_tot",
                                         "Year"], 
                               outputCol="features")

In [16]:
# Train a RandomForest model
rf = RandomForestRegressor(labelCol="gdp_per_cap_index", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap_index", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap_index", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

+------------------+-----------------+--------------------+
|        prediction|gdp_per_cap_index|            features|
+------------------+-----------------+--------------------+
| 104.4902253876747|            100.0|[5.26484,0.0,0.0,...|
| 115.9352621486014| 124.252221438651|[6.90542899999999...|
| 128.9266783166378|133.9080661432981|[7.673067,28.907,...|
|131.85593891111242|152.1291462079129|[5.217429,38.743,...|
|137.02851763172092|163.7424819894679|[5.288641,-3.432,...|
+------------------+-----------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 26.2329
RandomForestRegressionModel (uid=RandomForestRegressor_6b64ffb79fad) with 20 trees


### Model 5
Linear regression with only mean light and year as predictors

In [17]:
vecAssembler = VectorAssembler(inputCols=["mean_light","Year"], 
                               outputCol="features")

In [18]:
# Train a linear regression model
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, lr])
                            
# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)
                            
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

lrModel = model.stages[1]
print(lrModel)  # summary only

+------------------+------------------+--------------------+
|        prediction|       gdp_per_cap|            features|
+------------------+------------------+--------------------+
| 8271.037677164772|1370.8300162470098|[0.26426906731252...|
|  9859.03266593418|1703.2867473347305|[1.25346159118950...|
| 9886.627258715569|  1835.65196486823|[0.74560160855458...|
|10425.093313362857|  2085.43199967837|[1.02403692363935...|
|11542.121118358977|  2244.63109245948|[1.79189325046840...|
+------------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 18020.8
LinearRegression_c55a9a314388


In [None]:
trainingSummary = model.summary

### Model 6
Linear regression with only mean light index and year as predictors, gdp per cap index as target

In [19]:
vecAssembler = VectorAssembler(inputCols=["mean_light_index","Year"], 
                               outputCol="features")

In [20]:
# Train a linear regression model
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol="gdp_per_cap_index", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, lr])
                            
# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap_index", "features").show(5)
                            
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap_index", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

lrModel = model.stages[1]
print(lrModel)  # summary only

+------------------+-----------------+--------------------+
|        prediction|gdp_per_cap_index|            features|
+------------------+-----------------+--------------------+
|  93.3009336179166|            100.0|      [100.0,1993.0]|
| 134.4514533017482| 124.252221438651|[474.312640498011...|
|126.88314105815971|133.9080661432981|[282.137298979768...|
|139.93765892714055|152.1291462079129|[387.497838492132...|
| 170.0202718291821|163.7424819894679|[678.056372125194...|
+------------------+-----------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 32.7086
LinearRegression_615a1cd1a5dc


### Model 7
Random Forest with only mean light and year as predictors

In [21]:
vecAssembler = VectorAssembler(inputCols=["mean_light","Year"], 
                               outputCol="features")

In [22]:
# Train a RandomForest model
rf = RandomForestRegressor(labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

+-----------------+------------------+--------------------+
|       prediction|       gdp_per_cap|            features|
+-----------------+------------------+--------------------+
|6248.330717664234|1370.8300162470098|[0.26426906731252...|
|7269.697413816684|1703.2867473347305|[1.25346159118950...|
|8397.220859717576|  1835.65196486823|[0.74560160855458...|
|7983.565311745966|  2085.43199967837|[1.02403692363935...|
| 8478.69258454948|  2244.63109245948|[1.79189325046840...|
+-----------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 16848.6
RandomForestRegressionModel (uid=RandomForestRegressor_aa2cf68cf711) with 20 trees


### Model 8
Random Forest with only mean light index and year as predictors, gdp per cap index as target

In [23]:
vecAssembler = VectorAssembler(inputCols=["mean_light_index","Year"], 
                               outputCol="features")

In [24]:
# Train a RandomForest model
rf = RandomForestRegressor(labelCol="gdp_per_cap_index", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap_index", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap_index", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

+------------------+-----------------+--------------------+
|        prediction|gdp_per_cap_index|            features|
+------------------+-----------------+--------------------+
| 100.2450916792021|            100.0|      [100.0,1993.0]|
|129.21780460336925| 124.252221438651|[474.312640498011...|
|116.79302797221314|133.9080661432981|[282.137298979768...|
|133.21226186625523|152.1291462079129|[387.497838492132...|
|138.57887843029985|163.7424819894679|[678.056372125194...|
+------------------+-----------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 34.3643
RandomForestRegressionModel (uid=RandomForestRegressor_25d090f21c8e) with 20 trees


### Model 9
GBM with only mean light and year as predictors

In [25]:
vecAssembler = VectorAssembler(inputCols=["mean_light","Year"], 
                               outputCol="features")

In [26]:
# Train a GBT model
gbt = GBTRegressor(labelCol="gdp_per_cap", featuresCol="features", maxIter=20)

# Chain indexer and GBT in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, gbt])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

gbtModel = model.stages[1]
print(gbtModel)  # summary only

+-----------------+------------------+--------------------+
|       prediction|       gdp_per_cap|            features|
+-----------------+------------------+--------------------+
|6489.928962321381|1370.8300162470098|[0.26426906731252...|
|6444.777231577415|1703.2867473347305|[1.25346159118950...|
|9354.440305248701|  1835.65196486823|[0.74560160855458...|
|5646.446407801378|  2085.43199967837|[1.02403692363935...|
|5804.858576409009|  2244.63109245948|[1.79189325046840...|
+-----------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 17463
GBTRegressionModel (uid=GBTRegressor_6be07f46e910) with 20 trees


### Model 10
GBM with only mean light index and year as predictors, gdp per cap index as target

In [27]:
vecAssembler = VectorAssembler(inputCols=["mean_light_index","Year"], 
                               outputCol="features")

In [28]:
# Train a GBT model
gbt = GBTRegressor(labelCol="gdp_per_cap_index", featuresCol="features", maxIter=20)

# Chain indexer and GBT in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, gbt])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap_index", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap_index", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

gbtModel = model.stages[1]
print(gbtModel)  # summary only

+------------------+-----------------+--------------------+
|        prediction|gdp_per_cap_index|            features|
+------------------+-----------------+--------------------+
| 99.73519742258641|            100.0|      [100.0,1993.0]|
| 124.3592962939223| 124.252221438651|[474.312640498011...|
|108.79764880446149|133.9080661432981|[282.137298979768...|
|133.64055745965015|152.1291462079129|[387.497838492132...|
|139.17485094823215|163.7424819894679|[678.056372125194...|
+------------------+-----------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 39.6207
GBTRegressionModel (uid=GBTRegressor_e7916044ff68) with 20 trees
