### Importing packages and starting spark context

In [2]:
# import packages
import findspark
import os
import pickle
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col , column
from pyspark import SparkContext

ModuleNotFoundError: No module named 'findspark'

In [82]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model

In [3]:
spark = SparkSession.builder \
     .appName("model-data") \
     .getOrCreate()
sc = spark.sparkContext

### Importing data and splitting into train/test sets

In [107]:
data_pandas = pickle.load(open('merged_data.pkl','rb'))
data_pandas.columns = [x.replace('.','_') for x in data_pandas.columns]
#data_pandas = data_pandas[data_pandas['Year'] == 2013]

In [108]:
sqlCtx = pyspark.SQLContext(sc)

In [109]:
data = sqlCtx.createDataFrame(data_pandas).rdd

In [110]:
data.first()

Row(Country='Albania', Year=1994, Max_Partners=75, GDP_per_unit_CO2=5.542105, PPP_Conv_Rate=26.714, PPP_Share_GDP=0.023, Imports_PC=0.0, Exports_PC=0.0, Govt_Revenue=-6.4239999999999995, gdp_per_cap=1493.7902013673001, agri_perc_gdp=36.4107030664902, agg_empl_agri_perc=54.840999603271506, rural_pop_perc=61.646, pop_tot=3207536.0, mobilesub_per100peeps=0.0, intl_tourist_arrival=1062000.0, total_life_exp=71.992, life_expectancy_fe=75.158, life_exp_male=69.15, trade_perGDP=53.1025847356112, ISO_A3='ALB', cam='F121994', mean_light=0.8159073253210255, mean_light_diff=0.5516382580084997)

In [111]:
# Split the data into training and test sets (30% held out for testing)
df_train, df_test = data.randomSplit([0.75,0.25], 5242)

### Model 1
Linear regression with many predictors

In [None]:
vecAssembler = VectorAssembler(inputCols=["GDP_per_unit_CO2", 
                                          "Imports_PC", 
                                          "Exports_PC", 
                                          "Govt_Revenue",
                                         "agri_perc_gdp",
                                         "mean_light",
                                         "intl_tourist_arrival",
                                         "mobilesub_per100peeps",
                                         "total_life_exp",
                                         "pop_tot",
                                         "Year"], 
                               outputCol="features")

In [None]:
# Train a linear regression model.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, lr])
                            
# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)
                            
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

### Model 2
Random Forest with many predictors

In [117]:
vecAssembler = VectorAssembler(inputCols=["GDP_per_unit_CO2", 
                                          "Imports_PC", 
                                          "Exports_PC", 
                                          "Govt_Revenue",
                                         "agri_perc_gdp",
                                         "mean_light",
                                         "intl_tourist_arrival",
                                         "mobilesub_per100peeps",
                                         "total_life_exp",
                                         "pop_tot",
                                         "Year"], 
                               outputCol="features")

In [113]:
# Train a RandomForest model.
rf = RandomForestRegressor(labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

+------------------+------------------+--------------------+
|        prediction|       gdp_per_cap|            features|
+------------------+------------------+--------------------+
|2588.4499932749086|1493.7902013673001|[5.542105,0.0,0.0...|
| 2548.649267842038|1869.8712552067498|[7.47721,-4.94372...|
|2648.4344488518745|  2085.43199967837|[5.217429,38.743,...|
|2729.5459408893375|  2244.63109245948|[5.288641,-3.432,...|
|2729.5459408893375|  2453.63147595272|[5.476603,23.272,...|
+------------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 5683.55
RandomForestRegressionModel (uid=RandomForestRegressor_b5b92c53b518) with 20 trees


### Model 3
Linear regression with only mean light and year as predictors

In [114]:
vecAssembler = VectorAssembler(inputCols=["mean_light","Year"], 
                               outputCol="features")

In [118]:
# Train a linear regression model.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, lr])
                            
# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)
                            
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

+-----------------+------------------+--------------------+
|       prediction|       gdp_per_cap|            features|
+-----------------+------------------+--------------------+
| 8565.95065776445|1493.7902013673001|[5.542105,0.0,0.0...|
|8441.562879480422|1869.8712552067498|[7.47721,-4.94372...|
|6146.184772293549|  2085.43199967837|[5.217429,38.743,...|
|6343.281217123382|  2244.63109245948|[5.288641,-3.432,...|
|8151.897690195125|  2453.63147595272|[5.476603,23.272,...|
+-----------------+------------------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 13691.5
LinearRegression_0f4738ab2066


In [116]:
trainingSummary = model.summary

AttributeError: 'PipelineModel' object has no attribute 'summary'

### Model 4
Random Forest with only mean light and year as predictors

In [None]:
vecAssembler = VectorAssembler(inputCols=["mean_light","Year"], 
                               outputCol="features")

In [None]:
# Train a RandomForest model.
rf = RandomForestRegressor(labelCol="gdp_per_cap", featuresCol="features")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[vecAssembler, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(df_train.toDF())

# Make predictions.
predictions = model.transform(df_test.toDF())

# Select example rows to display.
predictions.select("prediction", "gdp_per_cap", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="gdp_per_cap", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only