# Linear Regression for 2015 data

In [None]:
# A SparkSession is the entry point to Spark SQL and has to exist before any
# DataFrame is created. Build one named 'WorldHealth', or reuse the session
# that is already active in this process.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('WorldHealth')
    .getOrCreate()
)

In [None]:
# Load WH_2015.csv into the DataFrame df1. The first row supplies the column
# names and Spark infers each column's type from the data.
df1 = spark.read.csv(
    'WH_2015.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print the column names and inferred types of df1.
df1.printSchema()

In [None]:
#Importing the module StringIndexer from subpackage ml.feature
from pyspark.ml.feature import StringIndexer

In [None]:
# Encode the string column 'Country' as a numeric index column 'Country_Lab'
# so it can be placed in the feature vector used by the linear regression.
indexer = StringIndexer(inputCol='Country', outputCol='Country_Lab')
indexer_model = indexer.fit(df1)
indexed = indexer_model.transform(df1)
# Preview the first 5 rows of the indexed DataFrame.
indexed.head(5)

In [None]:
#Importing the modules Vectors & VectorAssembler from subpackage ml.linalg & ml.feature
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names of the indexed DataFrame (used to pick the feature
# columns for the assembler).
indexed.columns

In [None]:
# Combine the predictor columns into a single vector column named 'features'.
# The label column 'Happiness Rank' and the 'Region' column are left out.
# NOTE(review): 'Happiness Score' presumably determines 'Happiness Rank'
# directly -- confirm it is meant to be used as a predictor.
feature_columns = [
    'Happiness Score',
    'Standard Error',
    'Economy (GDP per Capita)',
    'Family',
    'Health (Life Expectancy)',
    'Freedom',
    'Trust (Government Corruption)',
    'Generosity',
    'Dystopia Residual',
    'Country_Lab',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [None]:
# Run the assembler over the indexed DataFrame; the result, output, carries
# the assembled 'features' vector column.
output = assembler.transform(indexed)

In [None]:
# Keep only what the regression needs: the feature vector and the label.
final_data = output.select('features', 'Happiness Rank')

In [None]:
# Randomly partition the rows into roughly 70% training and 30% test data.
# No seed is passed, so the split differs from run to run.
train_data, test_data = final_data.randomSplit(weights=[0.7, 0.3])

In [None]:
#Importing the LinearRegression module from the regression subpackage
from pyspark.ml.regression import LinearRegression

In [None]:
# Create a LinearRegression estimator (not a DataFrame) whose label column is
# 'Happiness Rank'. All other parameters keep their defaults.
HR = LinearRegression(labelCol='Happiness Rank')

In [None]:
# Fit the model on the training split only, overriding the estimator's
# regularization parameter (regParam) with 100.0 for this fit.
# The param map has to be keyed by HR's own Param object (no `lr` estimator
# exists in this notebook), and test_data is kept out of the fit so that the
# evaluation on it is not done on rows the model was trained on.
trained_HR_model = HR.fit(train_data, {HR.regParam: 100.0})

In [None]:
# Evaluate (not fit) the trained model on test_data; HR_results holds the
# resulting summary, from which the metrics below are read.
HR_results = trained_HR_model.evaluate(test_data)

In [None]:
# Root mean squared error (RMSE) of the model on test_data.
HR_results.rootMeanSquaredError

In [None]:
# R^2 (coefficient of determination) of the model on test_data. This is a
# goodness-of-fit measure for regression, not a classification accuracy.
HR_results.r2

In [None]:
# Score every row of final_data (training and test rows alike) with the
# trained model and keep the result in predictionsA.
predictionsA = trained_HR_model.transform(final_data)
# display is not defined in this file; presumably the notebook environment
# provides it -- confirm.
display(predictionsA)
# Print the leading rows of predictionsA as a text table.
predictionsA.show()

# Linear Regression for 2016 data

In [None]:
# A SparkSession is the entry point to Spark SQL. Request one named
# 'WorldHealth1'; if a session is already active in this process,
# getOrCreate() returns that existing session instead of building a new one.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('WorldHealth1')
    .getOrCreate()
)

In [None]:
# Load WH_2016.csv into the DataFrame df2. The first row supplies the column
# names and Spark infers each column's type from the data.
df2 = spark.read.csv(
    'WH_2016.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print the column names and inferred types of df2.
df2.printSchema()

In [None]:
#Importing the module StringIndexer from subpackage ml.feature
from pyspark.ml.feature import StringIndexer

In [None]:
# Encode the string column 'Country' as a numeric index column 'Country_Lab'
# so it can be placed in the feature vector used by the linear regression.
indexer = StringIndexer(inputCol='Country', outputCol='Country_Lab')
indexer_model = indexer.fit(df2)
indexed = indexer_model.transform(df2)
# Preview the first 5 rows of the indexed DataFrame.
indexed.head(5)

In [None]:
#Importing the modules Vectors & VectorAssembler from subpackage ml.linalg & ml.feature
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names of the indexed 2016 DataFrame (used to pick the
# feature columns for the assembler).
indexed.columns

In [None]:
# Combine the predictor columns into a single vector column named 'features'.
# The label column 'Happiness Rank' is left out, and only columns that the
# 2015 data also has are used.
# NOTE(review): 'Happiness Score' presumably determines 'Happiness Rank'
# directly -- confirm it is meant to be used as a predictor.
feature_columns = [
    'Happiness Score',
    'Economy (GDP per Capita)',
    'Family',
    'Health (Life Expectancy)',
    'Freedom',
    'Trust (Government Corruption)',
    'Generosity',
    'Dystopia Residual',
    'Country_Lab',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [None]:
# Run the assembler over the indexed DataFrame; the result, output1, carries
# the assembled 'features' vector column.
output1 = assembler.transform(indexed)

In [None]:
# Keep only what the regression needs: the feature vector and the label.
final_data = output1.select('features', 'Happiness Rank')

In [None]:
# Randomly partition the rows into roughly 70% training and 30% test data.
# No seed is passed, so the split differs from run to run.
train_data, test_data = final_data.randomSplit(weights=[0.7, 0.3])

In [None]:
#Importing the LinearRegression module from the regression subpackage
from pyspark.ml.regression import LinearRegression

In [None]:
# Create a LinearRegression estimator (not a DataFrame) whose label column is
# 'Happiness Rank'. All other parameters keep their defaults.
HR = LinearRegression(labelCol='Happiness Rank')

In [None]:
# Fit the linear regression on the training split, using the estimator's
# parameters as configured above.
trained_HR_model = HR.fit(train_data)

In [None]:
# Evaluate (not fit) the trained model on test_data; HR_result_1 holds the
# resulting summary, from which the metrics below are read.
HR_result_1 = trained_HR_model.evaluate(test_data)

In [None]:
# Root mean squared error (RMSE) of the model on test_data.
HR_result_1.rootMeanSquaredError

In [None]:
# R^2 (coefficient of determination) of the model on test_data. This is a
# goodness-of-fit measure for regression, not a classification accuracy.
HR_result_1.r2

# Linear Regression for 2017 data

In [None]:
# A SparkSession is the entry point to Spark SQL. Request one named
# 'WorldHealth2'; if a session is already active in this process,
# getOrCreate() returns that existing session instead of building a new one.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('WorldHealth2')
    .getOrCreate()
)

In [None]:
# Load WH_2017.csv into the DataFrame df3. The first row supplies the column
# names and Spark infers each column's type from the data.
df3 = spark.read.csv(
    'WH_2017.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print the column names and inferred types of df3.
df3.printSchema()

In [None]:
#Importing the regularExpression module or function
import re

In [None]:
# Build df3_test, a copy of df3 whose column names contain no dots or
# whitespace: every run of '.' and/or whitespace characters in a name is
# collapsed to a single '_'. Dotted names are awkward to reference in PySpark,
# so later cells use the underscore forms (e.g. 'Happiness_Score').
renamed_columns = [re.sub(r'[\.\s]+', '_', column) for column in df3.columns]
df3_test = df3.toDF(*renamed_columns)

In [None]:
# Print the schema of df3_test to check the renamed columns.
df3_test.printSchema()

In [None]:
#Importing the module StringIndexer from subpackage ml.feature
from pyspark.ml.feature import StringIndexer

In [None]:
# Encode the string column 'Country' as a numeric index column 'Country_Lab'
# so it can be placed in the feature vector used by the linear regression.
indexer = StringIndexer(inputCol='Country', outputCol='Country_Lab')
indexer_model = indexer.fit(df3_test)
indexed = indexer_model.transform(df3_test)
# Preview the first 5 rows of the indexed DataFrame.
indexed.head(5)

In [None]:
#Importing the modules Vectors & VectorAssembler from subpackage ml.linalg & ml.feature
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
# List the column names of the indexed 2017 DataFrame (used to pick the
# feature columns for the assembler).
indexed.columns

In [None]:
# Combine the predictor columns into a single vector column named 'features'.
# The label column 'Happiness_Rank' is left out, and only columns that the
# 2015 data also has are used (under their underscore-renamed 2017 names).
# NOTE(review): 'Happiness_Score' presumably determines 'Happiness_Rank'
# directly -- confirm it is meant to be used as a predictor.
feature_columns = [
    'Happiness_Score',
    'Economy_GDP_per_Capita_',
    'Family',
    'Health_Life_Expectancy_',
    'Freedom',
    'Generosity',
    'Trust_Government_Corruption_',
    'Dystopia_Residual',
    'Country_Lab',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [None]:
# Run the assembler over the indexed DataFrame; the result, output2, carries
# the assembled 'features' vector column.
output2 = assembler.transform(indexed)

In [None]:
# Keep only what the regression needs: the feature vector and the label.
final = output2.select('features', 'Happiness_Rank')

In [None]:
# Randomly partition the rows into roughly 70% training and 30% test data.
# No seed is passed, so the split differs from run to run.
train_data, test_data = final.randomSplit(weights=[0.7, 0.3])

In [None]:
#Importing the LinearRegression module from the regression subpackage
from pyspark.ml.regression import LinearRegression

In [None]:
# Create a LinearRegression estimator (not a DataFrame) whose label column is
# 'Happiness_Rank'. All other parameters keep their defaults.
HR_model = LinearRegression(labelCol='Happiness_Rank')

In [None]:
# Fit the linear regression on the training split, using the estimator's
# parameters as configured above.
trained_model = HR_model.fit(train_data)

In [None]:
# Evaluate (not fit) the trained model on test_data; HR_result_2 holds the
# resulting summary, from which the metrics below are read.
HR_result_2 = trained_model.evaluate(test_data)

In [None]:
# Root mean squared error (RMSE) of the model on test_data.
HR_result_2.rootMeanSquaredError

In [None]:
# R^2 (coefficient of determination) of the model on test_data. This is a
# goodness-of-fit measure for regression, not a classification accuracy.
HR_result_2.r2