<a href="https://www.kaggle.com/code/ankanhore545/linear-regression-using-pyspark?scriptVersionId=96907210" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Installing the PySpark package

In [1]:
!pip install PySpark

Collecting PySpark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: PySpark
  Building wheel for PySpark (setup.py) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done
[?25h  Created wheel for PySpark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=8a9d65a3adeecd688ba3692c948bf22fd91d09c0f4cd39fbfd673ba627518f84
  Stored in d

In [2]:
import pyspark

# Creating the PySpark session

In [3]:
from pyspark.sql import SparkSession

# Build the session for this notebook; getOrCreate() reuses an already-running
# session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName('CostofLiving')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/29 11:39:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# List the datasets mounted under the input directory
import os

input_entries = os.listdir("../input")
print(input_entries)

['cost-of-living-index-2022']


# Read the input file and check the columns

In [5]:
# Read the CSV using its first row as column names and let Spark infer the
# column types, then preview the first rows.
csv_path = "../input/cost-of-living-index-2022/Cost_of_Living_Index_2022.csv"
df_train = spark.read.csv(csv_path, header=True, inferSchema=True)
df_train.show()

                                                                                

+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+
|Rank|             Country|Cost of Living Index|Rent Index|Cost of Living Plus Rent Index|Groceries Index|Restaurant Price Index|Local Purchasing Power Index|
+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+
|   1|         Afghanistan|               20.37|      2.72|                         12.09|          14.92|                 12.41|                       23.04|
|   2|             Albania|                35.5|      8.47|                         22.83|          29.32|                 25.82|                       30.19|
|   3|             Algeria|               26.87|      4.59|                         16.43|          28.82|                 14.48|                       24.63|
|   4|           Argentina|               34.6

In [6]:
# Show each column's name, inferred type and nullability
df_train.printSchema()

root
 |-- Rank: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Cost of Living Index: double (nullable = true)
 |-- Rent Index: double (nullable = true)
 |-- Cost of Living Plus Rent Index: double (nullable = true)
 |-- Groceries Index: double (nullable = true)
 |-- Restaurant Price Index: double (nullable = true)
 |-- Local Purchasing Power Index: double (nullable = true)



In [7]:
# Column names as a plain Python list (handy for picking feature columns)
df_train.columns

['Rank',
 'Country',
 'Cost of Living Index',
 'Rent Index',
 'Cost of Living Plus Rent Index',
 'Groceries Index',
 'Restaurant Price Index',
 'Local Purchasing Power Index']

**[Rent Index, Cost of Living Plus Rent Index, Groceries Index, Restaurant Price Index, Local Purchasing Power Index] → assembled into one new vector column (`Independant Features`) that holds the independent features**

# Invoking VectorAssembler for grouping the required features

In [8]:
from pyspark.ml.feature import VectorAssembler

# Pack the predictor columns into a single vector column, which is the input
# format the regression stage below is given via featuresCol.
# NOTE(review): 'Cost of Living Plus Rent Index' looks like it is derived from
# the target ('Cost of Living Index') - the fitted coefficients sit almost
# entirely on it and 'Rent Index' - so this is probably target leakage.
# Confirm before trusting the near-zero error metrics.
featureassembler = VectorAssembler(
    inputCols=[
        'Rent Index',
        'Cost of Living Plus Rent Index',
        'Groceries Index',
        'Restaurant Price Index',
        'Local Purchasing Power Index',
    ],
    outputCol='Independant Features',
)

In [9]:
# Apply the assembler: returns df_train's columns plus the assembled vector column
output=featureassembler.transform(df_train)

In [10]:
# Preview the rows, now including the assembled feature vector
output.show()

+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+--------------------+
|Rank|             Country|Cost of Living Index|Rent Index|Cost of Living Plus Rent Index|Groceries Index|Restaurant Price Index|Local Purchasing Power Index|Independant Features|
+----+--------------------+--------------------+----------+------------------------------+---------------+----------------------+----------------------------+--------------------+
|   1|         Afghanistan|               20.37|      2.72|                         12.09|          14.92|                 12.41|                       23.04|[2.72,12.09,14.92...|
|   2|             Albania|                35.5|      8.47|                         22.83|          29.32|                 25.82|                       30.19|[8.47,22.83,29.32...|
|   3|             Algeria|               26.87|      4.59|                         16.43|          

In [11]:
# Confirm the assembled vector column was appended after the original columns
output.columns

['Rank',
 'Country',
 'Cost of Living Index',
 'Rent Index',
 'Cost of Living Plus Rent Index',
 'Groceries Index',
 'Restaurant Price Index',
 'Local Purchasing Power Index',
 'Independant Features']

# Create the final output with the desired target variable

In [12]:
# Keep only what the model needs: the feature vector and the target column
fin_output= output.select("Independant Features", "Cost of Living Index")

In [13]:
# Preview the (features, target) pairs that will be split and fed to the model
fin_output.show()

+--------------------+--------------------+
|Independant Features|Cost of Living Index|
+--------------------+--------------------+
|[2.72,12.09,14.92...|               20.37|
|[8.47,22.83,29.32...|                35.5|
|[4.59,16.43,28.82...|               26.87|
|[7.71,22.04,28.17...|               34.69|
|[11.61,23.45,27.5...|               33.89|
|[36.84,58.57,77.4...|               77.75|
|[27.13,50.46,65.8...|               71.04|
|[7.86,19.48,26.57...|               29.73|
|[35.34,61.19,70.5...|                84.0|
|[29.22,42.79,44.5...|               54.77|
|[4.42,19.67,30.41...|               33.13|
|[21.99,59.38,87.8...|               92.37|
|[9.81,21.01,27.24...|               30.89|
|[25.79,50.67,63.3...|               72.61|
|[11.64,32.71,48.7...|                51.3|
|[98.58,123.8,148....|              146.04|
|[10.18,23.24,31.2...|               34.77|
|[6.82,22.39,31.14...|               36.12|
|[10.21,26.12,35.1...|               40.17|
|[8.27,21.54,28.16...|          

# Baseline Model Training using Linear Regression

In [14]:
from pyspark.ml.regression import LinearRegression

# 80/20 train/test split. The seed is fixed so the split - and therefore the
# coefficients and error metrics below - are the same on every
# Restart & Run All instead of changing with each execution.
train_X, test_X = fin_output.randomSplit([0.8, 0.2], seed=42)

# Keep the unfitted estimator and the fitted model under different names;
# `reg` is the fitted model that the following cells inspect and evaluate.
lin_reg = LinearRegression(featuresCol='Independant Features', labelCol='Cost of Living Index')
reg = lin_reg.fit(train_X)

22/05/29 11:40:04 WARN Instrumentation: [7b69bb84] regParam is zero, which might cause numerical instability and overfitting.


In [15]:
# Fitted weights of the linear model, one per entry of the feature vector
reg.coefficients

DenseVector([-0.8824, 1.8827, -0.0001, -0.0001, 0.0])

In [16]:
# Fitted intercept (bias) term of the linear model
reg.intercept

-1.517951100909274e-05

# Model Evaluation

In [17]:
# Score the held-out split; the result is a summary object whose predictions
# and error metrics are read in the cells below.
pred=reg.evaluate(test_X)

In [18]:
# Compare actual target values with the model's predictions on the test split
pred.predictions.show()



+--------------------+--------------------+------------------+
|Independant Features|Cost of Living Index|        prediction|
+--------------------+--------------------+------------------+
|[3.91,12.42,17.81...|               19.92|19.929475066835998|
|[5.16,20.79,39.39...|               34.58| 34.58082553906267|
|[5.32,17.3,27.36,...|               27.87| 27.87175210554195|
|[8.27,21.54,28.16...|               33.24| 33.24967688497975|
|[8.47,22.83,29.32...|                35.5| 35.50173274046357|
|[8.7,20.2,24.96,2...|               30.35| 30.34767812849746|
|[8.73,22.11,31.27...|               33.92| 33.91621097712515|
|[9.07,24.64,34.02...|               38.38|38.379065582692576|
|[9.19,22.1,31.0,2...|                33.5| 33.49130932068134|
|[9.44,19.66,25.35...|               28.68| 28.67791681263471|
|[9.82,23.65,38.29...|               35.85|35.854186503929725|
|[10.04,20.75,25.4...|                30.2|30.200780838352383|
|[11.22,22.54,30.7...|               32.53| 32.52943605

In [19]:
# Test-set error as a (MAE, MSE) tuple
pred.meanAbsoluteError, pred.meanSquaredError

(0.004824929998478074, 3.695125038346004e-05)