In [1]:
#IMPORTING ALL NECESSARY LIBRARIES

import os

import pandas as pd
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session that every later cell relies on.

spark_session = (
    SparkSession.builder
    .appName('SparkML Practice')
    .getOrCreate()
)

In [3]:
# Display the session object to confirm it was created.
spark_session

In [35]:
# Read the dataset into a Spark DataFrame.
# The CSV location can be overridden with the SPARK_DATASET_PATH environment
# variable so the notebook also runs on machines without the original D:
# drive layout; when the variable is unset the original path is used.

DATASET_PATH = os.environ.get("SPARK_DATASET_PATH", "D:\\Datasets\\Spark_dataset.csv")

# header=True takes the column names from the first row; inferSchema=True lets
# Spark infer column types instead of reading every column as a string.
data = spark_session.read.csv(DATASET_PATH, header=True, inferSchema=True)

In [36]:
# Echo the DataFrame's repr to see the column names and inferred types.
data

DataFrame[Name: string, age: int, Experience: int, Salary: int]

In [37]:
# Print the DataFrame's rows as a text table.
data.show()

+--------+---+----------+------+
|    Name|age|Experience|Salary|
+--------+---+----------+------+
|  Hitesh| 27|         5| 30000|
|  Mahesh| 30|         8|250000|
|   Rohit| 29|         4| 20000|
|  Rakesh| 24|         3|200000|
|Harshali| 21|         1| 15000|
|  Khushi| 23|         2|180000|
|   Shyam| 26|         7| 90000|
|  Meghna| 23|         5| 50000|
|   Pooja| 46|         8|100000|
+--------+---+----------+------+



In [38]:
# Display the schema of the DataFrame (column names, types and nullability).

data.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [39]:
# Display only the column names of the DataFrame.

data.columns

['Name', 'age', 'Experience', 'Salary']

In [40]:
# Combine the independent features ('age' and 'Experience') into a single
# vector column using VectorAssembler.

from pyspark.ml.feature import VectorAssembler

feature_assembler = VectorAssembler(
    inputCols=['age', 'Experience'],
    outputCol='Independent Feature',
)

In [41]:
# Apply the assembler with transform(). The result, which carries the extra
# 'Independent Feature' vector column, is bound to a new variable (`output`);
# `data` is not reassigned by this cell.

output = feature_assembler.transform(data)

In [42]:
# Show the transformed DataFrame, including the new vector column.
output.show()

+--------+---+----------+------+-------------------+
|    Name|age|Experience|Salary|Independent Feature|
+--------+---+----------+------+-------------------+
|  Hitesh| 27|         5| 30000|         [27.0,5.0]|
|  Mahesh| 30|         8|250000|         [30.0,8.0]|
|   Rohit| 29|         4| 20000|         [29.0,4.0]|
|  Rakesh| 24|         3|200000|         [24.0,3.0]|
|Harshali| 21|         1| 15000|         [21.0,1.0]|
|  Khushi| 23|         2|180000|         [23.0,2.0]|
|   Shyam| 26|         7| 90000|         [26.0,7.0]|
|  Meghna| 23|         5| 50000|         [23.0,5.0]|
|   Pooja| 46|         8|100000|         [46.0,8.0]|
+--------+---+----------+------+-------------------+



In [43]:
# Check what kind of object transform() returned.
type(output)

pyspark.sql.dataframe.DataFrame

In [44]:
# Column names of the output DataFrame, now including 'Independent Feature'.
output.columns

['Name', 'age', 'Experience', 'Salary', 'Independent Feature']

In [45]:
# Keep only what the model needs: the assembled feature vector (X) and the
# Salary column (y).

training = output.select(
    'Independent Feature',  # X: independent features
    'Salary',               # y: dependent feature / label
)

In [46]:
# Preview the two-column DataFrame (feature vector and Salary label).
training.show()

+-------------------+------+
|Independent Feature|Salary|
+-------------------+------+
|         [27.0,5.0]| 30000|
|         [30.0,8.0]|250000|
|         [29.0,4.0]| 20000|
|         [24.0,3.0]|200000|
|         [21.0,1.0]| 15000|
|         [23.0,2.0]|180000|
|         [26.0,7.0]| 90000|
|         [23.0,5.0]| 50000|
|         [46.0,8.0]|100000|
+-------------------+------+



In [47]:
# Split the assembled data into training and test sets.
# LinearRegression is imported here and used by the model-fitting cell.
from pyspark.ml.regression import LinearRegression

# randomSplit assigns each row to a split at random; the weights
# (75% train / 25% test) act as proportions to aim for, not exact sizes, so
# on a very small dataset the actual split sizes can differ from 75/25.
# A fixed seed makes the split repeatable on the same data; without it every
# run trains and evaluates on different rows.
SPLIT_SEED = 42
train_data, test_data = training.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

In [48]:
# Fit a linear regression model on the training split.

# Keep the unfitted estimator under its own name; `regressor` holds only the
# fitted model, which is what the following cells read the coefficients,
# intercept and evaluation results from.
lr_estimator = LinearRegression(featuresCol="Independent Feature", labelCol="Salary")
regressor = lr_estimator.fit(train_data)

In [49]:
# Fitted coefficients: one per entry of the assembled feature vector
# (age, Experience).
regressor.coefficients

DenseVector([-1023.969, 8912.5837])

In [50]:
# Intercept term of the fitted linear model.
regressor.intercept

99085.30137469135

In [51]:
# Evaluate the fitted model on the held-out test data.
# evaluate() returns a summary object rather than a bare predictions
# DataFrame; the per-row predictions are available as `pred.predictions`.

pred = regressor.evaluate(test_data)

In [52]:
# Show the test rows together with the model's predicted Salary.
pred.predictions.show()

+-------------------+------+------------------+
|Independent Feature|Salary|        prediction|
+-------------------+------+------------------+
|         [29.0,4.0]| 20000|105040.53577722954|
+-------------------+------+------------------+

