<a href="https://colab.research.google.com/github/Ashish-Soni08/100-Days-Of-Code/blob/main/Day_6_Admission_prediction_with_pyspark_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TASK 1 : Install Dependencies & Run a SparkSession


In [None]:
#install pyspark

! pip install pyspark --quiet

[K     |████████████████████████████████| 212.4 MB 66 kB/s 
[K     |████████████████████████████████| 198 kB 52.6 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
# Start (or reuse) the Spark session that every later cell relies on.

from pyspark.sql import SparkSession

# SparkSession is the single entry point to Spark's DataFrame and Dataset APIs.
# `builder` configures the session; `getOrCreate()` returns the session that is
# already running, or starts a new one if none exists.
session_builder = SparkSession.builder.appName("spark")
spark = session_builder.getOrCreate()

# TASK 2 : Clone & Explore dataset

In [None]:
# Clone the admission dataset repository into the current working directory
# (git creates an `admission_dataset/` folder there).
! git clone https://github.com/education454/admission_dataset

Cloning into 'admission_dataset'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (3/3), done.


In [None]:
# List the working directory to confirm the clone created `admission_dataset`.

!ls

admission_dataset  sample_data


In [None]:
# Load the CSV into a Spark DataFrame. Unlike pandas DataFrames, Spark
# DataFrames are immutable: every transformation returns a new DataFrame.
#
# The path is resolved from the current working directory (where the dataset
# was cloned) instead of being hard-coded to Colab's `/content`, so the
# notebook also runs outside Colab. On Colab it resolves to the same file.

from pathlib import Path

DATASET_CSV = Path('admission_dataset') / 'Admission_Predict_Ver1.1.csv'

# header=True: the first line holds column names.
# inferSchema=True: let Spark detect numeric column types instead of strings.
df = spark.read.csv(str(DATASET_CSV.resolve()), header=True, inferSchema=True)


In [None]:
# Preview the first 20 rows (the default for `show`) to sanity-check the load.

df.show(n=20)

+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|Serial No|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+---------+-----------+-----------------+---+---+----+--------+---------------+
|        1|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|        2|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|        3|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|        4|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|        5|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|        6|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|        7|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|        8|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|        9

In [None]:
# Report the dataset's dimensions: number of rows and number of columns.

print('Number of rows: ', df.count())
print('Number of columns: ', len(df.columns))

Nunber of rows:  500
Number of columns:  9


In [None]:
# Print each column's name, inferred type and nullability.

df.printSchema()

root
 |-- Serial No: integer (nullable = true)
 |-- GRE Score: integer (nullable = true)
 |-- TOEFL Score: integer (nullable = true)
 |-- University Rating: integer (nullable = true)
 |-- SOP: double (nullable = true)
 |-- LOR: double (nullable = true)
 |-- CGPA: double (nullable = true)
 |-- Research: integer (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [None]:
# Summary statistics for every column, computed as a new DataFrame
# and then displayed.

summary_stats = df.describe()
summary_stats.show()

+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|        Serial No|         GRE Score|      TOEFL Score|University Rating|               SOP|               LOR|              CGPA|          Research|    Chance of Admit|
+-------+-----------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|              500|               500|              500|              500|               500|               500|               500|               500|                500|
|   mean|            250.5|           316.472|          107.192|            3.114|             3.374|             3.484| 8.576440000000003|              0.56| 0.7217399999999996|
| stddev|144.4818327679989|11.295148372354712|6.081867659564538|1.143511800759815|0.9910036207566072|0.92

# TASK 3 : Data Cleaning

In [None]:
# 'Serial No' appears to be just a running row number, so it is removed
# before analysis and modelling.

unused_column = 'Serial No'
df = df.drop(unused_column)

In [None]:
# Preview the first 20 rows again to confirm 'Serial No' is gone.

df.show(n=20)

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75|
|      308|        101|                2|3.0|4.0| 7.9|       0|           0.68|
|      302|        102|                1|2.0|1.5| 8.0|       0|            0.5|
|      323|        108|                3

In [None]:
# Count the rows with a missing value in each column; any non-zero count
# would need handling before modelling.

for column_name in df.columns:
    missing_rows = df.filter(df[column_name].isNull())
    print(column_name + ":", missing_rows.count())

GRE Score: 0
TOEFL Score: 0
University Rating: 0
SOP: 0
LOR: 0
CGPA: 0
Research: 0
Chance of Admit: 0


# TASK 4 : Correlation Analysis & Feature Selection

In [None]:
# Correlation of every column with the target 'Chance of Admit'.
# The target is itself in df.columns, so the loop also prints its
# self-correlation.

for feature in df.columns:
    corr_value = df.stat.corr('Chance of Admit', feature)
    print(f'Correlation to Chance of Admit variable for {feature} is {corr_value}')

Correlation to Chance of Admit variable for GRE Score is 0.8103506354632598
Correlation to Chance of Admit variable for TOEFL Score is 0.7922276143050823
Correlation to Chance of Admit variable for University Rating is 0.6901323687886892
Correlation to Chance of Admit variable for SOP is 0.6841365241316723
Correlation to Chance of Admit variable for LOR is 0.6453645135280112
Correlation to Chance of Admit variable for CGPA is 0.882412574904574
Correlation to Chance of Admit variable for Research is 0.5458710294711379
Correlation to Chance of Admit variable for Chance of Admit is 1.0


In [None]:
# Feature selection: use the three columns that correlate most strongly with
# the target in the analysis above (CGPA, GRE Score, TOEFL Score).

from pyspark.ml.feature import VectorAssembler

feature_columns = ['GRE Score', 'TOEFL Score', 'CGPA']

# VectorAssembler packs the chosen columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [None]:
# Add the assembled 'features' vector column to the cleaned data, then
# preview the first 20 rows of the result.

output_data = assembler.transform(df)
output_data.show(n=20)

+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|          features|
+---------+-----------+-----------------+---+---+----+--------+---------------+------------------+
|      337|        118|                4|4.5|4.5|9.65|       1|           0.92|[337.0,118.0,9.65]|
|      324|        107|                4|4.0|4.5|8.87|       1|           0.76|[324.0,107.0,8.87]|
|      316|        104|                3|3.0|3.5| 8.0|       1|           0.72| [316.0,104.0,8.0]|
|      322|        110|                3|3.5|2.5|8.67|       1|            0.8|[322.0,110.0,8.67]|
|      314|        103|                2|2.0|3.0|8.21|       0|           0.65|[314.0,103.0,8.21]|
|      330|        115|                5|4.5|3.0|9.34|       1|            0.9|[330.0,115.0,9.34]|
|      321|        109|                3|3.0|4.0| 8.2|       1|           0.75| [321.0,109.0,8.2]|
|      308

# TASK 5 : Build the Linear Regression Model

In [None]:
# Import the regressor and reduce the data to the two columns it needs.

from pyspark.ml.regression import LinearRegression

# Only the feature vector and the label are used for training.
model_columns = ['features', 'Chance of Admit']
final_data = output_data.select(*model_columns)

In [None]:
# Confirm the modelling data holds only the feature vector and the label.

final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Chance of Admit: double (nullable = true)



In [None]:
# Split the data into roughly 70% training and 30% testing rows.
# randomSplit assigns rows at random, so a fixed seed is passed: without it
# the split, the fitted coefficients and every reported metric would change
# on each run of the notebook.

SPLIT_SEED = 42
train, test = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [None]:
# Configure the linear-regression estimator, then fit it on the training
# split only; the test split stays untouched for evaluation.

models = LinearRegression(
    featuresCol='features',
    labelCol='Chance of Admit',
)
model = models.fit(train)

In [None]:
# Show the fitted parameters: one coefficient per feature (in the order
# GRE Score, TOEFL Score, CGPA given to the assembler) plus the intercept.

fitted_coefficients, fitted_intercept = model.coefficients, model.intercept
print("Coefficients:", fitted_coefficients)
print("intercept", fitted_intercept)

Coefficients: [0.0023968651760626113,0.0025979717433670985,0.15253708673623745]
intercept -1.6244508083871094


In [None]:
# Keep a handle on the fitted model's summary object; the RMSE and r2 printed
# in the next cell are read from it.
# NOTE(review): these metrics presumably describe the data passed to fit()
# (the training split), not the held-out test split — confirm.


summary = model.summary

In [None]:
# Report the goodness-of-fit metrics held by the model summary.

training_rmse = summary.rootMeanSquaredError
training_r2 = summary.r2

print('RMSE', training_rmse)      # lower is better
print('r2 score', training_r2)    # higher is better: how closely the data fit the regression line

RMSE 0.06192123878107236
r2 score 0.8074786792301893


# TASK 6 : Evaluate & Save the Model

In [None]:
# Apply the fitted model to the held-out test split; the result keeps the
# input columns and adds a 'prediction' column.

predictions = model.transform(test)

In [None]:
# Compare actual and predicted admission chances for the first 15 test rows.

predictions.show(n=15)

+------------------+---------------+------------------+
|          features|Chance of Admit|        prediction|
+------------------+---------------+------------------+
|  [293.0,97.0,7.8]|           0.64|0.5196232238484966|
| [295.0,99.0,7.65]|           0.57|0.5067323346769201|
|[295.0,101.0,7.86]|           0.69|0.5439610663782644|
| [296.0,95.0,7.54]|           0.44|0.4819582333385284|
|  [296.0,97.0,7.8]|           0.49|0.5268138193766843|
|[296.0,101.0,7.68]|            0.6| 0.518901255941804|
| [297.0,100.0,7.9]|           0.52|0.5522583084564718|
| [298.0,92.0,7.88]|           0.51| 0.530820657950873|
| [298.0,98.0,8.03]|           0.34|0.5692890514215112|
|[298.0,105.0,8.54]|           0.69|0.6652687678605618|
| [299.0,96.0,7.86]|           0.54|0.5405586683656793|
|[299.0,100.0,7.88]|           0.51|0.5540012970738724|
|[299.0,100.0,8.02]|           0.63|0.5753564892169456|
| [299.0,106.0,8.4]|           0.64|0.6489084126369182|
| [300.0,95.0,8.22]|           0.62|  0.59527091

In [None]:
# Score the predictions on the held-out test split with r2, to compare
# against the r2 reported by the model summary.

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    labelCol='Chance of Admit',
    predictionCol='prediction',
    metricName='r2',
)

test_r2 = evaluator.evaluate(predictions)
print('r2 on test data', test_r2)

r2 on test data 0.7929076752527676


In [None]:
# Persist the fitted model to the 'model' directory.
# A plain `model.save("model")` raises if the path already exists, which
# breaks a second run of the notebook; going through the writer with
# overwrite() replaces any previously saved copy instead.

model.write().overwrite().save("model")

In [None]:
# Read the persisted model back from disk to check that the save round-trips.
# NOTE(review): this rebinds `model` to the reloaded copy; presumably a
# reloaded model carries no training summary — confirm before re-running the
# summary cells above without refitting.

from pyspark.ml.regression import LinearRegressionModel

saved_model_path = 'model'
model = LinearRegressionModel.load(saved_model_path)