In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 49.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=d7d90125698638aea8b1a7ecace8d551eeec75fc3ad867193d61651234ceebcd
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


# **Installation of library**

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# **Implement of spark**

In [5]:
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("LinearRegWithSpark")\
        .getOrCreate()

# **Upload dataset**

In [6]:
dataset = spark.read.csv("Admission_Prediction.csv",header=True)

dataset.show()

+---------+-----------+-----------------+----+----+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating| SOP| LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+----+----+----+--------+---------------+
|   337.00|     118.00|                4|4.50|4.50|9.65|    1.00|           0.92|
|   324.00|     107.00|                4|4.00|4.50|8.87|    1.00|           0.76|
|     null|     104.00|                3|3.00|3.50|8.00|    1.00|           0.72|
|   322.00|     110.00|                3|3.50|2.50|8.67|    1.00|           0.80|
|   314.00|     103.00|                2|2.00|3.00|8.21|    0.00|           0.65|
|   330.00|     115.00|                5|4.50|3.00|9.34|    1.00|           0.90|
|   321.00|     109.00|             null|3.00|4.00|8.20|    1.00|           0.75|
|   308.00|     101.00|                2|3.00|4.00|7.90|    0.00|           0.68|
|   302.00|     102.00|                1|2.00|1.50|8.00|    0.00|           0.50|
|   323.00|     

# **Print Scheme of dataset**

In [7]:
dataset.printSchema()

root
 |-- GRE Score: string (nullable = true)
 |-- TOEFL Score: string (nullable = true)
 |-- University Rating: string (nullable = true)
 |-- SOP: string (nullable = true)
 |-- LOR: string (nullable = true)
 |-- CGPA: string (nullable = true)
 |-- Research: string (nullable = true)
 |-- Chance of Admit: string (nullable = true)



# **Create all column in float value**

In [8]:
from pyspark.sql.functions import col
new_data = dataset.select(*(col(c).cast("float").alias(c) for c in dataset.columns))

new_data.printSchema()

root
 |-- GRE Score: float (nullable = true)
 |-- TOEFL Score: float (nullable = true)
 |-- University Rating: float (nullable = true)
 |-- SOP: float (nullable = true)
 |-- LOR: float (nullable = true)
 |-- CGPA: float (nullable = true)
 |-- Research: float (nullable = true)
 |-- Chance of Admit: float (nullable = true)



# **Check for missing values**

In [9]:
from pyspark.sql.functions import col, count, isnan, when

new_data.select([count(when(col(c).isNull(), c)).alias(c) for c in new_data.columns]).show()

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|       15|         10|               15|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



# **Select features for model**

In [10]:
from pyspark.ml.feature import Imputer
imputer = Imputer(inputCols=["GRE Score", "TOEFL Score","University Rating"],outputCols=["GRE Score", "TOEFL Score","University Rating"])
model = imputer.fit(new_data)

In [11]:
imputed_data = model.transform(new_data)

imputed_data.select([count(when(col(c).isNull(), c)).alias(c) for c in imputed_data.columns]).show()

features = imputed_data.drop('Chance of Admit')

+---------+-----------+-----------------+---+---+----+--------+---------------+
|GRE Score|TOEFL Score|University Rating|SOP|LOR|CGPA|Research|Chance of Admit|
+---------+-----------+-----------------+---+---+----+--------+---------------+
|        0|          0|                0|  0|  0|   0|       0|              0|
+---------+-----------+-----------------+---+---+----+--------+---------------+



# **let's assemble our features together using vectorAssembler**

In [12]:
assembler = VectorAssembler(
    inputCols=features.columns,
    outputCol="features")

output = assembler.transform(imputed_data)

output= output.select("features", "Chance of Admit")

In [13]:
output

DataFrame[features: vector, Chance of Admit: float]

# **Spliting the dataset into train and test**

In [14]:
train_df,test_df = output.randomSplit([0.7, 0.3])

train_df.show()
test_df.show()

+--------------------+---------------+
|            features|Chance of Admit|
+--------------------+---------------+
|[290.0,100.0,1.0,...|           0.47|
|[293.0,97.0,2.0,2...|           0.64|
|[294.0,93.0,1.0,1...|           0.46|
|[294.0,95.0,1.0,1...|           0.49|
|[295.0,93.0,1.0,2...|           0.46|
|[295.0,99.0,1.0,2...|           0.37|
|[295.0,99.0,2.0,2...|           0.57|
|[296.0,97.0,2.0,1...|           0.49|
|[296.0,99.0,2.0,2...|           0.61|
|[296.0,99.0,2.0,3...|           0.47|
|[297.0,96.0,2.0,2...|           0.43|
|[297.0,96.0,2.0,2...|           0.34|
|[297.0,100.0,1.0,...|           0.52|
|[297.0,101.0,3.0,...|           0.57|
|[298.0,97.0,3.121...|           0.45|
|[298.0,98.0,2.0,1...|           0.44|
|[298.0,98.0,2.0,4...|           0.34|
|[298.0,99.0,1.0,1...|           0.53|
|[298.0,100.0,3.0,...|           0.58|
|[298.0,101.0,4.0,...|           0.53|
+--------------------+---------------+
only showing top 20 rows

+--------------------+---------------+

# **Implementation of LinearRegression model**

In [15]:
lin_reg = LinearRegression(featuresCol = 'features', labelCol='Chance of Admit')
linear_model = lin_reg.fit(train_df)


print("Coefficients: " + str(linear_model.coefficients))
print("Intercept: " + str(linear_model.intercept))

Coefficients: [0.002163935333450153,0.00280665512871198,0.006503964758955885,-0.0020538574501859125,0.017607689933074024,0.12492246179102301,0.02959748201314739]
Intercept: -1.430375773248933


# **Evaluation of model using R-square and RMSE**

In [16]:
trainSummary = linear_model.summary
print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("r2: %f" % trainSummary.r2)

RMSE: 0.058038
r2: 0.843091


# **Predictions**

In [17]:
predictions = linear_model.transform(test_df)
predictions.select("prediction","Chance of Admit","features").show()


+-------------------+---------------+--------------------+
|         prediction|Chance of Admit|            features|
+-------------------+---------------+--------------------+
|0.48690654553223944|           0.45|[290.0,104.0,4.0,...|
| 0.4394974542917569|           0.47|[295.0,96.0,2.0,1...|
|  0.516436550233756|           0.69|[295.0,101.0,2.0,...|
| 0.4903558988654997|           0.44|[296.0,95.0,2.0,3...|
| 0.5072181294956302|            0.6|[296.0,101.0,1.0,...|
| 0.5062168705591794|           0.59|[297.0,98.0,2.0,2...|
| 0.5472974994190425|           0.54|[297.0,99.0,4.0,3...|
| 0.4946898708949492|           0.51|[298.0,92.0,1.0,2...|
| 0.5249822136842923|           0.54|[298.0,101.0,2.0,...|
| 0.5047345866412012|           0.46|[298.0,107.187751...|
| 0.4194551594011964|           0.42|[299.0,94.0,1.0,1...|
| 0.5498041643154739|           0.63|[299.0,100.0,3.0,...|
| 0.5921892714123218|           0.65|[300.0,97.0,2.0,3...|
| 0.5421507048401251|           0.61|[300.0,98.0,1.0,2..

# **R Squared on test data**

In [18]:
from pyspark.ml.evaluation import RegressionEvaluator
pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="Chance of Admit",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.74913


In [19]:
spark.stop()