# Basic Understanding of Pyspark

In [None]:
import pyspark

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [None]:
# spark.version
spark

In [None]:
df_pyspark = spark.read.csv('/FileStore/tables/test1.csv', header=True, inferSchema=True)

In [None]:
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [None]:
type(df_pyspark)

Out[8]: pyspark.sql.dataframe.DataFrame

In [None]:
df_pyspark.columns

Out[9]: ['Name', 'age', 'Experience', 'Salary']

In [None]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



# Basic Functions:
1. Selecting Column and Indexing
2. Adding Columns
3. Dropping Columns
4. Renaming Columns


In [None]:
df_pyspark.head(3)

Out[12]: [Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', age=29, Experience=4, Salary=20000)]

In [None]:
df_pyspark.select('Name').show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [None]:
df_pyspark.select('Name')

Out[14]: DataFrame[Name: string]

In [None]:
df_pyspark.select('age').show()

+---+
|age|
+---+
| 31|
| 30|
| 29|
| 24|
| 21|
| 23|
+---+



In [None]:
df_pyspark.select('Experience').show()

+----------+
|Experience|
+----------+
|        10|
|         8|
|         4|
|         3|
|         1|
|         2|
+----------+



In [None]:
df_pyspark.select(['Name', 'age']).show()

+---------+---+
|     Name|age|
+---------+---+
|    Krish| 31|
|Sudhanshu| 30|
|    Sunny| 29|
|     Paul| 24|
|   Harsha| 21|
|  Shubham| 23|
+---------+---+



In [None]:
df_pyspark['Name']

Out[18]: Column<'Name'>

In [None]:
df_pyspark.dtypes

Out[19]: [('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [None]:
df_pyspark.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  null|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  null| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



In [None]:
df_pyspark.describe()

Out[21]: DataFrame[summary: string, Name: string, age: string, Experience: string, Salary: string]

In [None]:
# Adding Columns 
df_pyspark = df_pyspark.withColumn("After 2 Years", df_pyspark['Experience'] +  2)

In [None]:
df_pyspark.show()

+---------+---+----------+------+-------------+
|     Name|age|Experience|Salary|After 2 Years|
+---------+---+----------+------+-------------+
|    Krish| 31|        10| 30000|           12|
|Sudhanshu| 30|         8| 25000|           10|
|    Sunny| 29|         4| 20000|            6|
|     Paul| 24|         3| 20000|            5|
|   Harsha| 21|         1| 15000|            3|
|  Shubham| 23|         2| 18000|            4|
+---------+---+----------+------+-------------+



In [None]:
df_pyspark.select('After 2 Years').show()

+-------------+
|After 2 Years|
+-------------+
|           12|
|           10|
|            6|
|            5|
|            3|
|            4|
+-------------+



In [None]:
# Drop a Column
df_pyspark = df_pyspark.drop('After 2 Years')

In [None]:
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [None]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [None]:
df_pyspark =df_pyspark.withColumnRenamed('age', 'Age')

In [None]:
df_pyspark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



# Pyspark Handling Missing Values
1. Dropping Columns
2. Dropping Rows
3. Various Parameter in Dropping Functionalities
4. Handlling Missing Values by Mean/ Mode/ Median

In [None]:
df_pyspark1 = spark.read.csv('/FileStore/tables/missing_values.csv', header=True, inferSchema=True)

In [None]:
df_pyspark1.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [None]:
df_pyspark1.describe()

Out[35]: DataFrame[summary: string, Name: string, age: string, Experience: string, Salary: string]

In [None]:
df_pyspark1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [None]:
df_pyspark1_dummy = spark.read.csv('/FileStore/tables/missing_values.csv', header=True, inferSchema=True)

In [None]:
df_pyspark1_dummy.na.drop().show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [None]:
df_pyspark1_dummy.na.drop(how='any', thresh=2).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
+---------+----+----------+------+



In [None]:
df_pyspark1_dummy.na.fill('M V', ['Experience', 'Salary']).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|null|      null| 40000|
|     null|  34|        10| 38000|
|     null|  36|      null|  null|
+---------+----+----------+------+



In [None]:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=['age', 'Experience', 'Salary'], 
    outputCols=["{}_imputed".format(c) for c in ['age', 'Experience', 'Salary']]
    ).setStrategy("mean")

In [None]:
imputer.fit(df_pyspark1_dummy).transform(df_pyspark1_dummy).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|null|      null| 40000|         28|                 5|         40000|
|     null|  34|        10| 38000|         34|                10|         38000|
|     null|  36|      null|  null|         36|                 5|         25750|
+---------+----+----------+-

In [None]:
# The Imputer class in PySpark is a feature transformer used to handle missing values in a DataFrame.
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=['age', 'Experience', 'Salary'], 
    outputCols=["{}_imputed".format(c) for c in ['age', 'Experience', 'Salary']]
    ).setStrategy("median")

In [None]:
# df_pyspark1_dummy.show()

# Pyspark DataFrame Operations
 1. Filter Operations
 2. &, |, ==
 3. ~

In [None]:
df_pyspark.filter('Salary <= 20000').show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [None]:
df_pyspark.filter('Salary <= 20000').select(['Name', 'age']).show()

+-------+---+
|   Name|age|
+-------+---+
|  Sunny| 29|
|   Paul| 24|
| Harsha| 21|
|Shubham| 23|
+-------+---+



In [None]:
df_pyspark.filter((df_pyspark['Salary'] <= 20000) 
                | (df_pyspark['Salary'] >= 15000)).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [None]:
df_pyspark.filter(~(df_pyspark['Salary'] <= 20000)).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
+---------+---+----------+------+



In [None]:
# 1. Groupby
# 2. Aggregate

In [None]:
df_pyspark_gby = spark.read.csv('/FileStore/tables/grouby_learn.csv', header=True, inferSchema=True) 

In [None]:
df_pyspark_gby.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [None]:
df_pyspark_gby.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- salary: integer (nullable = true)



In [None]:
df_pyspark_gby.groupBy('Name')

Out[60]: <pyspark.sql.group.GroupedData at 0x7f98b094f5e0>

In [None]:
df_pyspark_gby.groupBy('Name').sum().show()

+---------+-----------+
|     Name|sum(salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



In [None]:
df_pyspark_gby.groupBy('Departments').sum().show()

+------------+-----------+
| Departments|sum(salary)|
+------------+-----------+
|         IOT|      15000|
|    Big Data|      15000|
|Data Science|      43000|
+------------+-----------+



In [None]:
df_pyspark_gby.groupBy('Departments').count().show()

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



In [None]:
df_pyspark_gby.agg({'Salary': 'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|      73000|
+-----------+



# Project in ML(Linear Regression)

In [None]:
df_ml = spark.read.csv('/FileStore/tables/tips_data.csv', header=True, inferSchema=True)

In [None]:
df_ml.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [None]:
df_ml.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [None]:
df_ml.columns

Out[72]: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [None]:
# Handling String Categorical Values/ Features

from pyspark.ml.feature import StringIndexer

In [None]:
indexer = StringIndexer(inputCols=['sex', 'smoker', 'day','time'], outputCols=['sex_indexed', 'smoker_indexed','day_indexed', 'time_indexed'])

df_ml_dummy = indexer.fit(df_ml).transform(df_ml)

df_ml_dummy.show()

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|           0.0|        1.0|         0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|


In [None]:
df_ml_dummy.select(['sex_indexed', 'day_indexed','time_indexed', 'smoker_indexed']).show()

+-----------+-----------+------------+--------------+
|sex_indexed|day_indexed|time_indexed|smoker_indexed|
+-----------+-----------+------------+--------------+
|        1.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|         0.0|           0.0|
|        1.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|         0.0|           0.0|
|        1.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|         0.0|           0.0|
|        1.0|        1.0|         0.0|           0.0|
|        0.0|        1.0|   

In [None]:
df_ml_dummy.select(['sex', 'sex_indexed']).show()

+------+-----------+
|   sex|sex_indexed|
+------+-----------+
|Female|        1.0|
|  Male|        0.0|
|  Male|        0.0|
|  Male|        0.0|
|Female|        1.0|
|  Male|        0.0|
|  Male|        0.0|
|  Male|        0.0|
|  Male|        0.0|
|  Male|        0.0|
|  Male|        0.0|
|Female|        1.0|
|  Male|        0.0|
|  Male|        0.0|
|Female|        1.0|
|  Male|        0.0|
|Female|        1.0|
|  Male|        0.0|
|Female|        1.0|
|  Male|        0.0|
+------+-----------+
only showing top 20 rows



In [None]:
df_ml_dummy.select(['smoker', 'smoker_indexed']).show()

+------+--------------+
|smoker|smoker_indexed|
+------+--------------+
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
|    No|           0.0|
+------+--------------+
only showing top 20 rows



In [None]:
df_ml_dummy.select(['time', 'time_indexed']).show()

+------+------------+
|  time|time_indexed|
+------+------------+
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
|Dinner|         0.0|
+------+------------+
only showing top 20 rows



In [None]:
df_ml_dummy.select(['day', 'day_indexed']).show()

+---+-----------+
|day|day_indexed|
+---+-----------+
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sun|        1.0|
|Sat|        0.0|
+---+-----------+
only showing top 20 rows



In [None]:
df_ml_dummy.show()

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|           0.0|        1.0|         0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|


In [None]:
df_ml_dummy.columns

Out[85]: ['total_bill',
 'tip',
 'sex',
 'smoker',
 'day',
 'time',
 'size',
 'sex_indexed',
 'smoker_indexed',
 'day_indexed',
 'time_indexed']

In [None]:
# The VectorAssembler class in PySpark is a feature transformer that combines multiple columns of a DataFrame into a single vector column.
from pyspark.ml.feature import VectorAssembler

featureassemblers = VectorAssembler(inputCols = ['tip', 'size','sex_indexed', 'smoker_indexed','day_indexed','time_indexed'], outputCol = "Independent Features")

output = featureassemblers.transform(df_ml_dummy)

In [None]:
# output.show()
output.select('Independent Features').show()

+--------------------+
|Independent Features|
+--------------------+
|[1.01,2.0,1.0,0.0...|
|[1.66,3.0,0.0,0.0...|
|[3.5,3.0,0.0,0.0,...|
|[3.31,2.0,0.0,0.0...|
|[3.61,4.0,1.0,0.0...|
|[4.71,4.0,0.0,0.0...|
|[2.0,2.0,0.0,0.0,...|
|[3.12,4.0,0.0,0.0...|
|[1.96,2.0,0.0,0.0...|
|[3.23,2.0,0.0,0.0...|
|[1.71,2.0,0.0,0.0...|
|[5.0,4.0,1.0,0.0,...|
|[1.57,2.0,0.0,0.0...|
|[3.0,4.0,0.0,0.0,...|
|[3.02,2.0,1.0,0.0...|
|[3.92,2.0,0.0,0.0...|
|[1.67,3.0,1.0,0.0...|
|[3.71,3.0,0.0,0.0...|
|[3.5,3.0,1.0,0.0,...|
|(6,[0,1],[3.35,3.0])|
+--------------------+
only showing top 20 rows



In [None]:
finalized_data = output.select("Independent Features", 'total_bill')

In [None]:
finalized_data.show()

+--------------------+----------+
|Independent Features|total_bill|
+--------------------+----------+
|[1.01,2.0,1.0,0.0...|     16.99|
|[1.66,3.0,0.0,0.0...|     10.34|
|[3.5,3.0,0.0,0.0,...|     21.01|
|[3.31,2.0,0.0,0.0...|     23.68|
|[3.61,4.0,1.0,0.0...|     24.59|
|[4.71,4.0,0.0,0.0...|     25.29|
|[2.0,2.0,0.0,0.0,...|      8.77|
|[3.12,4.0,0.0,0.0...|     26.88|
|[1.96,2.0,0.0,0.0...|     15.04|
|[3.23,2.0,0.0,0.0...|     14.78|
|[1.71,2.0,0.0,0.0...|     10.27|
|[5.0,4.0,1.0,0.0,...|     35.26|
|[1.57,2.0,0.0,0.0...|     15.42|
|[3.0,4.0,0.0,0.0,...|     18.43|
|[3.02,2.0,1.0,0.0...|     14.83|
|[3.92,2.0,0.0,0.0...|     21.58|
|[1.67,3.0,1.0,0.0...|     10.33|
|[3.71,3.0,0.0,0.0...|     16.29|
|[3.5,3.0,1.0,0.0,...|     16.97|
|(6,[0,1],[3.35,3.0])|     20.65|
+--------------------+----------+
only showing top 20 rows



In [None]:
from pyspark.ml.regression import LinearRegression

# Train-test Split

train_data, test_data = finalized_data.randomSplit([0.75, 0.25])

regressor = LinearRegression(featuresCol = 'Independent Features', labelCol = 'total_bill')

regressor= regressor.fit(train_data)

In [None]:
regressor.coefficients

Out[99]: DenseVector([3.1315, 3.3785, -0.6547, 1.9568, 0.1665, -1.3815])

In [None]:
regressor.intercept

Out[100]: 1.430044825895874

In [None]:
# Predictions
pred_res = regressor.evaluate(test_data)

In [None]:
pred_res.predictions.show()

+--------------------+----------+------------------+
|Independent Features|total_bill|        prediction|
+--------------------+----------+------------------+
|(6,[0,1],[1.75,2.0])|     17.82|13.667115180455099|
| (6,[0,1],[2.0,2.0])|     12.69|14.449990146602246|
| (6,[0,1],[4.3,2.0])|      21.7| 21.65243983515602|
|(6,[0,1],[6.73,4.0])|     48.27|  36.0189300976355|
|(6,[0,1],[7.58,4.0])|     39.42|  38.6807049825358|
|[1.17,2.0,0.0,1.0...|     32.83| 13.80762598142978|
|[1.32,2.0,0.0,0.0...|      9.68|12.487056262386965|
|[1.36,3.0,1.0,0.0...|     18.64|14.121111062595727|
|[1.5,2.0,0.0,1.0,...|     12.03|15.340479007858907|
|[1.56,2.0,0.0,0.0...|      9.94| 13.23861622988823|
|[1.66,3.0,0.0,0.0...|     10.34|16.930239012111677|
|[1.71,2.0,0.0,0.0...|     10.27|13.708341209576517|
|[1.8,2.0,1.0,0.0,...|     12.43|12.120498207250119|
|[1.92,1.0,0.0,1.0...|      8.58| 11.89574885648015|
|[2.0,2.0,0.0,0.0,...|     13.03|13.401474895270798|
|[2.0,2.0,0.0,1.0,...|     14.48|16.5732568927