In [1]:
import findspark
findspark.init()  # run before importing pyspark, as in the original cell order
import pyspark

# Use getOrCreate() instead of the SparkContext constructor: constructing a second
# context in the same kernel (e.g. when this cell is re-run) raises
# "Cannot run multiple SparkContexts at once", whereas getOrCreate() reuses the
# context that is already running.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName('Used Cars'))

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as psf

# Session handle used for every DataFrame operation in this notebook.
spark = (
    SparkSession.builder
    .appName("Used Car Capstone")
    .getOrCreate()
)

In [80]:
# Read the used-car CSV: first line is the header, column types are inferred.
usedCarDF = (
    spark.read
    .format("csv")
    .option("sep", ",")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("result_final.csv")
)

In [81]:
# Quick look at the model name and gearbox columns (first 20 rows, long values truncated).
usedCarDF.select(["NAME", "TRANSMISSION"]).show(n=20, truncate=True)

+--------------------+------------+
|                NAME|TRANSMISSION|
+--------------------+------------+
|Maruti Swift Dzir...|      Manual|
|Skoda Rapid 1.5 T...|      Manual|
|Honda City 2017-2...|      Manual|
|Hyundai i20 Sport...|      Manual|
|Maruti Swift VXI ...|      Manual|
|Hyundai Xcent 1.2...|      Manual|
|Maruti Wagon R LX...|      Manual|
|  Maruti 800 DX BSII|      Manual|
|    Toyota Etios VXD|      Manual|
|Ford Figo Diesel ...|      Manual|
|Renault Duster 11...|      Manual|
|       Maruti Zen LX|      Manual|
|Maruti Swift Dzir...|      Manual|
|Maruti Wagon R LX...|      Manual|
|Mahindra KUV 100 ...|      Manual|
|Maruti Ertiga SHV...|      Manual|
|Hyundai i20 1.4 C...|      Manual|
|      Maruti Alto LX|      Manual|
|Hyundai i20 2015-...|      Manual|
|Mahindra Verito 1...|      Manual|
+--------------------+------------+
only showing top 20 rows



In [82]:
# Correlation between selling price and model year over the whole data set.
usedCarDF.stat.corr("SELLING_PRICE", "YEAR")

0.4123015581711755

In [83]:
# Covariance between selling price and model year over the whole data set.
usedCarDF.stat.cov("SELLING_PRICE", "YEAR")

1296043.5082286762

In [84]:
# Print the physical plan Spark will use to materialise usedCarDF (a CSV file scan at this point).
usedCarDF.explain()

== Physical Plan ==
FileScan csv [SALES_ID#1680,NAME#1681,YEAR#1682,SELLING_PRICE#1683,KM_DRIVEN#1684,CITY_CODE#1685,STATE_CODE#1686,POSTAL_CODE#1687,FUEL#1688,SELLER_TYPE#1689,TRANSMISSION#1690,OWNER#1691,MILEAGE#1692,ENGINE#1693,MAX_POWER#1694,SEATS#1695,MILEAGE_UNIT#1696,MILEAGE_NO#1697,STATE_NAME#1698,CITY_NAME#1699,SOLD#1700,REGION#1701] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/C:/Users/avi.koyani/OneDrive/Cape_Stone/Jupyter/result_final.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<SALES_ID:int,NAME:string,YEAR:int,SELLING_PRICE:int,KM_DRIVEN:int,CITY_CODE:string,STATE_C...




In [85]:
# Show the inferred column names and types of the freshly loaded data.
usedCarDF.printSchema()

root
 |-- SALES_ID: integer (nullable = true)
 |-- NAME: string (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- SELLING_PRICE: integer (nullable = true)
 |-- KM_DRIVEN: integer (nullable = true)
 |-- CITY_CODE: string (nullable = true)
 |-- STATE_CODE: string (nullable = true)
 |-- POSTAL_CODE: integer (nullable = true)
 |-- FUEL: string (nullable = true)
 |-- SELLER_TYPE: string (nullable = true)
 |-- TRANSMISSION: string (nullable = true)
 |-- OWNER: string (nullable = true)
 |-- MILEAGE: string (nullable = true)
 |-- ENGINE: string (nullable = true)
 |-- MAX_POWER: string (nullable = true)
 |-- SEATS: integer (nullable = true)
 |-- MILEAGE_UNIT: string (nullable = true)
 |-- MILEAGE_NO: double (nullable = true)
 |-- STATE_NAME: string (nullable = true)
 |-- CITY_NAME: string (nullable = true)
 |-- SOLD: string (nullable = true)
 |-- REGION: string (nullable = true)



In [86]:
# Column expression that splits the text of the Max_Power column on single spaces.
Split_Power = psf.split(usedCarDF["Max_Power"]," ")

In [87]:
# Replace the power column with the first space-separated token of its text, cast to integer.
# NOTE(review): the integer cast presumably drops any fractional part of the figure —
# confirm whether "double" is the intended type.
# NOTE(review): this overwrites usedCarDF, so the cell is not safe to run twice — verify.
usedCarDF = usedCarDF.withColumn("Max_Power",Split_Power.getItem(0).cast("integer"))

In [88]:
import pyspark.sql.functions as F

In [89]:
# Distinct category labels for the two categorical columns.
# Null labels are skipped: they cannot serve as a column name, and the previous
# `"" + label` concatenation raised TypeError on None. Collecting the rows directly
# also avoids the extra DataFrame -> RDD conversion with a Python lambda.
fuel = [
    row["FUEL"]
    for row in usedCarDF.select("FUEL").distinct().collect()
    if row["FUEL"] is not None
]
transmission = [
    row["TRANSMISSION"]
    for row in usedCarDF.select("TRANSMISSION").distinct().collect()
    if row["TRANSMISSION"] is not None
]

# One 0/1 indicator column per label, named after the label itself.
fuelType = [F.when(F.col("FUEL") == ty, 1).otherwise(0).alias(ty) for ty in fuel]
transmissionType = [
    F.when(F.col("TRANSMISSION") == code, 1).otherwise(0).alias(code)
    for code in transmission
]

# Keep the numeric features, the SOLD flag and the indicator columns only.
usedCarDF = usedCarDF.select(
    "SELLING_PRICE", "YEAR", "KM_DRIVEN", "MAX_POWER", "SEATS", "MILEAGE_NO", "SOLD",
    *fuelType, *transmissionType,
)

# Split by the SOLD flag; the flag itself is dropped from each subset.
usedCarDF_Sold = usedCarDF.filter(usedCarDF.SOLD == "Y").drop("SOLD")
usedCarDF_unSold = usedCarDF.filter(usedCarDF.SOLD == "N").drop("SOLD")
usedCarDF_Sold.show()

+-------------+----+---------+---------+-----+----------+------+---+---+------+---------+------+
|SELLING_PRICE|YEAR|KM_DRIVEN|MAX_POWER|SEATS|MILEAGE_NO|Diesel|CNG|LPG|Petrol|Automatic|Manual|
+-------------+----+---------+---------+-----+----------+------+---+---+------+---------+------+
|       450000|2014|   145500|       74|    5|      23.4|     1|  0|  0|     0|        0|     1|
|       370000|2014|   120000|      103|    5|     21.14|     1|  0|  0|     0|        0|     1|
|       158000|2006|   140000|       78|    5|      17.7|     0|  0|  0|     1|        0|     1|
|       225000|2010|   127000|       90|    5|      23.0|     1|  0|  0|     0|        0|     1|
|       130000|2007|   120000|       88|    5|      16.1|     0|  0|  0|     1|        0|     1|
|       440000|2017|    45000|       81|    5|     20.14|     0|  0|  0|     1|        0|     1|
|        96000|2007|   175000|       57|    5|      17.3|     0|  0|  1|     0|        0|     1|
|        45000|2001|     5000|

In [90]:
from pyspark.sql.functions import col

In [91]:
# Schema after the indicator columns were added and the unused columns dropped.
usedCarDF.printSchema()

root
 |-- SELLING_PRICE: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- KM_DRIVEN: integer (nullable = true)
 |-- MAX_POWER: integer (nullable = true)
 |-- SEATS: integer (nullable = true)
 |-- MILEAGE_NO: double (nullable = true)
 |-- SOLD: string (nullable = true)
 |-- Diesel: integer (nullable = false)
 |-- CNG: integer (nullable = false)
 |-- LPG: integer (nullable = false)
 |-- Petrol: integer (nullable = false)
 |-- Automatic: integer (nullable = false)
 |-- Manual: integer (nullable = false)



In [92]:
from pyspark.ml.stat import Correlation

In [93]:
from pyspark.ml.feature import VectorAssembler

In [94]:
# Name chosen for a feature-vector column.
# NOTE(review): presumably intended as the VectorAssembler output / Correlation input — confirm.
vector_col = "corr_features"

In [95]:
# List the column names of the encoded frame (still includes SOLD).
usedCarDF.columns

['SELLING_PRICE',
 'YEAR',
 'KM_DRIVEN',
 'MAX_POWER',
 'SEATS',
 'MILEAGE_NO',
 'SOLD',
 'Diesel',
 'CNG',
 'LPG',
 'Petrol',
 'Automatic',
 'Manual']

In [103]:
# Column names of the sold subset; these drive the correlation loops below.
column_list = usedCarDF_Sold.columns


In [104]:
# Display the column names captured from usedCarDF_Sold.
column_list

['SELLING_PRICE',
 'YEAR',
 'KM_DRIVEN',
 'MAX_POWER',
 'SEATS',
 'MILEAGE_NO',
 'Diesel',
 'CNG',
 'LPG',
 'Petrol',
 'Automatic',
 'Manual']

In [113]:
# Correlation of each column of the sold subset with SELLING_PRICE, one line per column.
for feature_name in column_list:
    coefficient = usedCarDF_Sold.corr("SELLING_PRICE", feature_name)
    print(feature_name, coefficient)

SELLING_PRICE 1.0
YEAR 0.41633780251166896
KM_DRIVEN -0.23536376181512805
MAX_POWER 0.7787732815949355
SEATS 0.009365095128508102
MILEAGE_NO -0.10077194169192169
Diesel 0.19098213504877337
CNG -0.021031438766946558
LPG -0.02943889643545322
Petrol -0.18541170589683892
Automatic 0.6030166122876204
Manual -0.6030166122876205


In [114]:
# Correlation of each column of the unsold subset with SELLING_PRICE, one line per column.
for feature_name in column_list:
    coefficient = usedCarDF_unSold.corr("SELLING_PRICE", feature_name)
    print(feature_name, coefficient)

SELLING_PRICE 1.0
YEAR 0.41195436159198523
KM_DRIVEN -0.21767236571895754
MAX_POWER 0.7389628647884097
SEATS 0.05384188847352593
MILEAGE_NO -0.1355470382306577
Diesel 0.21114703476810504
CNG -0.0366968418394256
LPG -0.0381722432806315
Petrol -0.19993798762929735
Automatic 0.5853797441685367
Manual -0.5853797441685367


In [None]:
from pyspark.ml.stat import Correlation


In [None]:
# Correlation.corr() takes the name of a single Vector column, so passing the integer
# column "SELLING_PRICE" fails. Assemble the numeric columns into one vector column
# first (VectorAssembler and vector_col come from earlier cells of this notebook).
numeric_cols = [name for name, dtype in usedCarDF.dtypes if dtype != "string"]  # leaves out SOLD
assembler = VectorAssembler(
    inputCols=numeric_cols,
    outputCol=vector_col,
    handleInvalid="skip",  # rows with a null in any input column are left out
)
feature_vectors = assembler.transform(usedCarDF).select(vector_col)

# The result is a one-row DataFrame whose single cell holds the correlation matrix.
corr_matrix = Correlation.corr(feature_vectors, vector_col).head()[0].toArray()

# Display the SELLING_PRICE row of the matrix, labelled by column name.
price_row = corr_matrix[numeric_cols.index("SELLING_PRICE")]
dict(zip(numeric_cols, price_row.tolist()))