In [1]:
import os
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc
from pyspark.sql import functions as F
from pyspark.ml.feature import CountVectorizer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression

In [2]:
head, tail = os.path.split(os.getcwd())
data_dir = os.path.join(head, 'data')
data_raw_dir = os.path.join(data_dir, 'raw')
DATA_INTERIM_DIR = os.path.join(data_dir, 'interim')
DATA_PROCESSED_DIR = os.path.join(data_dir, 'processed')
data_raw_dir

'/Users/Gagandeep/Desktop/Concordia/12 Winter 2021/SOEN 471/NoToW/data/raw'

In [5]:
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

In [6]:
spark = init_spark()

In [9]:
clean_data = spark.read.parquet(os.path.join(DATA_PROCESSED_DIR, 'cleaned.data'))
clean_data.head(2)

[Row(DATE_ORIGINE='2019-01-10', LONGITUDE_ORIGINE=-73.60370215594651, LATITUDE_ORIGINE=45.5590991399067, Distance_km=0.0484305337462778, MOTIF_REMORQUAGE="Constat d'infraction", Date_Time='2019-01-10', Year=2019.0, Month=1.0, Day=10.0, Mean_Temp=-7.9, Total_Rain=0.0, Total_Precip=0.2, Total_Snow=0.0, Spd_of_Max_Gust=48.0),
 Row(DATE_ORIGINE='2019-01-10', LONGITUDE_ORIGINE=-73.599733540326, LATITUDE_ORIGINE=45.453884709812606, Distance_km=0.07233637828159714, MOTIF_REMORQUAGE="Constat d'infraction", Date_Time='2019-01-10', Year=2019.0, Month=1.0, Day=10.0, Mean_Temp=-7.9, Total_Rain=0.0, Total_Precip=0.2, Total_Snow=0.0, Spd_of_Max_Gust=48.0)]

In [None]:
clean_data.printSchema()

In [None]:
clean_data.isNull().count()

In [10]:
features = clean_data.drop('DATE_ORIGINE','Distance_km','MOTIF_REMORQUAGE','Date_Time','Year', 'Spd_of_Max_Gust')
print(features.columns)
features.show(5)

['LONGITUDE_ORIGINE', 'LATITUDE_ORIGINE', 'Month', 'Day', 'Mean_Temp', 'Total_Rain', 'Total_Precip', 'Total_Snow']
+------------------+------------------+-----+----+---------+----------+------------+----------+
| LONGITUDE_ORIGINE|  LATITUDE_ORIGINE|Month| Day|Mean_Temp|Total_Rain|Total_Precip|Total_Snow|
+------------------+------------------+-----+----+---------+----------+------------+----------+
|-73.60370215594651|  45.5590991399067|  1.0|10.0|     -7.9|       0.0|         0.2|       0.0|
|  -73.599733540326|45.453884709812606|  1.0|10.0|     -7.9|       0.0|         0.2|       0.0|
| -73.5575876924943|45.531104742785395|  1.0|10.0|     -7.9|       0.0|         0.2|       0.0|
| -73.5599771223213|   45.594984152964|  1.0|10.0|     -7.9|       0.0|         0.2|       0.0|
|-73.62714397061728|  45.4186077928051|  1.0|10.0|     -7.9|       0.0|         0.2|       0.0|
+------------------+------------------+-----+----+---------+----------+------------+----------+
only showing top 5 ro

In [None]:
for col_name in features.schema.names:
    if features.filter(features[col_name].isNull()).count() > 0:
        print(col_name)

In [11]:
vectorAssembler = VectorAssembler(inputCols = features.columns, outputCol = 'features')
training_df = vectorAssembler.transform(clean_data)
training_df = training_df.select(['features', 'Distance_km'])
training_df.show(5)

+--------------------+--------------------+
|            features|         Distance_km|
+--------------------+--------------------+
|[-73.603702155946...|  0.0484305337462778|
|[-73.599733540326...| 0.07233637828159714|
|[-73.557587692494...|  0.3967286084324585|
|[-73.559977122321...|  0.2499872949122831|
|[-73.627143970617...|0.047350695587300755|
+--------------------+--------------------+
only showing top 5 rows



In [12]:
print(type(training_df))
train,test = training_df.randomSplit([0.75, 0.25])

<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
test.show(1)

In [13]:
lr = LinearRegression(featuresCol = 'features', labelCol='Distance_km')
lr_model = lr.fit(train)

In [14]:
print("Coefficients: " + str(lr_model.coefficients))
print("\nIntercept: " + str(lr_model.intercept))

Coefficients: [-0.30150542959901183,0.1328908652697921,-0.0019920841747191106,0.00020015526289856626,-0.00060211949497434,0.0,0.00029682119789491704,0.0]

Intercept: -27.957594402754818


In [15]:
trainSummary = lr_model.summary
print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("\nr2: %f" % trainSummary.r2)

RMSE: 0.362298

r2: 0.001919


In [16]:
lr2 = LinearRegression(featuresCol = 'features', labelCol='Distance_km',maxIter=1000, regParam=0.12, elasticNetParam=0.2)
lr_model2 = lr.fit(train)
print("Coefficients: " + str(lr_model2.coefficients))
print("\nIntercept: " + str(lr_model2.intercept))
trainSummary = lr_model2.summary
print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("\nr2: %f" % trainSummary.r2)

Coefficients: [-0.30150542959901183,0.1328908652697921,-0.0019920841747191106,0.00020015526289856626,-0.00060211949497434,0.0,0.00029682119789491704,0.0]

Intercept: -27.957594402754818
RMSE: 0.362298

r2: 0.001919


In [18]:
from  pyspark.sql.functions import abs
predictions = lr_model.transform(test)
x =((predictions['Distance_km']-predictions['prediction'])/predictions['Distance_km'])*100
predictions = predictions.withColumn('Accuracy',abs(x))
predictions.select("prediction","Distance_km","Accuracy","features").show()

+-------------------+--------------------+------------------+--------------------+
|         prediction|         Distance_km|          Accuracy|            features|
+-------------------+--------------------+------------------+--------------------+
|0.31948522894599307| 0.07774081929023328| 310.9620040833948|(8,[0,1,2,3],[-73...|
|  0.319004030864086| 0.32676351753179644| 2.374649020282802|(8,[0,1,2,3],[-73...|
|  0.301166570206032| 0.04211591656085911| 615.0896734512112|(8,[0,1,2,3],[-73...|
|  0.298204122527828| 0.06608208888522123|351.26316004595793|(8,[0,1,2,3],[-73...|
|  0.316183658437172|0.005464875812922718| 5685.742791986174|(8,[0,1,2,3],[-73...|
| 0.2975068433572474|                 0.0|              null|(8,[0,1,2,3],[-73...|
| 0.2975068433572474|  0.7323478551560163| 59.37629348366593|(8,[0,1,2,3],[-73...|
|0.29709414832123215| 0.20156398653590432|    47.39445941069|(8,[0,1,2,3],[-73...|
|0.29709414832123215|  0.6283447658479964|52.717971968735625|(8,[0,1,2,3],[-73...|
| 0.

In [1]:
results = lr_model.evaluate(test)

NameError: name 'lr_model' is not defined