In [1]:
from pyspark.sql import SparkSession
from urllib.request import urlretrieve
# Create a spark session (which will run spark jobs)
spark = (
    # Application name attached to this session
    SparkSession.builder.appName("ADS project 1")
    # Eager evaluation so DataFrames are rendered when displayed in the notebook
    .config("spark.sql.repl.eagerEval.enabled", True)
    # Memory settings: 4G per executor, 8G for the driver
    .config("spark.executor.memory","4G")
    .config("spark.driver.memory","8G")
    # Turn on Parquet metadata caching
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Pin the SQL session time zone to UTC
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Return the already-running session if there is one, otherwise start a new one
    .getOrCreate()
)

22/08/22 02:02:38 WARN Utils: Your hostname, Luo resolves to a loopback address: 127.0.1.1; using 172.17.1.121 instead (on interface eth0)
22/08/22 02:02:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/22 02:02:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Set the session's Parquet compression codec to gzip.
# NOTE(review): presumably this only matters for Parquet files written later;
# no write appears in the cells shown here — confirm it is still needed.
spark.conf.set("spark.sql.parquet.compression.codec", "gzip")

In [3]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sbs
import geopandas as gpd
import folium
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.regression import LinearRegression

In [4]:
# Input columns that get assembled into the model's feature vector
FEATURES = [
    'PULocationID',
    'trip_distance',
    'fare_amount',
    'average_speed',
    'time_duration',
    'Temperature (F)',
    'Wind Speed (mph)',
    'Pickup_Time',
    'Is_Airport',
    'Is_Weekend',
    'Is_Rainy',
]
# Name of the assembled feature-vector column
features = 'features'

In [5]:
# assemble the model inputs into a single vector column
def feature_converter(sdf):
    """Return `sdf` with the FEATURES columns packed into one vector column.

    The output column is named by the module-level `features` constant;
    all existing columns of `sdf` are kept.
    """
    assembler = VectorAssembler(inputCols=FEATURES, outputCol=features)
    return assembler.transform(sdf)

In [6]:
# acts as a pipeline step before modelling
def preparation(sdf):
    """Vectorise the feature columns and keep only what the model needs.

    Returns a DataFrame with exactly two columns: the assembled feature
    vector (named by `features`) and the response 'tip_amount'.
    """
    vectorised = feature_converter(sdf)
    return vectorised.select(features, 'tip_amount')

In [7]:
# load the train and test dataframes and pipeline each one through `preparation`
train_sdf = preparation(
    spark.read.parquet('../data/curated/tlc_data/Model_data.parquet')
)
test_sdf = preparation(
    spark.read.parquet('../data/curated/tlc_data/Test_data.parquet')
)

                                                                                

In [8]:
# Discretise the tip amount into binary labels: 0 = low tip, 1 = high tip
from pyspark.sql.types import IntegerType

def discretisation(tip, threshold=2):
    """Build an integer label column from a tip-amount column.

    Implemented with native Spark column expressions rather than a Python
    UDF, so rows are not shipped to a Python worker one by one, and a null
    tip yields a null label instead of raising ``TypeError`` on ``None <= 2``.

    Parameters
    ----------
    tip : pyspark.sql.Column or str
        The tip-amount column, or the name of that column.
    threshold : numeric, default 2
        Tips less than or equal to this value are labelled 0; larger tips
        are labelled 1.

    Returns
    -------
    pyspark.sql.Column
        IntegerType column holding 0, 1, or null (when the tip is null).
    """
    # Accept a column name as well as a Column, as a UDF call would
    tip_col = F.col(tip) if isinstance(tip, str) else tip
    return (
        # Explicit null branch: otherwise a null comparison would fall through to 1
        F.when(tip_col.isNull(), F.lit(None))
        .when(tip_col <= threshold, F.lit(0))
        .otherwise(F.lit(1))
        .cast(IntegerType())
    )


In [9]:
# apply the discretisation
# Add a 'label' column to both the train and test frames, derived from 'tip_amount'
train_sdf = train_sdf.withColumn('label', discretisation(F.col('tip_amount')))
test_sdf = test_sdf.withColumn('label', discretisation(F.col('tip_amount')))

In [10]:
# baseline accuracy: share of training rows whose label is 1
# (i.e. what always predicting 1 would score on the training data)
train_sdf.filter(train_sdf['label'] == 1).count() / train_sdf.count()

                                                                                

0.6310719090384228

In [11]:
from pyspark.ml.classification import LogisticRegression

# Fit a binomial logistic regression on the training frame, then score the test frame.
# NOTE(review): elasticNetParam only takes effect when regParam > 0, and regParam is
# left at its default here — confirm whether an L1 penalty was actually intended.
log_reg = (
    LogisticRegression(
        featuresCol=features,
        labelCol='label',
        family="binomial",
        elasticNetParam=1,
    )
    .fit(train_sdf)
)
# `result` is the test frame with the model's output columns (incl. 'prediction') appended
result = log_reg.transform(test_sdf)

                                                                                

In [12]:
# define the values for error analysis
# A single grouped aggregation returns every (label, prediction) count in one
# pass over `result`, instead of launching four separate filter-and-count jobs
# that each re-evaluate the whole scored test set.
_confusion = {
    (row['label'], row['prediction']): row['count']
    for row in result.groupBy('label', 'prediction').count().collect()
}
# Combinations that never occur are absent from the grouped result, hence the 0 default.
# 'prediction' is a double, but (0, 0.0) and (0, 0) hash and compare equal in Python.
TN = _confusion.get((0, 0), 0)
FP = _confusion.get((0, 1), 0)
FN = _confusion.get((1, 0), 0)
TP = _confusion.get((1, 1), 0)

                                                                                

In [13]:
# produce evaluation scores
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero.

    Keeps the cell from raising ZeroDivisionError when a metric is undefined,
    e.g. precision when the model predicts no positives at all.
    """
    return numerator / denominator if denominator else float('nan')

acc = _safe_ratio(TN + TP, TN + TP + FN + FP)
# recall: share of actual positives that were predicted positive
recall = _safe_ratio(TP, TP + FN)
# precision: share of predicted positives that are actually positive
precision = _safe_ratio(TP, TP + FP)
# harmonic mean of recall and precision; NaN if either is undefined or both are 0
F1 = _safe_ratio(2*recall*precision, recall + precision)
print(f'The Accuracy is: {acc}')
print(f'The Recall is: {recall}')
print(f'The Precision is: {precision}')
print(f'The F1 score is: {F1}')

The Accuracy is: 0.7619080133355771
The Recall is: 0.8640002122198863
The Precision is: 0.7889589351355607
The F1 score is: 0.824776206333329
