# MAST30034_Applied Data Science_Project1

## Import Libraries

In [35]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
from geodatasets import get_path
import pandas as pd
import geopandas as gpd
from pyspark.sql.functions import date_format, hour, dayofweek
import seaborn as sns
from scipy.stats import chi2_contingency
from pyspark.sql.functions import count as spark_count
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [6]:
# Start (or reuse) the Spark session used to read the parquet inputs below.
spark = (
    SparkSession.builder
    .appName("ADS project 1")
    # Show DataFrame contents directly when one is the last expression in a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    # Driver and executors get the same 6G memory budget.
    .config("spark.driver.memory", "6G")
    .config("spark.executor.memory", "6G")
    .config("spark.sql.parquet.cacheMetadata", "true")
    .getOrCreate()
)

24/08/22 16:27:26 WARN Utils: Your hostname, chumuhandeMacBook-Air-2.local resolves to a loopback address: 127.0.0.1; using 10.13.132.182 instead (on interface en0)
24/08/22 16:27:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/22 16:27:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read the data

In [26]:
# Read the engineered parquet datasets through Spark, then pull each one into
# a pandas DataFrame for the scikit-learn steps that follow.
taxi_data = (
    spark.read
    .parquet('../data/after_EDA_data_Engineering/taxi')
    .toPandas()
)
citybike_data = (
    spark.read
    .parquet('../data/after_EDA_data_Engineering/citybike')
    .toPandas()
)

## One hot encoding

In [29]:
def one_hot_encode(df, columns):
    """Replace categorical columns with one-hot indicator columns.

    Each column in ``columns`` is encoded on its own with scikit-learn's
    ``OneHotEncoder`` (dense output, first category dropped as the baseline).
    The source columns are removed and the indicator columns are appended to
    the right of the remaining columns, in the order given by ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and the same index as ``df``.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``df``.
    """
    # Materialise once so a generator argument can be iterated and dropped.
    columns = list(columns)

    # NOTE: this relies on the scikit-learn OneHotEncoder (``sparse_output`` and
    # ``get_feature_names_out`` are its API), not the pyspark.ml class of the
    # same name that the notebook also imports.
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_parts = []

    for column in columns:
        encoded_data = encoder.fit_transform(df[[column]])
        encoded_parts.append(
            pd.DataFrame(
                encoded_data,
                columns=encoder.get_feature_names_out([column]),
                # Reuse the caller's index: concat(axis=1) aligns on index
                # labels, so a default RangeIndex here would misalign rows and
                # introduce NaNs whenever df's index is not 0..n-1.
                index=df.index,
            )
        )

    # Single concat instead of re-concatenating the growing frame per column.
    return pd.concat([df.drop(columns=columns), *encoded_parts], axis=1)

In [30]:
# One-hot encode the categorical taxi columns.
# Not idempotent: running this cell a second time raises KeyError because the
# source columns have already been replaced.
taxi_data = one_hot_encode(
    taxi_data,
    columns=['hour', 'icon', 'preciptype', 'day_of_week'],
)

In [12]:
# One-hot encode the hour column of the citybike data.
# Not idempotent: running this cell a second time raises KeyError because
# 'hour' has already been replaced.
citybike_data = one_hot_encode(citybike_data, columns=['hour'])

In [61]:
# Preview the first ten rows of the encoded taxi frame.
taxi_data.head(n=10)

Unnamed: 0,temp,humidity,windgust,visibility,solarenergy,taxi_count,hour_1,hour_2,hour_3,hour_4,...,icon_snow,preciptype_rain,"preciptype_rain,snow",preciptype_snow,day_of_week_Mon,day_of_week_Sat,day_of_week_Sun,day_of_week_Thu,day_of_week_Tue,day_of_week_Wed
0,-0.589687,-0.278247,0.55845,0.333029,1.665019,5523,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2.104295,0.251256,0.55845,0.333029,-0.684474,1586,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.404979,0.320842,0.672284,0.333029,-0.684474,545,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.481621,0.519269,-1.743217,0.333029,-0.684474,5553,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.183513,-0.709351,0.330052,0.333029,2.202619,6321,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,-0.318782,-0.73164,2.197527,0.333029,1.665019,5764,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,-0.675884,-0.730009,0.672284,0.333029,1.587583,3030,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,-1.365462,0.693777,-1.410213,0.333029,-0.684474,1170,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,1.318965,0.181671,-0.451757,0.333029,1.337501,6098,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,0.777154,1.449976,-1.127251,0.269818,-0.684474,2329,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Split data into training and test datasets

In [36]:
def data_splitting(data, label, test_size=0.2, random_state=42):
    """Split a DataFrame into train/test features and targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the feature columns and the target column.
    label : str
        Name of the target column; it is removed from the feature frames.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` as ``random_state``. The default
        keeps the split identical to the previously hard-coded seed.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Side effects
    ------------
    Prints the number of rows in the training and test feature sets.
    """
    X = data.drop(columns=label)
    y = data[label]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print("Training set size:", X_train.shape[0])
    print("Test set size:", X_test.shape[0])

    return X_train, X_test, y_train, y_test



In [37]:
# 80/20 train/test split of the taxi data, with taxi_count as the target.
(
    X_train_taxi,
    X_test_taxi,
    y_train_taxi,
    y_test_taxi,
) = data_splitting(taxi_data, label='taxi_count', test_size=0.2)

Training set size: 6412
Test set size: 1604


In [38]:
# 80/20 train/test split of the citybike data, with citybike_count as the target.
(
    X_train_citybike,
    X_test_citybike,
    y_train_citybike,
    y_test_citybike,
) = data_splitting(citybike_data, label='citybike_count', test_size=0.2)

Training set size: 6412
Test set size: 1604


## Linear regression model

## Random forest model