# MAST30034_Applied Data Science_Project1

## Import Libraries

In [12]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
from geodatasets import get_path
import pandas as pd
import geopandas as gpd
from pyspark.sql.functions import date_format, hour, dayofweek
import seaborn as sns
from scipy.stats import chi2_contingency
from pyspark.sql.functions import count as spark_count
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline


In [6]:
# Build (or reuse) the Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .appName("ADS project 1")
    # Ask Spark to render DataFrames eagerly in the notebook REPL.
    .config("spark.sql.repl.eagerEval.enabled", True)
    # Memory settings for the driver and the executors.
    .config("spark.driver.memory", "6G")
    .config("spark.executor.memory", "6G")
    # NOTE(review): parquet metadata caching flag; recent Spark releases may
    # ignore this option -- confirm it is still needed.
    .config("spark.sql.parquet.cacheMetadata", "true")
    .getOrCreate()
)

## Read the data

In [35]:
# Load the taxi and Citi Bike tables from parquet. Judging by the directory
# name these are the outputs of the earlier EDA / data-engineering stage.
taxi_data = spark.read.parquet('../data/after_EDA_data_Engineering/taxi')
citybike_data = spark.read.parquet('../data/after_EDA_data_Engineering/citybike')

In [36]:
# Print a preview of the taxi table to check its columns and values.
taxi_data.show()

+----+--------------------+--------------------+----------+-------------------+--------------------+--------------------+-------------------+-----------+----------+
|hour|                temp|            humidity|preciptype|           windgust|          visibility|         solarenergy|               icon|day_of_week|taxi_count|
+----+--------------------+--------------------+----------+-------------------+--------------------+--------------------+-------------------+-----------+----------+
|  11| -0.5896870737724257|-0.27824657171602496|  not rain| 0.5584503060838967|  0.3330290163749476|  1.6650191425305423|          clear-day|        Fri|      5523|
|   6| -2.1042952766705487|  0.2512560090587267|  not rain| 0.5584503060838967|  0.3330290163749476| -0.6844743385380112|        clear-night|        Wed|      1586|
|   5|  -0.404978762790395|  0.3208415127216528|  not rain| 0.6722841472663952|  0.3330290163749476| -0.6844743385380112|partly-cloudy-night|        Sun|       545|
|  23|  0.

In [9]:
# Print a preview of the Citi Bike table to check its columns and values.
citybike_data.show()

+----+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+--------------------+------------------+--------------+
|hour|                temp|                 dew|            humidity|           windgust|         cloudcover|         visibility|         solarenergy|             uvindex|        severerisk|citybike_count|
+----+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+--------------------+--------------------+------------------+--------------+
|   6| -2.1042952766705585| -1.6353515748880314|  0.2512560090587033| 0.5584503060839103|-0.8982892402394185| 0.3330290163749645| -0.6844743385380124| -0.5939413742546337|-0.201801348869187|          2153|
|  11| -0.5896870737724342| -0.5726921086707277| -0.2782465717160488| 0.5584503060839103|-0.8892822712333712| 0.3330290163749645|  1.6650191425305345|  1.4073339750623697|-0.20

## One-hot encoding

In [39]:
def encode_categorical_column(dataset, columns):
    """One-hot encode the given categorical columns of a Spark DataFrame.

    Every column is first mapped to a numeric index with ``StringIndexer``
    and that index is then expanded with ``OneHotEncoder``. All stages are
    chained in one ``Pipeline`` so the whole encoding is fitted and applied
    in a single pass instead of one fit/transform round trip per column.

    Args:
        dataset: Spark DataFrame containing every column named in ``columns``.
        columns: Iterable of column names to encode.

    Returns:
        A DataFrame in which each listed column has been replaced by a
        ``<column>_encoded`` column; the original column and the intermediate
        ``<column>_index`` column are dropped. If ``columns`` is empty the
        input DataFrame is returned unchanged.
    """
    columns = list(columns)
    if not columns:
        # Nothing to encode: hand the input back untouched.
        return dataset

    stages = []
    helper_columns = []
    for column in columns:
        index_column = column + "_index"
        # Step 1: turn the raw values into a numeric category index.
        stages.append(StringIndexer(inputCol=column, outputCol=index_column))
        # Step 2: one-hot encode that index.
        stages.append(
            OneHotEncoder(inputCol=index_column, outputCol=column + "_encoded")
        )
        helper_columns.extend([column, index_column])

    encoded = Pipeline(stages=stages).fit(dataset).transform(dataset)

    # Step 3: keep only the encoded columns; drop the raw and index columns.
    return encoded.drop(*helper_columns)

In [41]:
# One-hot encode the categorical taxi features.
# NOTE(review): this rebinds taxi_data and the encoder drops the source
# columns, so re-running this cell without re-reading the data is expected to
# fail -- confirm, or assign the result to a new name.
taxi_data = encode_categorical_column(taxi_data, ['hour', 'icon', 'preciptype', 'day_of_week'])

In [42]:
# One-hot encode the hour column of the Citi Bike table.
# NOTE(review): this rebinds citybike_data and the encoder drops the source
# column, so re-running this cell without re-reading the data is expected to
# fail -- confirm, or assign the result to a new name.
citybike_data  =  encode_categorical_column(citybike_data, ['hour'])

## Split the data into training and test datasets

In [45]:
def data_splitting(data, label, test_size, seed=42):
    """Randomly split a DataFrame into train/test features and labels.

    Args:
        data: DataFrame to split; it must expose ``randomSplit``, ``drop``,
            ``select`` and ``count`` (e.g. a Spark DataFrame).
        label: Name of the target column.
        test_size: Fraction of rows to send to the test set, in ``[0, 1]``.
            The split is random, so the realised sizes are approximate.
        seed: Seed forwarded to ``randomSplit``; defaults to 42 so existing
            callers keep getting the same split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` frames
        contain every column except ``label`` and the ``y_*`` frames contain
        only ``label``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]``.
    """
    # Fail early with a clear message rather than handing a negative weight
    # to randomSplit.
    if not 0 <= test_size <= 1:
        raise ValueError(
            "test_size must be between 0 and 1, got {!r}".format(test_size)
        )

    train_data, test_data = data.randomSplit(
        [1 - test_size, test_size], seed=seed
    )

    X_train = train_data.drop(label)
    y_train = train_data.select(label)
    X_test = test_data.drop(label)
    y_test = test_data.select(label)

    print("Training set size:", X_train.count())
    print("Test set size:", X_test.count())

    return X_train, X_test, y_train, y_test


In [49]:
# Hold out roughly 20% of the taxi rows for testing; 'taxi_count' is the target column.
X_train_taxi, X_test_taxi, y_train_taxi, y_test_taxi = data_splitting(taxi_data, 'taxi_count', 0.2)

Training set size: 6484
Test set size: 1532


In [50]:
# Hold out roughly 20% of the Citi Bike rows for testing; 'citybike_count' is the target column.
X_train_citybike, X_test_citybike, y_train_citybike, y_test_citybike = data_splitting(citybike_data, 'citybike_count', 0.2)

Training set size: 6484
Test set size: 1532


## Random forest model