## DASK_ML

In [1]:
import time
from itertools import product

import dask
import dask.dataframe as dd
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from dask.delayed import delayed
from dask.distributed import Client
from dask.distributed import LocalCluster
from dask_ml.linear_model import LinearRegression
from dask_ml.model_selection import train_test_split
from dask_ml.wrappers import Incremental
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

In [2]:
# Start a local Dask cluster and connect a client to it
cluster = LocalCluster(
    n_workers=64,
    threads_per_worker=2,
    memory_limit="2GB",
)
client = Client(cluster)
# Show the scheduler address and the resources visible to the client
print(client)

<Client: 'tcp://127.0.0.1:39497' processes=64 threads=128, memory=119.21 GiB>


In [3]:
# Load data
files_to_use = "parquet"
# files_to_use = "hdf5" # does not work

# Time only the reader call.
# NOTE(review): the dd.read_* readers presumably just build a lazy graph, so
# this is not the cost of actually scanning the files - confirm.
start_time = time.time()
if files_to_use == "parquet":
    data = dd.read_parquet('/d/hpc/projects/FRI/bigdata/students/dp8949/parquet_data/*.parquet')
elif files_to_use == "hdf5":
    data = dd.read_hdf('/d/hpc/projects/FRI/bigdata/students/dp8949/hdf5_data/*.h5', '/*')
else:
    # Fail with a normal exception instead of exit(): exit() is meant for the
    # interactive shell and should not be used to abort a notebook cell.
    raise ValueError(f"No data files found for files_to_use={files_to_use!r}")
end_time = time.time()

# Append a header plus the read timing to the shared comparison log.
with open('results_comparison.txt', 'a') as f:
    f.write("-"*50 + "\n")
    f.write(f"Data loaded from {files_to_use} files\n")
    # NOTE(review): on a Dask DataFrame the row count in .shape is lazy, so this
    # line logs a placeholder for the number of rows rather than a number - confirm.
    f.write(f"Data shape: {data.shape}\n")
    f.write("-"*50 + "\n\n")
    f.write(f"{'Task':<35}{'Time':<10}\n")
    f.write(f"{'Data reading':<35}{(end_time - start_time):<10.2f}\n")

# print all columns and their data types
print(data.dtypes)

Summons Number                                 int64
Plate ID                             string[pyarrow]
Registration State                   string[pyarrow]
Plate Type                           string[pyarrow]
Issue Date                           string[pyarrow]
Violation Code                                 int64
Vehicle Body Type                    string[pyarrow]
Vehicle Make                         string[pyarrow]
Issuing Agency                       string[pyarrow]
Street Code1                                   int64
Street Code2                                   int64
Street Code3                                   int64
Vehicle Expiration Date              string[pyarrow]
Violation Location                           float64
Violation Precinct                             int64
Issuer Precinct                                int64
Issuer Code                                    int64
Issuer Command                       string[pyarrow]
Issuer Squad                         string[py

### Data Cleaning

In [4]:
def data_cleaning(data):
    """Reduce the raw violations frame to cleaned calendar features.

    Keeps only the county, issue-date and violation-time columns, normalises
    the county codes to borough names, parses the date and time, derives the
    calendar columns and drops rows that are out of range or incomplete.

    Parameters
    ----------
    data : Dask DataFrame with at least the columns 'Violation County',
        'Issue Date' and 'Violation Time'.

    Returns
    -------
    Dask DataFrame with the columns violation_county, issue_date,
    violation_day_week, violation_day_month, violation_month, violation_year
    and violation_hour. Only Dask operations are chained here; nothing is
    computed inside this function.
    """
    # keep only the columns we need
    columns_to_keep = [
        'Violation County',
        'Issue Date',
        'Violation Time',]

    data = data[columns_to_keep]

    # rename columns to remove spaces and make them lowercase
    data = data.rename(columns={
        'Violation County': 'violation_county',
        'Issue Date': 'issue_date',
        'Violation Time': 'violation_time'
    })

    # fix the county names
    remap_county_dict = {
        'K' : 'Brooklyn',
        'Q' : 'Queens',
        'NY': 'Manhattan',
        'QN': 'Queens',
        'BK': 'Brooklyn',
        'R' : 'Staten Island',
        'BX': 'Bronx',
        'ST': 'Staten Island',
        'MN': 'Manhattan',
        'KINGS': 'Brooklyn',
        'QNS': 'Queens',
        'BRONX': 'Bronx'
    }
    # NOTE(review): codes that are not keys of the mapping presumably become
    # missing values and are then removed by the dropna() below - confirm.
    data['violation_county'] = data['violation_county'].map(remap_county_dict, meta=('violation_county', 'category')).astype('category')

    # convert the Issue Date to a datetime object (unparseable values are coerced to NaT)
    data['issue_date'] = dd.to_datetime(data['issue_date'], format='%m/%d/%Y', errors='coerce')

    # Remove 'A' and 'P' from the end of the time, add ' AM' or ' PM' accordingly
    data['violation_time'] = data['violation_time'].str.slice(stop=-1) + ' ' + data['violation_time'].str.slice(start=-1).replace({'A': 'AM', 'P': 'PM'})

    # Convert the Violation Time to a datetime object (unparseable values are coerced to NaT)
    data['violation_time'] = dd.to_datetime(data['violation_time'], format='%I%M %p', errors='coerce')

    # create a new column for the day of the week the violation was issued
    data['violation_day_week'] = data['issue_date'].dt.dayofweek

    # create a new column for the day of the month the violation was issued
    data['violation_day_month'] = data['issue_date'].dt.day

    # create a new column for the month the violation was issued
    data['violation_month'] = data['issue_date'].dt.month

    # create a new column for the year the violation was issued
    data['violation_year'] = data['issue_date'].dt.year

    # keep only rows with valid year (2013-2024)
    data = data[(data['violation_year'] >= 2013) & (data['violation_year'] <= 2024)]

    # keep only rows with valid month (1-12)
    data = data[(data['violation_month'] >= 1) & (data['violation_month'] <= 12)]

    # drop all rows with missing values
    data = data.dropna()

    # create a new column for the hour of the day the violation was issued
    # (done after dropna so the cast to int32 never sees a missing time)
    data['violation_hour'] = data['violation_time'].dt.hour.astype('int32')

    # drop only the Violation Time column; issue_date is kept in the result
    data = data.drop(columns=['violation_time'])
    
    return data

# Apply the cleaning pipeline and log how long the call took
start_time = time.time()
data = data_cleaning(data)

# append the elapsed time to results_comparison.txt
with open('results_comparison.txt', 'a') as log_file:
    log_file.write(f"{'Data Cleaning':<35}{time.time() - start_time:<10.2f}\n")

In [5]:
# Print the dtype of every column of the cleaned frame
print(data.dtypes)

violation_county             category
issue_date             datetime64[ns]
violation_day_week              int32
violation_day_month             int32
violation_month                 int32
violation_year                  int32
violation_hour                  int32
dtype: object


In [6]:
# Select the county to analyse: uncomment exactly one borough below,
# or leave county = 'all' to keep the violations of every county.

# county = 'Manhattan'
# county = 'Queens'
# county = 'Brooklyn'
# county = 'Bronx'
# county = 'Staten Island'
county = 'all'

In [7]:
# Restrict to the selected county (nothing to filter for 'all')
if county != 'all':
    data = data[data['violation_county'] == county]

# The county column is dropped in either case
data = data.drop(columns=['violation_county'])

In [8]:
# Aggregate to one row per calendar hour: the group size is the number of violations
data = (
    data
    .groupby(['violation_year', 'violation_month', 'violation_day_month', 'violation_day_week', 'violation_hour'])
    .size()
    .reset_index()
    .rename(columns={0: 'violation_count'})
)

In [9]:
# Build a 'YYYY-MM-DD HH:00:00' text timestamp per row (for joining with weather data)
year_text = data['violation_year'].astype(int).astype(str)
month_text = data['violation_month'].astype(int).astype(str).str.zfill(2)
day_text = data['violation_day_month'].astype(int).astype(str).str.zfill(2)
hour_text = data['violation_hour'].astype(int).astype(str).str.zfill(2)
data['datetime'] = year_text + '-' + month_text + '-' + day_text + ' ' + hour_text + ':00:00'
# The column is left as text here; the conversion below is disabled:
# data['datetime'] = dd.to_datetime(data['datetime'], format='%Y-%m-%d %H:%M:%S')

In [10]:
# Group the data by month and count the number of violations
# monthly_violations = data.groupby(data['datetime'].dt.to_period('M')).size()

# Plot the monthly violations
# monthly_violations.compute().plot()

In [11]:
# from the data take only the rows that have the datetime between 2013-01-01 and 2015-03-31
# data = data[(data['datetime'] >= '2013-01-01') & (data['datetime'] <= '2015-03-31')]

In [12]:
# Preview the first rows of the hourly violation counts
print(data.head())

   violation_year  violation_month  violation_day_month  violation_day_week  \
0          2013.0              1.0                  1.0                 1.0   
1          2013.0              1.0                  1.0                 1.0   
2          2013.0              1.0                  1.0                 1.0   
3          2013.0              1.0                  1.0                 1.0   
4          2013.0              1.0                  1.0                 1.0   

   violation_hour  violation_count             datetime  
0               0                1  2013-01-01 00:00:00  
1               1                1  2013-01-01 01:00:00  
2               2                2  2013-01-01 02:00:00  
3               3                1  2013-01-01 03:00:00  
4               4                2  2013-01-01 04:00:00  


## DATA AUGMENTATION

In [13]:
# Glob pattern matching every weather CSV file (adjust the path as needed)
file_pattern = 'nycweather_pesjak/nyc_w_*.csv'

# Column dtypes: measurements are read as floats, descriptive fields as strings
float_columns = [
    'temp', 'feelslike', 'dew', 'humidity', 'precip', 'precipprob',
    'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
    'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
    'solarenergy', 'uvindex', 'severerisk',
]
text_columns = ['preciptype', 'conditions', 'icon', 'stations']

dtypes = {'datetime': 'str'}  # 'datetime' will be parsed separately
dtypes.update({name: 'float64' for name in float_columns})
dtypes.update({name: 'object' for name in text_columns})

# Read all CSV files into a single Dask DataFrame with the dtypes above
weather_df = dd.read_csv(file_pattern, dtype=dtypes)

# Parse the ISO-style timestamps; values that do not match become NaT
weather_df['datetime'] = dd.to_datetime(weather_df['datetime'], format='%Y-%m-%dT%H:%M:%S', errors='coerce')

# Keep 'datetime' plus the numeric columns, which discards the string columns
numeric_columns = [col for col in weather_df.columns if weather_df[col].dtype in ['float64', 'int64']]
columns_to_keep = ['datetime'] + numeric_columns
weather_df = weather_df[columns_to_keep]

In [14]:
def add_weather_data(data, weather_df):
    """Attach weather features to the hourly violation data.

    Left-joins ``weather_df`` onto ``data`` using the 'datetime' column,
    removes the 'severerisk' and 'windgust' columns and then discards every
    row that still contains a missing value.

    Returns the resulting Dask DataFrame.
    """
    merged = dd.merge(data, weather_df, on='datetime', how='left')

    # Drop the two unused weather columns first, then the incomplete rows
    unused_columns = ['severerisk', 'windgust']
    return merged.drop(columns=unused_columns).dropna()

In [15]:
def fill_in_hours(data):
    """Insert zero-count rows for every hour missing from ``data``.

    Parameters
    ----------
    data : Dask DataFrame with the columns violation_year, violation_month,
        violation_day_month, violation_day_week, violation_hour,
        violation_count and datetime.

    Returns
    -------
    Dask DataFrame covering every hour between the smallest and largest
    'datetime' in ``data``. Hours without violations get violation_count 0,
    and the returned 'datetime' column is the generated hourly timestamp.
    """
    # Step 1: Find both bounds with a single graph execution. Two separate
    # .compute() calls would run the whole upstream pipeline twice.
    min_date, max_date = dask.compute(data['datetime'].min(), data['datetime'].max())
    print(f"Taking data from {min_date} to {max_date}")
    all_date_hours = pd.date_range(start=min_date, end=max_date, freq='h').to_frame(index=False, name='datetime')

    # Extract year, month, day_of_month, day_of_week and hour directly from the datetime
    all_date_hours['violation_year'] = all_date_hours['datetime'].dt.year
    all_date_hours['violation_month'] = all_date_hours['datetime'].dt.month
    all_date_hours['violation_day_month'] = all_date_hours['datetime'].dt.day
    all_date_hours['violation_day_week'] = all_date_hours['datetime'].dt.dayofweek
    all_date_hours['violation_hour'] = all_date_hours['datetime'].dt.hour

    # Step 2: Convert the complete hourly Pandas DataFrame to a Dask DataFrame
    all_date_hours_dd = dd.from_pandas(all_date_hours, npartitions=10)

    # Step 3: Outer-merge on the calendar columns so hours absent from `data` are kept
    complete_data = dd.merge(all_date_hours_dd, data, on=['violation_year', 'violation_month', 'violation_day_month', 'violation_day_week', 'violation_hour'], how='outer')

    # Hours that only exist in the generated range have no count: treat them as 0
    complete_data['violation_count'] = complete_data['violation_count'].fillna(0).astype(int)

    # Both inputs carry a 'datetime' column; keep the generated one (datetime_x)
    complete_data = complete_data.drop(columns=['datetime_y'])
    complete_data = complete_data.rename(columns={'datetime_x': 'datetime'})

    return complete_data

In [16]:
# Run the two augmentation steps in order, logging the duration of each one
augmentation_steps = (
    ('Fill in missing hours', fill_in_hours),
    ('Add weather data', lambda frame: add_weather_data(frame, weather_df)),
)

for step_label, step in augmentation_steps:
    start_time = time.time()
    data = step(data)
    # save the time to results_comparison.txt
    with open('results_comparison.txt', 'a') as f:
        f.write(f"{step_label:<35}{time.time() - start_time:<10.2f}\n")

# print the data types
print(data.dtypes)

Taking data from 2013-01-01 00:00:00 to 2024-12-31 21:00:00
datetime               datetime64[ns]
violation_year                  int32
violation_month                 int32
violation_day_month             int32
violation_day_week              int32
violation_hour                  int32
violation_count                 int64
temp                          float64
feelslike                     float64
dew                           float64
humidity                      float64
precip                        float64
precipprob                    float64
snow                          float64
snowdepth                     float64
windspeed                     float64
winddir                       float64
sealevelpressure              float64
cloudcover                    float64
visibility                    float64
solarradiation                float64
solarenergy                   float64
uvindex                       float64
dtype: object


## Split

In [17]:
def train_test_split(data):
    """Split ``data`` by time and return standardized Dask arrays.

    NOTE: this definition shadows ``dask_ml.model_selection.train_test_split``
    imported at the top of the notebook.

    Rows with 'datetime' before '2022-12-31 23:00:00' form the training set,
    the remaining rows the test set. 'violation_count' is the target and
    'datetime' is removed from the features. Feature columns with at most one
    distinct value in the training set are dropped, and both feature sets are
    standardized with the training mean and standard deviation.

    Uses the module-level ``client`` to persist the split frames.

    Returns
    -------
    (X_train_array, y_train_array, X_test_array, y_test_array) as Dask arrays.
    """
    start_time = time.time()
    
    # Temporal train-test split
    split_date = '2022-12-31 23:00:00'
    train_data = data[data['datetime'] < split_date]
    test_data = data[data['datetime'] >= split_date]
    
    print(f"Temporal train-test split done in {time.time() - start_time} seconds ({time.time() - start_time})")
    new_start_time = time.time()

    # Combine dropping of columns
    columns_to_drop = ['violation_count', 'datetime']
    X_train = train_data.drop(columns=columns_to_drop)
    y_train = train_data['violation_count']
    X_test = test_data.drop(columns=columns_to_drop)
    y_test = test_data['violation_count']
    
    print(f"Dropping columns done in {time.time() - new_start_time} seconds ({time.time() - start_time})")
    new_start_time = time.time()

    # Persist the training and test sets to avoid recomputation
    X_train, y_train, X_test, y_test = client.persist([X_train, y_train, X_test, y_test])
    
    print(f"Persisting data done in {time.time() - new_start_time} seconds ({time.time() - start_time})")
    new_start_time = time.time()

    # Identify constant columns with ONE graph execution for all columns
    # (one .compute() per column would traverse the data once per feature).
    feature_columns = list(X_train.columns)
    unique_counts = dask.compute(*[X_train[col].nunique() for col in feature_columns])
    constant_columns = [col for col, n_unique in zip(feature_columns, unique_counts) if n_unique <= 1]
    X_train = X_train.drop(columns=constant_columns)
    X_test = X_test.drop(columns=constant_columns)
    
    print(f"Dropping constant columns done in {time.time() - new_start_time} seconds ({time.time() - start_time})")
    new_start_time = time.time()
    # print how many columns were dropped
    print(f"    Dropped {len(constant_columns)} constant columns")

    # Convert Dask DataFrame to Dask Array (lengths=True resolves the chunk sizes)
    X_train_array = X_train.to_dask_array(lengths=True)
    y_train_array = y_train.to_dask_array(lengths=True)
    X_test_array = X_test.to_dask_array(lengths=True)
    y_test_array = y_test.to_dask_array(lengths=True)
    
    print(f"Converting to Dask Array done in {time.time() - new_start_time} seconds ({time.time() - start_time})")
    new_start_time = time.time()

    # Standardize the train data and apply the same transformation to the test data.
    # The statistics are evaluated once here; left lazy they would be re-derived
    # from the whole training set by every later .compute() on the returned arrays.
    mean, std = dask.compute(X_train_array.mean(axis=0), X_train_array.std(axis=0))
    X_train_array = (X_train_array - mean) / std
    X_test_array = (X_test_array - mean) / std
    
    print(f"Standardizing data done in {time.time() - new_start_time} seconds ({time.time() - start_time})")

    return X_train_array, y_train_array, X_test_array, y_test_array

In [18]:
# Split the augmented data and log how long the split took
start_time = time.time()
X_train, y_train, X_test, y_test = train_test_split(data)

# write to results_comparison.txt
with open('results_comparison.txt', 'a') as log_file:
    log_file.write(f"{'Train-test split':<35}{time.time() - start_time:<10.2f}\n")

Temporal train-test split done in 0.009440183639526367 seconds (0.009447813034057617)
Dropping columns done in 0.013251543045043945 seconds (0.02366471290588379)
Persisting data done in 0.04300737380981445 seconds (0.0666964054107666)
Dropping constant columns done in 42.125470876693726 seconds (42.19220018386841)
    Dropped 0 constant columns
Converting to Dask Array done in 0.21129226684570312 seconds (42.40859794616699)
Standardizing data done in 0.011327981948852539 seconds (42.41996669769287)


### Linear Regression - Dask

In [19]:
# Model: dask-ml LinearRegression trained on the Dask arrays
lr = LinearRegression()

# Time the fit call
start_time = time.time()
lr.fit(X_train, y_train)
end_time = time.time()
lr_fit_time = end_time - start_time

# Time the predict call.
# NOTE(review): the result is .compute()-d later when it is evaluated, so this
# presumably measures only building the lazy prediction - confirm.
start_time = time.time()
y_pred_lr = lr.predict(X_test)
end_time = time.time()
lr_predict_time = end_time - start_time

### XGBoost

In [20]:
# Model 2: XGBoost
# The Dask arrays are materialised with .compute() before being wrapped in
# DMatrix objects, so training below is not timed together with that step.
dtrain = xgb.DMatrix(X_train.compute(), label=y_train.compute())
dtest = xgb.DMatrix(X_test.compute(), label=y_test.compute())
params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'eval_metric': 'rmse'
}

# Time the training of 100 boosting rounds
start_time = time.time()
bst = xgb.train(params, dtrain, num_boost_round=100)
end_time = time.time()
xgb_fit_time = end_time - start_time

# Time the prediction on the test matrix
start_time = time.time()
y_pred_xgb = bst.predict(dtest)
end_time = time.time()
xgb_predict_time = end_time - start_time

### SGD Regression

In [21]:
# Number of rows passed to each partial_fit call
batch_size = 100

# Model: scikit-learn SGDRegressor wrapped in dask-ml's Incremental
sgd = SGDRegressor()
incremental_sgd = Incremental(sgd)

# Fit the model in batches: every slice is materialised with .compute()
# and fed to partial_fit, all of which is included in the fit time.
start_time = time.time()
for i in range(0, len(X_train), batch_size):
    end = i + batch_size
    X_batch = X_train[i:end].compute()
    y_batch = y_train[i:end].compute()
    incremental_sgd.partial_fit(X_batch, y_batch)
end_time = time.time()
sgd_fit_time = end_time - start_time

# Time the prediction; materialising X_test is part of the measured time
start_time = time.time()
y_pred_sgd = incremental_sgd.predict(X_test.compute())
end_time = time.time()
sgd_predict_time = end_time - start_time

In [22]:
# Append the fit/predict durations of every model to results_comparison.txt
model_timings = [
    ('Linear Regression fit', lr_fit_time),
    ('Linear Regression predict', lr_predict_time),
    ('XGBoost fit', xgb_fit_time),
    ('XGBoost predict', xgb_predict_time),
    ('SGD fit', sgd_fit_time),
    ('SGD predict', sgd_predict_time),
]
with open('results_comparison.txt', 'a') as f:
    for task_label, duration in model_timings:
        f.write(f"{task_label:<35}{duration:<10.2f}\n")

## Baseline

In [23]:
# Baseline: predict the training-set mean for every row of the test set
y_pred_mean = np.full(len(y_test), y_train.mean().compute())

In [24]:
# Evaluate models
def evaluate_model(y_test, y_pred):
    """Score predictions against the true targets.

    Returns a dict with the mean absolute error under 'MAE' and the root
    mean squared error under 'RMSE'.
    """
    mae = mean_absolute_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    return {'MAE': mae, 'RMSE': rmse}

In [25]:
# Score the constant-mean baseline on the materialised test targets
results_baseline = evaluate_model(y_test.compute(), y_pred_mean)
print('Baseline:', results_baseline)

Baseline: {'MAE': 293.3383467851435, 'RMSE': 868.1107518204351}


In [26]:
# Evaluate models with evaluate_model from the earlier cell. The identical
# second definition that used to live here was removed so there is a single
# source of truth for the metrics.

# Linear regression: the lazy prediction is materialised inside the timed region
start_time = time.time()
results_lr = evaluate_model(y_test.compute(), y_pred_lr.compute())
time_lr = time.time() - start_time

# XGBoost: predictions are already in memory
start_time = time.time()
results_xgb = evaluate_model(y_test.compute(), y_pred_xgb)
time_xgb = time.time() - start_time

# SGD regression: predictions are already in memory
start_time = time.time()
results_sgd = evaluate_model(y_test.compute(), y_pred_sgd)
time_sgd = time.time() - start_time

# Append the evaluation durations to the comparison log
with open('results_comparison.txt', 'a') as f:
    f.write(f"{'Linear regression evaluation':<35} {time_lr:<10.2f}\n")
    f.write(f"{'XGBoost evaluation':<35} {time_xgb:<10.2f}\n")
    f.write(f"{'SGD regression evaluation':<35} {time_sgd:<10.2f}\n")

In [27]:
# Print results. The timings are time.time() differences, i.e. seconds, so the
# columns are labelled [s] (they were mislabelled [ms] before).
print(f"{'Method':<20}{'MAE':<25}{'RMSE':<25}{'fit time [s]':<15}{'predict time [s]':<15}")
print(f"{'Linear Regression':<20}{results_lr['MAE']:<25.2f}{results_lr['RMSE']:<25.2f}{lr_fit_time:<15.2f}{lr_predict_time:<15.2f}")
print(f"{'XGBoost':<20}{results_xgb['MAE']:<25.2f}{results_xgb['RMSE']:<25.2f}{xgb_fit_time:<15.2f}{xgb_predict_time:<15.2f}")
print(f"{'SGDRegressor':<20}{results_sgd['MAE']:<25.2f}{results_sgd['RMSE']:<25.2f}{sgd_fit_time:<15.2f}{sgd_predict_time:<15.2f}")
print("_"*100)
# The baseline has no fit or predict step, hence the zero timings
print(f"{'Baseline':<20}{results_baseline['MAE']:<25.2f}{results_baseline['RMSE']:<25.2f}{0:<15}{0:<15}")

Method              MAE                      RMSE                     fit time [ms]  predict time [ms]
Linear Regression   332.42                   859.46                   53.20          0.00           
XGBoost             351.87                   864.64                   1.10           0.03           
SGDRegressor        337.68                   857.57                   169.48         0.17           
____________________________________________________________________________________________________
Baseline            293.34                   868.11                   0              0              


## Pipeline for different counties

In [28]:
# Glob pattern matching every weather CSV file (adjust the path as needed)
file_pattern = 'nycweather_pesjak/nyc_w_*.csv'

# Declare the dtype of each CSV column: floats for measurements, object for text
measurement_columns = [
    'temp', 'feelslike', 'dew', 'humidity', 'precip', 'precipprob',
    'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir',
    'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation',
    'solarenergy', 'uvindex', 'severerisk',
]
string_columns = ['preciptype', 'conditions', 'icon', 'stations']

dtypes = {
    'datetime': 'str',  # 'datetime' will be parsed separately
    **{name: 'float64' for name in measurement_columns},
    **{name: 'object' for name in string_columns},
}

# Read all CSV files into a single, fresh Dask DataFrame
weather_df = dd.read_csv(file_pattern, dtype=dtypes)

# Parse the timestamps; values that do not match the format become NaT
weather_df['datetime'] = dd.to_datetime(weather_df['datetime'], format='%Y-%m-%dT%H:%M:%S', errors='coerce')

# Keep 'datetime' plus the numeric columns, which discards the string columns
numeric_columns = [col for col in weather_df.columns if weather_df[col].dtype in ['float64', 'int64']]
columns_to_keep = ['datetime'] + numeric_columns
weather_df = weather_df[columns_to_keep]

In [29]:
# Base names (without the .csv extension) of the extra hourly feature tables
additional_data_paths = ["holiday_counts_by_hour", "landmark_counts_by_hour", "school_counts_by_hour", "business_counts_by_hour", "event_counts_by_hour"]

for source_name in additional_data_paths:
    # Load one feature table
    hourly_features = dd.read_csv(f"{source_name}.csv")

    # Parse its timestamps so they match the type of the weather 'datetime' column
    hourly_features['datetime'] = dd.to_datetime(hourly_features['datetime'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

    # Left-join so every weather row is kept, with or without a match
    weather_df = dd.merge(weather_df, hourly_features, on='datetime', how='left')

In [30]:
# List every column of the merged feature table next to its dtype
for column, column_dtype in weather_df.dtypes.items():
    print(f"{column:<40} {column_dtype}")

datetime                                 datetime64[ns]
temp                                     float64
feelslike                                float64
dew                                      float64
humidity                                 float64
precip                                   float64
precipprob                               float64
snow                                     float64
snowdepth                                float64
windgust                                 float64
windspeed                                float64
winddir                                  float64
sealevelpressure                         float64
cloudcover                               float64
visibility                               float64
solarradiation                           float64
solarenergy                              float64
uvindex                                  float64
severerisk                               float64
national_holiday                         int64
religious_holid

In [31]:
def add_corona_time(data):
    """Add a 0/1 'corona_time' column marking rows inside the pandemic window.

    The 'datetime' column of ``data`` is first converted to datetimes, then
    rows whose timestamp lies between 2020-01-30 00:00 and 2023-05-05 23:00
    (both inclusive) get 1 and all other rows get 0.

    ``data`` is modified through column assignment and also returned.
    """
    start_date = '2020-01-30 00:00:00'
    end_date = '2023-05-5 23:00:00'

    def _parse_datetime(df):
        # Runs on one partition: parse its 'datetime' values
        return pd.to_datetime(df['datetime'])

    def _inside_window(df):
        # Runs on one partition: 1 inside the inclusive window, 0 outside
        after_start = df['datetime'] >= start_date
        before_end = df['datetime'] <= end_date
        return (after_start & before_end).astype(int)

    # Convert 'datetime' column to datetime using map_partitions
    data['datetime'] = data.map_partitions(_parse_datetime, meta=('datetime', 'datetime64[ns]'))

    # Apply the condition to set 'corona_time' across partitions
    data['corona_time'] = data.map_partitions(_inside_window, meta=('corona_time', 'int'))

    return data

In [32]:
def lr_func(X_train, y_train, X_test, y_test):
    """Fit and score a dask-ml LinearRegression.

    Returns a dict with 'MAE', 'RMSE', 'fit_time' and 'predict_time', where
    the two times are the durations of the fit and predict calls.
    """
    model = LinearRegression()

    fit_started = time.time()
    model.fit(X_train, y_train)
    fit_duration = time.time() - fit_started

    predict_started = time.time()
    predictions = model.predict(X_test)
    predict_duration = time.time() - predict_started

    # Targets and predictions are materialised only for the scoring step
    scores = evaluate_model(y_test.compute(), predictions.compute())

    return {
        'MAE': scores['MAE'],
        'RMSE': scores['RMSE'],
        'fit_time': fit_duration,
        'predict_time': predict_duration,
    }
    
def xgb_func(X_train, y_train, X_test, y_test):
    """Fit and score an XGBoost regressor on materialised data.

    The Dask inputs are brought into local memory with .compute() and wrapped
    in DMatrix objects before the timed sections, so 'fit_time' covers only
    xgb.train and 'predict_time' only Booster.predict.

    Returns a dict with 'MAE', 'RMSE', 'fit_time' and 'predict_time'.
    """
    # Materialise the test targets once; they are needed for the DMatrix label
    # and again for scoring (previously they were computed twice).
    y_test_values = y_test.compute()

    dtrain = xgb.DMatrix(X_train.compute(), label=y_train.compute())
    dtest = xgb.DMatrix(X_test.compute(), label=y_test_values)
    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'eval_metric': 'rmse'
    }

    start_time = time.time()
    bst = xgb.train(params, dtrain, num_boost_round=100)
    end_time = time.time()
    xgb_fit_time = end_time - start_time

    start_time = time.time()
    y_pred_xgb = bst.predict(dtest)
    end_time = time.time()
    xgb_predict_time = end_time - start_time

    results_xgb = evaluate_model(y_test_values, y_pred_xgb)

    results_dict = {
        'MAE': results_xgb['MAE'],
        'RMSE': results_xgb['RMSE'],
        'fit_time': xgb_fit_time,
        'predict_time': xgb_predict_time
    }

    return results_dict

def sgd_func(X_train, y_train, X_test, y_test):
    """Fit and score an SGDRegressor wrapped in dask-ml's Incremental.

    Training walks over the training arrays in slices of 100 rows; every
    slice is materialised and passed to partial_fit, all inside the timed
    fit section. Materialising X_test is part of the timed predict section.

    Returns a dict with 'MAE', 'RMSE', 'fit_time' and 'predict_time'.
    """
    batch_size = 100
    incremental_sgd = Incremental(SGDRegressor())

    # Fit the model in batches
    fit_started = time.time()
    for batch_start in range(0, len(X_train), batch_size):
        rows = slice(batch_start, batch_start + batch_size)
        features_batch = X_train[rows].compute()
        targets_batch = y_train[rows].compute()
        incremental_sgd.partial_fit(features_batch, targets_batch)
    fit_duration = time.time() - fit_started

    predict_started = time.time()
    predictions = incremental_sgd.predict(X_test.compute())
    predict_duration = time.time() - predict_started

    scores = evaluate_model(y_test.compute(), predictions)

    return {
        'MAE': scores['MAE'],
        'RMSE': scores['RMSE'],
        'fit_time': fit_duration,
        'predict_time': predict_duration,
    }

def mean_func(X_train, y_train, X_test, y_test):
    """Score the constant baseline that always predicts the training mean.

    X_train and X_test are accepted only so the signature matches the other
    model functions; they are not used. Both reported times are 0.
    """
    training_mean = y_train.mean().compute()
    constant_prediction = np.full(len(y_test), training_mean)
    scores = evaluate_model(y_test.compute(), constant_prediction)

    return {
        'MAE': scores['MAE'],
        'RMSE': scores['RMSE'],
        'fit_time': 0,
        'predict_time': 0,
    }

In [33]:
import datetime

def print_results(results, county):
    """Print a fixed-width results table for one county.

    Parameters
    ----------
    results : dict mapping a method name to a dict with the numeric keys
        'MAE', 'RMSE', 'fit_time' and 'predict_time'.
    county : label printed above the table.

    The time columns are labelled in seconds because the values passed in are
    time.time() differences (the header previously claimed milliseconds).
    """
    print("_"*90)
    print(f"{county}")
    print(f"{'Method':<20}{'MAE':<15}{'RMSE':<15}{'fit time [s]':<15}{'predict time [s]':<15}")
    for key, value in results.items():
        print(f"{key:<20}{value['MAE']:<15.2f}{value['RMSE']:<15.2f}{value['fit_time']:<15.2f}{value['predict_time']:<15.2f}")
    print("_"*90)
    
def save_results(results, county, save_to="results.txt"):
    """Append a fixed-width results table for one county to a text file.

    Parameters
    ----------
    results : dict mapping a method name to a dict with the numeric keys
        'MAE', 'RMSE', 'fit_time' and 'predict_time'.
    county : label written above the table.
    save_to : path of the file to append to (created if missing).

    The time columns are labelled in seconds because the values passed in are
    time.time() differences (the header previously claimed milliseconds).
    """
    with open(save_to, 'a') as f:
        f.write("_"*90 + "\n")
        f.write(f"{county}\n")
        f.write(f"{'Method':<20}{'MAE':<15}{'RMSE':<15}{'fit time [s]':<15}{'predict time [s]':<15}\n")
        for key, value in results.items():
            f.write(f"{key:<20}{value['MAE']:<15.2f}{value['RMSE']:<15.2f}{value['fit_time']:<15.2f}{value['predict_time']:<15.2f}\n")
        f.write("_"*90 + "\n")
        
def format_global_time(start_time):
    """Return the time elapsed since ``start_time`` as a timedelta string.

    ``start_time`` is a time.time() timestamp; the result looks like
    'H:MM:SS.ffffff'.
    """
    return str(datetime.timedelta(seconds=time.time() - start_time))

def format_internal_time(start_time):
    """Return the time elapsed since ``start_time`` in a human-friendly unit.

    ``start_time`` is a time.time() timestamp. The result is expressed in
    milliseconds below one second, in seconds below one minute, as whole
    minutes plus seconds below one hour, and as whole hours plus minutes
    otherwise. Whole minutes and hours are printed as integers ('1 min', not
    '1.0 min').
    """
    elapsed_time = time.time() - start_time
    if elapsed_time < 1:
        return f"{round(elapsed_time * 1000, 2)} ms"
    if elapsed_time < 60:
        return f"{round(elapsed_time, 2)} s"
    if elapsed_time < 3600:
        minutes, seconds = divmod(elapsed_time, 60)
        return f"{int(minutes)} min and {round(seconds, 2)} s"
    hours, remainder = divmod(elapsed_time, 3600)
    return f"{int(hours)} h and {round(remainder / 60, 2)} min"
    

In [34]:
global_start_time = time.time()

data = dd.read_parquet('/d/hpc/projects/FRI/bigdata/students/dp8949/parquet_data/*.parquet')

# Clean the raw data once and keep the result in cluster memory; every county
# below starts from this persisted frame.
clean_data = data_cleaning(data).persist()

counties = ["Manhattan", "Queens", "Brooklyn", "Bronx", "Staten Island", "all"]
# counties = ["all"]
methods = {"Linear Regression": lr_func, "XGBoost": xgb_func, "SGDRegressor": sgd_func, "Baseline": mean_func}
# methods = {"XGBoost": xgb_func, "Baseline": mean_func}

# Calendar columns that identify one hour
hour_key_columns = ['violation_year', 'violation_month', 'violation_day_month', 'violation_day_week', 'violation_hour']

for county in counties:
    internal_start_time = time.time()
    print(f"\n({format_global_time(global_start_time)}) Processing {county}")
    # Reuse the persisted cleaned frame. Calling data_cleaning(data) again here
    # would rebuild the cleaning from the raw parquet files for every county
    # and leave the persisted copy unused.
    county_data = clean_data
    
    print(f"({format_global_time(global_start_time)})  Data cleaning done in {format_internal_time(internal_start_time)}")
    internal_start_time = time.time()
    
    # Keep only the rows of the requested county (nothing to filter for 'all');
    # the county column itself is dropped in either case.
    if county != 'all':
        county_data = county_data[county_data['violation_county'] == county]
    county_data = county_data.drop(columns=['violation_county'])

    # Reduce to an hourly level: the group size is the number of violations
    county_data = county_data.groupby(hour_key_columns).size().reset_index()
    county_data = county_data.rename(columns={0: 'violation_count'})

    # Build the hourly timestamp as text, then parse it into a datetime column
    county_data['datetime'] = county_data['violation_year'].astype(int).astype(str) + '-' + \
                              county_data['violation_month'].astype(int).astype(str).str.zfill(2) + '-' + \
                              county_data['violation_day_month'].astype(int).astype(str).str.zfill(2) + ' ' + \
                              county_data['violation_hour'].astype(int).astype(str).str.zfill(2) + ':00:00'
    county_data['datetime'] = dd.to_datetime(county_data['datetime'], format='%Y-%m-%d %H:%M:%S')
    
    print(f"({format_global_time(global_start_time)})  Data preparation done in {format_internal_time(internal_start_time)}")
    internal_start_time = time.time()
    
    # fill in the missing hours
    county_data = fill_in_hours(county_data)
    print(f"({format_global_time(global_start_time)})  Filling in hours done in {format_internal_time(internal_start_time)}")
    internal_start_time = time.time()
    
    # add weather data
    county_data = add_weather_data(county_data, weather_df)
    print(f"({format_global_time(global_start_time)})  Adding weather data done in {format_internal_time(internal_start_time)}")
    internal_start_time = time.time()
    
    # make the train-test split
    X_train, y_train, X_test, y_test = train_test_split(county_data)
    print(f"({format_global_time(global_start_time)})  Train-test split done in {format_internal_time(internal_start_time)}")
    internal_start_time = time.time()

    # persist the standardized arrays so every model reads them from memory
    X_train, y_train, X_test, y_test = client.persist([X_train, y_train, X_test, y_test])
    
    # evaluate the models
    results = {}
    for method in methods:
        results[method] = methods[method](X_train, y_train, X_test, y_test)
        print(f"({format_global_time(global_start_time)})  {method} done in {format_internal_time(internal_start_time)}")
        internal_start_time = time.time()
        
    
    # clear all per-county data before the next iteration
    del county_data
    del X_train
    del y_train
    del X_test
    del y_test
    
    print_results(results, county)
    save_results(results, county)    


(0:00:00.304759) Processing Manhattan
(0:00:00.434199)  Data cleaning done in 129.46 ms
(0:00:00.518110)  Data preparation done in 83.87 ms
Taking data from 2013-01-01 02:00:00 to 2024-12-26 06:00:00
(0:01:39.292950)  Filling in hours done in 1.0 min and 38.77 s
(0:01:39.403507)  Adding weather data done in 110.52 ms
Temporal train-test split done in 0.0073490142822265625 seconds (0.007355928421020508)
Dropping columns done in 0.036359548568725586 seconds (0.04374217987060547)
Persisting data done in 0.0928957462310791 seconds (0.13679099082946777)
Dropping constant columns done in 53.98084831237793 seconds (54.117680072784424)
    Dropped 12 constant columns
Converting to Dask Array done in 0.43853759765625 seconds (54.556354999542236)
Standardizing data done in 0.008935213088989258 seconds (54.56557250022888)
(0:02:33.971918)  Train-test split done in 54.57 s
(0:04:46.072552)  Linear Regression done in 2.0 min and 12.1 s
(0:04:50.555971)  XGBoost done in 4.48 s
(0:05:42.333490)  SGD

In [35]:
# NOTE: county_data is deleted at the end of every iteration of the per-county
# loop in the previous cell, so this inspection raises NameError when the
# notebook is run from top to bottom.
print(county_data.head())

NameError: name 'county_data' is not defined

In [None]:
# client.close()