In [2]:
!pip install dask

Collecting dask
  Downloading dask-2024.6.0-py3-none-any.whl.metadata (3.8 kB)
Collecting click>=8.1 (from dask)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting cloudpickle>=1.5.0 (from dask)
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting fsspec>=2021.09.0 (from dask)
  Downloading fsspec-2024.6.0-py3-none-any.whl.metadata (11 kB)
Collecting partd>=1.2.0 (from dask)
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting pyyaml>=5.3.1 (from dask)
  Downloading PyYAML-6.0.1-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting toolz>=0.10.0 (from dask)
  Downloading toolz-0.12.1-py3-none-any.whl.metadata (5.1 kB)
Collecting importlib-metadata>=4.13.0 (from dask)
  Using cached importlib_metadata-7.1.0-py3-none-any.whl.metadata (4.7 kB)
Collecting zipp>=0.5 (from importlib-metadata>=4.13.0->dask)
  Downloading zipp-3.19.2-py3-none-any.whl.metadata (3.6 kB)
Collecting locket (from partd>=1.2.0->dask)
  Downloading loc

In [3]:
!pip install dask-ml

Collecting dask-ml
  Downloading dask_ml-2024.4.4-py3-none-any.whl.metadata (5.9 kB)
Collecting dask-glm>=0.2.0 (from dask-ml)
  Downloading dask_glm-0.3.2-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting distributed>=2.4.0 (from dask-ml)
  Downloading distributed-2024.6.0-py3-none-any.whl.metadata (3.4 kB)
Collecting multipledispatch>=0.4.9 (from dask-ml)
  Downloading multipledispatch-1.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting numba>=0.51.0 (from dask-ml)
  Downloading numba-0.60.0-cp311-cp311-win_amd64.whl.metadata (2.8 kB)
Collecting numpy>=1.20.0 (from dask-ml)
  Downloading numpy-2.0.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.9 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.9 kB ? eta -:--:--
     ---------------------------------------- 60.9/60.9 kB 1.1 MB/s eta 0:00:00
Collecting pandas>=0.24.2 (from dask-ml)
  Using cached pandas-2.2.2-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting 

In [9]:
!pip install xgboost    

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/99.8 MB 1.2 MB/s eta 0:01:24
   ---------------------------------------- 0.3/99.8 MB 2.5 MB/s eta 0:00:41
   ---------------------------------------- 0.6/99.8 MB 3.5 MB/s eta 0:00:29
   ---------------------------------------- 0.8/99.8 MB 4.1 MB/s eta 0:00:25
   ---------------------------------------- 1.2/99.8 MB 4.9 MB/s eta 0:00:21
    --------------------------------------- 1.6/99.8 MB 5.5 MB/s eta 0:00:18
    --------------------------------------- 2.1/99.8 MB 6.2 MB/s eta 0:00:16
   - -------------------------------------- 2.5/99.8 MB 6.7 MB/s eta 0:00:15
   - ----------------------

## DASK_ML

In [1]:
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split
from dask_ml.linear_model import LinearRegression
from dask_ml.wrappers import Incremental
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
import xgboost as xgb
from dask.distributed import Client
import pandas as pd
import joblib
import numpy as np
import time

In [2]:
# Initialize Dask client
client = Client(n_workers=8, threads_per_worker=1, memory_limit='4GB')

In [3]:
# Load multiple Parquet files
data = dd.read_parquet('parquet/*.parquet')

# print all columns and their data types
print(data.dtypes)

Summons Number                                 int64
Plate ID                             string[pyarrow]
Registration State                   string[pyarrow]
Plate Type                           string[pyarrow]
Issue Date                           string[pyarrow]
Violation Code                                 int64
Vehicle Body Type                    string[pyarrow]
Vehicle Make                         string[pyarrow]
Issuing Agency                       string[pyarrow]
Street Code1                                   int64
Street Code2                                   int64
Street Code3                                   int64
Vehicle Expiration Date              string[pyarrow]
Violation Location                           float64
Violation Precinct                             int64
Issuer Precinct                                int64
Issuer Code                                    int64
Issuer Command                       string[pyarrow]
Issuer Squad                         string[py

### Data Cleaning

In [4]:
# keep only the columns we need
columns_to_keep = ['Summons Number',
    'Registration State',
    'Plate Type',
    'Issue Date',
    'Violation Time',
    'Violation Precinct', 
    'Issuer Precinct',
    'Issuer Code',
    'Violation Code',
    'Meter Number',
    'Violation County',
    'Issuer Squad',
    'Issuing Agency']

data = data[columns_to_keep]

# fix the county names
remap_county_dict = {
    'K' : 'Brooklyn',
    'Q' : 'Queens',
    'NY': 'Manhattan',
    'QN': 'Queens',
    'BK': 'Brooklyn',
    'R' : 'Staten Island',
    'BX': 'Bronx',
    'ST': 'Staten Island',
    'MN': 'Manhattan',
    'KINGS': 'Brooklyn',
    'QNS': 'Queens',
    'BRONX': 'Bronx'
}
data['Violation County'] = data['Violation County'].map(remap_county_dict).astype('category')

# convert the Issue Date to a datetime object
data['Issue Date'] = dd.to_datetime(data['Issue Date'], format='%m/%d/%Y', errors='coerce')

# Remove 'A' and 'P' from the end of the time, add ' AM' or ' PM' accordingly
data['Violation Time'] = data['Violation Time'].str.slice(stop=-1) + ' ' + data['Violation Time'].str.slice(start=-1).replace({'A': 'AM', 'P': 'PM'})

# Convert the Violation Time to a datetime object
data['Violation Time'] = dd.to_datetime(data['Violation Time'], format='%I%M %p', errors='coerce')

# create a new column for the hour of the day the violation was issued
data['Violation Hour'] = data['Violation Time'].dt.hour

# create a new column for the day of the week the violation was issued
data['Violation Day'] = data['Issue Date'].dt.dayofweek

# create a new column for the month the violation was issued
data['Violation Month'] = data['Issue Date'].dt.month

# create a new column for the year the violation was issued
data['Violation Year'] = data['Issue Date'].dt.year

# drop the Issue Date and Violation Time columns
data = data.drop(columns=['Violation Time'])

# keep only rows with valid year (2013-2024)
data = data[(data['Violation Year'] >= 2013) & (data['Violation Year'] <= 2024)]

# keep only rows with valid month (1-12)
data = data[(data['Violation Month'] >= 1) & (data['Violation Month'] <= 12)]

# drop all rows with missing values
data = data.dropna()

# convert the specified columns to categorical
data = data.categorize(columns=['Registration State', 'Plate Type', 'Violation County', 'Issuer Squad', 'Issuing Agency'])

# encode the categorical columns
data = dd.get_dummies(data, columns=['Registration State', 'Plate Type', 'Violation County', 'Issuer Squad', 'Issuing Agency'])

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('Violation County', 'object'))



In [5]:
# print the first 5 rows of the dataframe
print(data.head())

      Summons Number Issue Date  Violation Precinct  Issuer Precinct  \
1547      1360169945 2013-01-07                  76               76   
1624      1357040465 2013-01-01                 110              110   
1625      1360945556 2013-01-01                  25               25   
1626      1361634200 2013-01-01                  81               81   
1627      1359577245 2013-01-01                  78               78   

      Issuer Code  Violation Code Meter Number  Violation Hour  Violation Day  \
1547       949041              19            -             4.0              0   
1624       941244              46            -            10.0              1   
1625       918163              78            -             3.0              1   
1626       945628              46            -            13.0              1   
1627       928683              46            -            11.0              1   

      Violation Month  ...  Issuing Agency_K  Issuing Agency_M  \
1547          

In [6]:
# print the last 40 values in date column
print(data['Issue Date'].tail(40))

302358   2014-12-11
302373   2014-12-20
302374   2014-12-20
302377   2015-04-14
302382   2014-08-25
302383   2014-08-25
302392   2015-03-18
302396   2015-06-02
302401   2014-11-15
302405   2014-07-12
302420   2015-03-19
302426   2014-06-23
302429   2015-04-07
302436   2015-05-29
302441   2014-12-18
302448   2015-04-29
302450   2014-07-29
302451   2015-04-25
302453   2014-10-24
302454   2014-08-04
302461   2014-11-12
302478   2014-11-17
302486   2015-03-04
302499   2014-10-06
302502   2015-05-23
302503   2014-10-13
302505   2015-02-21
302507   2015-05-08
302513   2015-04-06
302515   2015-04-06
302518   2015-03-31
302529   2014-09-08
302530   2014-09-08
302544   2015-01-02
302545   2015-01-02
302561   2015-03-03
302566   2015-03-30
302579   2015-01-18
302580   2015-01-18
302596   2014-10-15
Name: Issue Date, dtype: datetime64[ns]


## Augment Data

In [7]:
# read the weather data from the CSV file (nycweather_pesjak/nyc_w_2013.csv, nycweather_pesjak/nyc_w_2014.csv, ...)
weather_data = dd.read_csv('nycweather_pesjak/nyc_w_*.csv', 
                           dtype={'cloudcover': 'float64',
                                  'precip': 'float64',
                                  'preciptype': 'object',
                                  'snow': 'float64',
                                  'visibility': 'float64',
                                  'winddir': 'float64',
                                  'uvindex': 'float64'},
                           parse_dates = ['datetime']) 

# print the first 5 rows of the weather data
print(weather_data.head())

                          name            datetime  temp  feelslike  dew  \
0  New York, NY, United States 2013-01-01 00:00:00   3.6        0.1 -5.0   
1  New York, NY, United States 2013-01-01 01:00:00   4.2        0.3 -4.2   
2  New York, NY, United States 2013-01-01 02:00:00   3.4        0.8 -3.2   
3  New York, NY, United States 2013-01-01 03:00:00   4.2        0.7 -3.0   
4  New York, NY, United States 2013-01-01 04:00:00   4.2        0.4 -3.0   

   humidity  precip  precipprob preciptype  snow  ...  sealevelpressure  \
0     53.30     0.0           0        NaN   0.0  ...            1012.3   
1     54.48     0.0           0        NaN   0.0  ...            1011.4   
2     62.18     0.0           0        NaN   0.0  ...            1011.8   
3     59.38     0.0           0        NaN   0.0  ...            1011.6   
4     59.38     0.0           0        NaN   0.0  ...            1011.9   

   cloudcover  visibility  solarradiation  solarenergy  uvindex  severerisk  \
0       100.0

In [8]:
# produce a list of all the columns in the weather data, their data types and the number of unique values
print(weather_data.dtypes)
print(weather_data.nunique().compute())

name                string[pyarrow]
datetime             datetime64[ns]
temp                        float64
feelslike                   float64
dew                         float64
humidity                    float64
precip                      float64
precipprob                    int64
preciptype          string[pyarrow]
snow                        float64
snowdepth                   float64
windgust                    float64
windspeed                   float64
winddir                     float64
sealevelpressure            float64
cloudcover                  float64
visibility                  float64
solarradiation              float64
solarenergy                 float64
uvindex                     float64
severerisk                  float64
conditions          string[pyarrow]
icon                string[pyarrow]
stations            string[pyarrow]
dtype: object
name                    1
datetime            99324
temp                  519
feelslike             684
dew               

Task exception was never retrieved
future: <Task finished name='Task-971939' coro=<Client._gather.<locals>.wait() done, defined at c:\Users\Blaz\Desktop\Faks\Magisterij\4-Semester\Big-Data_Course\.venv\Lib\site-packages\distributed\client.py:2199> exception=AllExit()>
Traceback (most recent call last):
  File "c:\Users\Blaz\Desktop\Faks\Magisterij\4-Semester\Big-Data_Course\.venv\Lib\site-packages\distributed\client.py", line 2208, in wait
    raise AllExit()
distributed.client.AllExit


In [29]:
# Define the file path pattern to read all CSV files (adjust the path as needed)
file_pattern = 'nycweather_pesjak/nyc_w_*.csv'

# Specify the data types for each column
dtypes = {
    'datetime': 'str',  # 'datetime' will be parsed separately
    'temp': 'float64',
    'feelslike': 'float64',
    'dew': 'float64',
    'humidity': 'float64',
    'precip': 'float64',
    'precipprob': 'float64',
    'preciptype': 'object',  # String data type
    'snow': 'float64',
    'snowdepth': 'float64',
    'windgust': 'float64',
    'windspeed': 'float64',
    'winddir': 'float64',
    'sealevelpressure': 'float64',
    'cloudcover': 'float64',
    'visibility': 'float64',
    'solarradiation': 'float64',
    'solarenergy': 'float64',
    'uvindex': 'float64',
    'severerisk': 'float64',
    'conditions': 'object',  # String data type
    'icon': 'object',  # String data type
    'stations': 'object'  # String data type
}

# Read all CSV files into a single Dask DataFrame with specified dtypes
df = dd.read_csv(file_pattern, dtype=dtypes)

# Convert the 'datetime' column to datetime type
df['datetime'] = dd.to_datetime(df['datetime'])

# Drop all string columns except 'datetime'
columns_to_keep = ['datetime'] + [col for col in df.columns if df[col].dtype in ['float64', 'int64']]
df = df[columns_to_keep]

# Extract date from datetime and create a new column 'date'
df['date'] = df['datetime'].dt.date

# Drop the original 'datetime' column as it's no longer needed
df = df.drop(columns=['datetime'])

# Group by 'date' and calculate the mean for each day (but keep the 'date' column as a regular colummn for joining later)
daily_df = df.groupby('date').mean().reset_index()

# Compute the result
daily_df = daily_df.compute()

# Display the first few rows of the resulting DataFrame
print(daily_df.head())


         date      temp  feelslike        dew   humidity  precip  precipprob  \
0  2013-01-01  2.662500  -1.170833  -5.437500  55.457083     0.0         0.0   
1  2013-01-02 -2.458333  -6.925000 -11.570833  49.837917     0.0         0.0   
2  2013-01-03 -1.741667  -5.237500  -9.729167  54.984583     0.0         0.0   
3  2013-01-04  1.329167  -2.891667  -7.129167  53.453333     0.0         0.0   
4  2013-01-05  2.720833  -0.416667  -7.233333  48.238750     0.0         0.0   

   snow  snowdepth   windgust  windspeed  winddir  sealevelpressure  \
0   0.0   0.002083  36.968182  15.791667  285.875       1012.291667   
1   0.0   0.000000  32.300000  13.879167  313.625       1017.754167   
2   0.0   0.000000  29.337500  11.070833  294.575       1020.379167   
3   0.0   0.000000  37.258333  16.491667  262.625       1016.441667   
4   0.0   0.000000  33.900000  12.358333  285.250       1022.087500   

   cloudcover  visibility  solarradiation  solarenergy   uvindex  severerisk  
0   73.183333

In [32]:
# print types of the columns
print(daily_df.dtypes)

date                datetime64[ns]
temp                       float64
feelslike                  float64
dew                        float64
humidity                   float64
precip                     float64
precipprob                 float64
snow                       float64
snowdepth                  float64
windgust                   float64
windspeed                  float64
winddir                    float64
sealevelpressure           float64
cloudcover                 float64
visibility                 float64
solarradiation             float64
solarenergy                float64
uvindex                    float64
severerisk                 float64
dtype: object


In [31]:
# change the data type of the 'date' column to datetime64[ns]
daily_df['date'] = dd.to_datetime(daily_df['date'])

## Join Data

In [10]:
# Aggregate data to daily level
daily_data = data.groupby('Issue Date').agg({
    'Summons Number': 'count',  # This will be the target variable (ticket_count)    
    'Violation Hour': 'mean',
    'Violation Day': 'first',
    'Violation Month': 'first',
}).reset_index().rename(columns={'Summons Number': 'ticket_count'})

del data  # Delete the original data to free up memory

In [33]:
# print types of the columns
print(daily_data.dtypes)

Issue Date         datetime64[ns]
ticket_count                int64
Violation Hour            float64
Violation Day               int32
Violation Month             int32
dtype: object


In [34]:
# Merge the daily data with the weather data
daily_data = daily_data.merge(daily_df, left_on='Issue Date', right_on='date', how='left')

print(daily_data.head())

  Issue Date  ticket_count  Violation Hour  Violation Day  Violation Month  \
0 2013-01-07            98       10.857143              0                1   
1 2013-01-01            59       11.254237              1                1   
2 2013-01-02            80        9.912500              2                1   
3 2013-01-03            38       15.342105              3                1   
4 2013-01-04           102       12.431373              4                1   

        date      temp  feelslike        dew   humidity  ...   windgust  \
0 2013-01-07  4.837500   2.612500  -4.941667  50.187083  ...  32.400000   
1 2013-01-01  2.662500  -1.170833  -5.437500  55.457083  ...  36.968182   
2 2013-01-02 -2.458333  -6.925000 -11.570833  49.837917  ...  32.300000   
3 2013-01-03 -1.741667  -5.237500  -9.729167  54.984583  ...  29.337500   
4 2013-01-04  1.329167  -2.891667  -7.129167  53.453333  ...  37.258333   

   windspeed     winddir  sealevelpressure  cloudcover  visibility  \
0  10.5291

In [44]:
# count the number of missing values in each column
print(daily_data.isnull().sum().compute())

Issue Date          0
ticket_count        0
Violation Hour      0
Violation Day       0
Violation Month     0
temp                0
feelslike           0
dew                 0
humidity            0
precip              0
precipprob          0
snow                0
snowdepth           0
windspeed           0
winddir             0
sealevelpressure    0
cloudcover          0
visibility          0
solarradiation      0
solarenergy         0
uvindex             0
dtype: int64


In [42]:
# drop the 'date' column
daily_data = daily_data.drop(columns=['date'])
# drop sewerisk and windgust column
daily_data = daily_data.drop(columns=['severerisk', 'windgust'])

# drop all rows with missing values
daily_data = daily_data.dropna()


In [66]:
print(daily_data.head())

  Issue Date  ticket_count  Violation Hour  Violation Day  Violation Month  \
0 2013-01-07            98       10.857143              0                1   
1 2013-01-01            59       11.254237              1                1   
2 2013-01-02            80        9.912500              2                1   
3 2013-01-03            38       15.342105              3                1   
4 2013-01-04           102       12.431373              4                1   

       temp  feelslike        dew   humidity  precip  ...  snow  snowdepth  \
0  4.837500   2.612500  -4.941667  50.187083     0.0  ...   0.0   0.000000   
1  2.662500  -1.170833  -5.437500  55.457083     0.0  ...   0.0   0.002083   
2 -2.458333  -6.925000 -11.570833  49.837917     0.0  ...   0.0   0.000000   
3 -1.741667  -5.237500  -9.729167  54.984583     0.0  ...   0.0   0.000000   
4  1.329167  -2.891667  -7.129167  53.453333     0.0  ...   0.0   0.000000   

   windspeed     winddir  sealevelpressure  cloudcover  visibi

In [45]:
# Temporal train-test split
split_date = '10/8/2013'
train_data = daily_data[daily_data['Issue Date'] < split_date]
test_data = daily_data[daily_data['Issue Date'] >= split_date]

X_train = train_data.drop(['ticket_count'], axis=1)
y_train = train_data['ticket_count']
X_test = test_data.drop(['ticket_count'], axis=1)
y_test = test_data['ticket_count']

# drop Issue Date column
X_train = X_train.drop(['Issue Date'], axis=1)
X_test = X_test.drop(['Issue Date'], axis=1)

# Identify constant columns in the training set
constant_columns = X_train.columns[X_train.nunique() <= 1]

# Drop constant columns from the training and test sets
X_train = X_train.drop(columns=constant_columns)
X_test = X_test.drop(columns=constant_columns)

# Convert Dask DataFrame to Dask Array
X_train_array = X_train.to_dask_array(lengths=True)
y_train_array = y_train.to_dask_array(lengths=True)
X_test_array = X_test.to_dask_array(lengths=True)
y_test_array = y_test.to_dask_array(lengths=True)


In [57]:
# standardize the train data and apply the same transformation to the test data
mean = X_train_array.mean(axis=0)
std = X_train_array.std(axis=0)
X_train_array = (X_train_array - mean) / std
X_test_array = (X_test_array - mean) / std

In [69]:
print(mean.compute())

[1.17967294e+01 2.99283154e+00 5.14336918e+00 1.43821185e+01
 1.33333794e+01 6.34897862e+00 6.10427365e+01 1.38123357e-01
 8.40800478e+00 2.04301075e-03 1.13051530e-01 1.12351657e+01
 1.99422231e+02 1.01654140e+03 4.75149025e+01 1.43139486e+01
 2.07193663e+02 7.45319724e-01 2.06247727e+00]


### Linear Regression - Dask

In [58]:
lr = LinearRegression()

start_time = time.time()
lr.fit(X_train_array, y_train_array)
end_time = time.time()
lr_fit_time = end_time - start_time

start_time = time.time()
y_pred_lr = lr.predict(X_test_array)
end_time = time.time()
lr_predict_time = end_time - start_time

### XGBoost

In [59]:
# Model 2: XGBoost
dtrain = xgb.DMatrix(X_train.compute(), label=y_train.compute())
dtest = xgb.DMatrix(X_test.compute(), label=y_test.compute())
params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'eval_metric': 'rmse'
}

start_time = time.time()
bst = xgb.train(params, dtrain, num_boost_round=100)
end_time = time.time()
xgb_fit_time = end_time - start_time

start_time = time.time()
y_pred_xgb = bst.predict(dtest)
end_time = time.time()
xgb_predict_time = end_time - start_time

### SGD Regression

In [60]:
# Batch size for training
batch_size = 100

# Model 1: SGDRegressor
sgd = SGDRegressor()
incremental_sgd = Incremental(sgd)

# Fit the model in batches
start_time = time.time()
for i in range(0, len(X_train_array), batch_size):
    end = i + batch_size
    X_batch = X_train_array[i:end].compute()
    y_batch = y_train_array[i:end].compute()
    incremental_sgd.partial_fit(X_batch, y_batch)
end_time = time.time()
sgd_fit_time = end_time - start_time

start_time = time.time()
y_pred_sgd = incremental_sgd.predict(X_test_array.compute())
end_time = time.time()
sgd_predict_time = end_time - start_time

## Baseline

In [61]:
# Preditc the mean for all days
y_pred_mean = np.full(len(y_test_array), y_train.mean().compute())



In [62]:
# Evaluate models
def evaluate_model(y_test, y_pred):
    return {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': root_mean_squared_error(y_test, y_pred)
    }


In [63]:
results_baseline = evaluate_model(y_test_array.compute(), y_pred_mean)
print('Baseline:', results_baseline)

Baseline: {'MAE': np.float64(2219.500372086441), 'RMSE': np.float64(2601.547033443421)}


In [64]:
# Evaluate models
def evaluate_model(y_test, y_pred):
    return {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': root_mean_squared_error(y_test, y_pred)
    }

results_lr = evaluate_model(y_test_array.compute(), y_pred_lr.compute())
results_xgb = evaluate_model(y_test_array.compute(), y_pred_xgb)
results_sgd = evaluate_model(y_test_array.compute(), y_pred_sgd)

In [65]:
# Print results
print(f"{'Method':<20}{'MAE':<25}{'RMSE':<25}{'fit time [ms]':<15}{'predict time [ms]':<15}")
print(f"{f'Linear Regression':<20}{results_lr['MAE']:<25.2f}{results_lr['RMSE']:<25.2f}{lr_fit_time:<15.2f}{lr_predict_time:<15.2f}")
print(f"{f'XGBoost':<20}{results_xgb['MAE']:<25.2f}{results_xgb['RMSE']:<25.2f}{xgb_fit_time:<15.2f}{xgb_predict_time:<15.2f}")
print(f"{f'SGDRegressor':<20}{results_sgd['MAE']:<25.2f}{results_sgd['RMSE']:<25.2f}{sgd_fit_time:<15.2f}{sgd_predict_time:<15.2f}")
print("_"*100)
print(f"{'Baseline':<20}{results_baseline['MAE']:<25.2f}{results_baseline['RMSE']:<25.2f}{0:<15}{0:<15}")

Method              MAE                      RMSE                     fit time [ms]  predict time [ms]
Linear Regression   4125.62                  5353.26                  120.17         0.00           
XGBoost             2899.72                  3915.73                  0.10           0.00           
SGDRegressor        3467.84                  4521.35                  90.60          14.87          
____________________________________________________________________________________________________
Baseline            2219.50                  2601.55                  0              0              
