In [51]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from pandas.plotting import scatter_matrix
from sklearn.metrics import mean_squared_error
import math

In [52]:
# Read the train/test feature and target CSV files in one pass
x_train, y_train, x_test, y_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/Xtrain.csv',
        './data/Ytrain.csv',
        './data/Xtest.csv',
        './data/Ysample.csv',
    )
)

In [53]:
# Give the 'Unnamed: 0' column of both target frames the name 'ID'
id_rename = {'Unnamed: 0': 'ID'}
y_train = y_train.rename(columns=id_rename)
y_sample = y_sample.rename(columns=id_rename)

# Preview the first rows of each target frame
for targets in (y_train, y_sample):
    print(targets.head())

   ID   p0q0
0   1  0.216
1   2  0.216
2   3  0.227
3   4  0.229
4   5  0.225
   ID   p0q0
0   1  0.288
1   2  0.788
2   3  0.409
3   4  0.883
4   5  0.940


In [54]:
# Sanity check: features and targets are joined on their index below,
# so each pair should have the same number of rows.
if len(x_train) != len(y_train):
    print('Train datasets do not have same number of rows')

if len(x_test) != len(y_sample):
    print('Test datasets do not have same number of rows')


# Attach the target to the features (index-based inner join) and drop the ID column.
# NOTE(review): the test target comes from Ysample.csv; its previewed values look
# unlike the training targets -- confirm it holds real labels and not placeholders.
df_train = x_train.join(y_train, how='inner').drop(columns='ID')
df_test = x_test.join(y_sample, how='inner').drop(columns='ID')


df_train.head()

Unnamed: 0,date,train,way,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0
0,2019-01-07,1,0,AD,06:00:00,2,,,,0.201,0.138,0.091,0.216
1,2019-01-08,1,0,AD,06:00:00,2,,,,0.204,0.152,0.106,0.216
2,2019-01-10,1,0,AD,06:00:00,2,,,,0.213,0.153,0.111,0.227
3,2019-01-11,1,0,AD,06:00:00,2,,,,0.213,0.152,0.108,0.229
4,2019-01-14,1,0,AD,06:00:00,2,,,,0.21,0.147,0.096,0.225


In [55]:
# Map the raw p<i>q<j> column names to the t<i>s<j> names used by the later cells
column_renaming = {
    'p0q0': 't0s0',
    'p1q0': 't1s0',
    'p2q0': 't2s0',
    'p3q0': 't3s0',
    'p0q1': 't0s1',
    'p0q2': 't0s2',
    'p0q3': 't0s3'
}

# Rebind the result instead of using inplace=True: the cell then produces new
# frames from its inputs rather than mutating objects shared with earlier cells.
df_train = df_train.rename(columns=column_renaming)
df_test = df_test.rename(columns=column_renaming)

# Show columns
print(df_train.columns)
print(df_test.columns)

Index(['date', 'train', 'way', 'station', 'hour', 'composition', 't1s0',
       't2s0', 't3s0', 't0s1', 't0s2', 't0s3', 't0s0'],
      dtype='object')
Index(['date', 'train', 'way', 'station', 'hour', 'composition', 't1s0',
       't2s0', 't3s0', 't0s1', 't0s2', 't0s3', 't0s0'],
      dtype='object')


In [56]:
# Preview the first rows of the training frame after the column renaming
df_train.head()

Unnamed: 0,date,train,way,station,hour,composition,t1s0,t2s0,t3s0,t0s1,t0s2,t0s3,t0s0
0,2019-01-07,1,0,AD,06:00:00,2,,,,0.201,0.138,0.091,0.216
1,2019-01-08,1,0,AD,06:00:00,2,,,,0.204,0.152,0.106,0.216
2,2019-01-10,1,0,AD,06:00:00,2,,,,0.213,0.153,0.111,0.227
3,2019-01-11,1,0,AD,06:00:00,2,,,,0.213,0.152,0.108,0.229
4,2019-01-14,1,0,AD,06:00:00,2,,,,0.21,0.147,0.096,0.225


In [57]:
# Replace missing values with 0 in the target and lag columns of both frames
cols = ['t0s0', 't1s0', 't2s0', 't3s0', 't0s1', 't0s2', 't0s3']

df_train[cols] = df_train[cols].fillna(0)
df_test[cols] = df_test[cols].fillna(0)

# Benchmark

In [58]:
from andrea_models import AndreaLinearRegression


# Benchmark: fit the model on two feature columns and show its summary
target = 't0s0'
cols = ['t1s0', 't0s1']

bench_mark_model = AndreaLinearRegression()
bench_mark_model.fit(
    df_train[cols],
    df_train[target],
    column_names=cols,
)
bench_mark_model.summary()

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.811
Model:                            OLS   Adj. R-squared:                  0.811
Method:                 Least Squares   F-statistic:                 6.673e+04
Date:                Tue, 17 Dec 2024   Prob (F-statistic):               0.00
Time:                        11:58:10   Log-Likelihood:                 39306.
No. Observations:               31119   AIC:                        -7.861e+04
Df Residuals:                   31116   BIC:                        -7.858e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
intercept      0.0532      0.001     78.756      0.0

In [59]:
# Score the benchmark model on the test frame (root mean squared error).
# NOTE(review): df_test[target] was built from Ysample.csv -- confirm those are
# real labels, otherwise this RMSE does not measure model quality.
predictions = bench_mark_model.predict(df_test[cols])

mse_2_features = mean_squared_error(df_test[target], predictions)
RMSE_2_features = math.sqrt(mse_2_features)
print('RMSE benchmark:', RMSE_2_features)

RMSE benchmark: 0.4137089011196723


# Benchmark - with the average

In [None]:
# Now let's use the average occupancy