In [1]:
import pandas as pd
import seaborn as sns
from pathlib import Path as path
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

Training Data

In [2]:
# Load features and target from their CSV files.
Xtrain = pd.read_csv(path('data/Xtrain.csv'))
# Strip any "Unnamed..." column that read_csv produced for the target file.
Ytrain = pd.read_csv(path('data/Ytrain.csv')).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]
# Join features and target side by side on the row index.
# NOTE(review): Xtrain is not filtered the same way, so any "Unnamed..." column
# it carries stays in TrainData and reaches the models as a feature - confirm.
TrainData = Xtrain.merge(Ytrain, left_index=True, right_index=True)

Data Cleaning

In [3]:
def _fill_missing_levels(df, first, second, third):
    """Impute missing values in three related columns of ``df`` (in place).

    Rules, applied in this order:
      1. all three values missing            -> all three set to 0
      2. only ``first`` present              -> ``second`` and ``third`` set to 0
      3. ``third`` still missing afterwards  -> filled from ``first``/``second``
         (see the note on the expression below)
    """
    first_na = df[first].isna()
    second_na = df[second].isna()
    third_na = df[third].isna()

    df.loc[first_na & second_na & third_na, [first, second, third]] = 0
    # Rows handled by rule 1 have ``first_na`` True, so they are not matched again here.
    df.loc[~first_na & second_na & third_na, [second, third]] = 0

    a = df[first].to_numpy(dtype=float)
    b = df[second].to_numpy(dtype=float)
    # Same arithmetic as the former row-by-row loop, evaluated element-wise.
    # Algebraically this is min(first, second); the expression is kept verbatim so
    # floating-point results do not change.
    # NOTE(review): if an extrapolation from the first two values was intended,
    # this formula does not do that - confirm the intent.
    # A NaN in ``first`` or ``second`` makes the comparison False and yields NaN,
    # so such rows stay missing.
    estimate = np.where(a > b, a - (a - b), b - (b - a))
    df[third] = np.where(df[third].isna().to_numpy(), estimate, df[third].to_numpy(dtype=float))


def cleaning(dirty_df):
    """Return a cleaned, station-sorted copy of ``dirty_df``.

    Steps:
      * drop rows whose ``hour`` is missing;
      * derive ``weekday`` and ``month`` from ``date`` (format ``%Y/%m/%d``) and
        ``intHour`` from ``hour`` (format ``%H:%M:%S``);
      * impute missing values in the ``p1q0/p2q0/p3q0`` and ``p0q1/p0q2/p0q3``
        column triplets;
      * drop the ``date``, ``hour``, ``way`` and ``composition`` columns;
      * remove rows whose ``station`` is ``'AR'`` or ``'AZ'``;
      * sort by ``station``.

    The input frame is left untouched.

    Parameters
    ----------
    dirty_df : pandas.DataFrame
        Must contain the columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame with the derived columns at positions 1-3
        (``intHour``, ``weekday``, ``month``).
    """
    # Explicit copy: the columns added/filled below must not be written into a
    # slice of the caller's frame (previously a silenced SettingWithCopyWarning).
    df = dirty_df[dirty_df['hour'].notna()].copy()

    dates = pd.to_datetime(df['date'], format='%Y/%m/%d')
    hours = pd.to_datetime(df['hour'], format='%H:%M:%S').dt.hour

    # Insertion order determines the final layout: intHour, weekday, month.
    df.insert(1, 'weekday', dates.dt.weekday)
    df.insert(2, 'month', dates.dt.month)
    df.insert(1, 'intHour', hours)

    _fill_missing_levels(df, 'p1q0', 'p2q0', 'p3q0')
    _fill_missing_levels(df, 'p0q1', 'p0q2', 'p0q3')

    df = df.drop(columns=['date', 'hour', 'way', 'composition'])
    df = df[~df['station'].isin(['AR', 'AZ'])]

    return df.sort_values('station')

In [4]:
# Rebind TrainData to its cleaned version and display it.
# NOTE: this cell is not re-runnable on its own - cleaning() reads the 'date' and
# 'hour' columns and then drops them, so re-execute the loading cell first.
TrainData = cleaning(TrainData)
TrainData

Unnamed: 0,intHour,weekday,month,train,station,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0
23231,8,4,3,48,AA,0.049000,0.05800,0.1190,0.036,0.046,0.087,0.112
21880,6,0,5,46,AA,0.045000,0.00000,0.0000,0.017,0.021,0.057,0.079
21881,6,3,3,46,AA,0.047000,0.00000,0.0000,0.025,0.030,0.069,0.088
21882,6,2,1,46,AA,0.047000,0.00000,0.0000,0.025,0.030,0.045,0.065
22493,7,0,1,47,AA,0.044000,0.08235,0.0440,0.048,0.057,0.105,0.139
...,...,...,...,...,...,...,...,...,...,...,...,...
23848,8,1,1,48,BJ,0.050000,0.10900,0.1660,0.129,0.049,0.099,0.146
23849,8,2,1,48,BJ,0.045333,0.08400,0.1755,0.124,0.042,0.094,0.143
23850,8,3,1,48,BJ,0.048000,0.16800,0.1450,0.114,0.061,0.087,0.132
23852,8,0,1,48,BJ,0.062000,0.08000,0.1520,0.134,0.052,0.102,0.153


Label Encoding Station

In [5]:
def label_enc(param):
    """Encode the values of ``param`` as integer labels.

    A fresh ``LabelEncoder`` is fitted on ``param``; the Series name and the
    learned classes are printed so the code-to-label mapping is visible.

    Parameters
    ----------
    param : pandas.Series
        Column to encode; its ``name`` attribute is used in the printout.

    Returns
    -------
    numpy.ndarray
        Integer label for every element of ``param``.
    """
    # The encoder is local to this call, so it is not available afterwards to
    # encode other data with the same mapping.
    encoder = LabelEncoder().fit(param)
    print(param.name, encoder.classes_)
    return encoder.transform(param)

In [6]:
# Overwrite the 'station' column with its integer labels (the mapping is printed
# by label_enc), then display the frame.
TrainData["station"] = label_enc(TrainData["station"])
TrainData

station ['AA' 'AB' 'AC' 'AD' 'AE' 'AF' 'AG' 'AH' 'AI' 'AJ' 'AK' 'AL' 'AM' 'AN'
 'AO' 'AP' 'AQ' 'AS' 'AT' 'AU' 'AW' 'AY' 'BA' 'BB' 'BC' 'BD' 'BE' 'BF'
 'BG' 'BH' 'BI' 'BJ']


Unnamed: 0,intHour,weekday,month,train,station,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0
23231,8,4,3,48,0,0.049000,0.05800,0.1190,0.036,0.046,0.087,0.112
21880,6,0,5,46,0,0.045000,0.00000,0.0000,0.017,0.021,0.057,0.079
21881,6,3,3,46,0,0.047000,0.00000,0.0000,0.025,0.030,0.069,0.088
21882,6,2,1,46,0,0.047000,0.00000,0.0000,0.025,0.030,0.045,0.065
22493,7,0,1,47,0,0.044000,0.08235,0.0440,0.048,0.057,0.105,0.139
...,...,...,...,...,...,...,...,...,...,...,...,...
23848,8,1,1,48,31,0.050000,0.10900,0.1660,0.129,0.049,0.099,0.146
23849,8,2,1,48,31,0.045333,0.08400,0.1755,0.124,0.042,0.094,0.143
23850,8,3,1,48,31,0.048000,0.16800,0.1450,0.114,0.061,0.087,0.132
23852,8,0,1,48,31,0.062000,0.08000,0.1520,0.134,0.052,0.102,0.153


Model P*Q* with Stations

In [7]:
def build_model(drop_columns):
    """Return a copy of the global ``TrainData`` without ``drop_columns``.

    Parameters
    ----------
    drop_columns : list of str
        Column names to remove; an empty list keeps every column.

    Returns
    -------
    pandas.DataFrame
        Independent frame, so later changes do not affect ``TrainData``.
    """
    return TrainData.copy().drop(columns=drop_columns)

In [8]:
def train_model(model):
    """Fit a linear regression on ``model`` and print hold-out metrics.

    The ``p0q0`` column is the target; every other column is a feature. The rows
    are split 80/20 with ``random_state=42``, the regression is fitted on the
    training part, and R2 and mean squared error on the test part are printed.

    Parameters
    ----------
    model : pandas.DataFrame
        Frame containing the ``p0q0`` column plus the feature columns.

    Returns
    -------
    None
    """
    features = model.drop(columns=['p0q0'])
    target = model['p0q0']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    regressor = LinearRegression()
    regressor.fit(x_train, y_train)
    y_prediction = regressor.predict(x_test)

    score = r2_score(y_test, y_prediction)
    mse = mean_squared_error(y_test, y_prediction)

    print(f'r2 score is:            {round(score,2)}    ->  %{round(score*100,2)}')
    # MSE is in squared target units, not a proportion, so it is no longer shown
    # as a percentage; two decimals rounded every model to 0.0, hence the
    # higher precision.
    print(f'Mean Squared Error is:  {mse:.6f}')

In [22]:
# kernel must be a string of the following: linear, rbf, poly, sigmoid
def train_model_svm(model, kernel):
    """Fit a support vector regressor on ``model`` and print hold-out metrics.

    The ``p0q0`` column is the target; every other column is a feature. The rows
    are split 80/20 with ``random_state=42``, an ``SVR`` with the requested
    kernel is fitted on the training part, and R2 and mean squared error on the
    test part are printed.

    Parameters
    ----------
    model : pandas.DataFrame
        Frame containing the ``p0q0`` column plus the feature columns.
    kernel : str
        Passed straight to ``SVR(kernel=...)``.

    Returns
    -------
    None
    """
    features = model.drop(columns=['p0q0'])
    target = model['p0q0']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    # NOTE(review): the features are passed to SVR unscaled. SVMs are sensitive
    # to feature scale, so standardising the inputs is worth evaluating.
    regressor = SVR(kernel=kernel)
    regressor.fit(x_train, y_train)
    y_prediction = regressor.predict(x_test)

    score = r2_score(y_test, y_prediction)
    mse = mean_squared_error(y_test, y_prediction)

    print(f'r2 score is:            {round(score,2)}    ->  %{round(score*100,2)}')
    # MSE is in squared target units, not a proportion, so it is no longer shown
    # as a percentage; two decimals rounded most models to 0.0, hence the
    # higher precision.
    print(f'Mean Squared Error is:  {mse:.6f}')

# Linear Regression (LR)

In [10]:
# Linear regression with the 'train' column removed from the data.
droppedColumns = ['train']
model = build_model(droppedColumns)
train_model(model)

r2 socre is:            0.91    ->  %91.3
Mean Squard Error is:     0.0    ->  %0.2


In [11]:
# Linear regression with both 'train' and 'station' removed.
droppedColumns = ['train','station']
model = build_model(droppedColumns)
train_model(model)

r2 socre is:            0.91    ->  %91.13
Mean Squard Error is:     0.0    ->  %0.2


In [12]:
# Linear regression with no columns removed (all of TrainData is used).
droppedColumns = []
model = build_model(droppedColumns)
train_model(model)

r2 socre is:            0.92    ->  %91.5
Mean Squard Error is:     0.0    ->  %0.19


In [20]:
# Linear regression with 'train', 'station' and the derived time columns
# ('month', 'weekday', 'intHour') removed.
droppedColumns = ['train', 'station', 'month', 'weekday', 'intHour']
model = build_model(droppedColumns)
train_model(model)

r2 socre is:            0.91    ->  %91.0
Mean Squard Error is:     0.0    ->  %0.2


# Support Vector Regression (SVM)

In [17]:
# Linear-kernel SVR with the 'train' column removed from the data.
droppedColumns = ['train']
model = build_model(droppedColumns)
train_model_svm(model, "linear")

r2 socre is:            0.89    ->  %89.34
Mean Squard Error is:     0.0    ->  %0.24


In [19]:
# Linear-kernel SVR with both 'train' and 'station' removed.
droppedColumns = ['train', 'station']
model = build_model(droppedColumns)
train_model_svm(model, "linear")

r2 socre is:            0.89    ->  %88.57
Mean Squard Error is:     0.0    ->  %0.26


In [25]:
# Linear-kernel SVR with no columns removed (all of TrainData is used).
droppedColumns = []
model = build_model(droppedColumns)
train_model_svm(model, "linear")

r2 socre is:            0.9    ->  %90.12
Mean Squard Error is:     0.0    ->  %0.22


In [24]:
# Linear-kernel SVR with 'train', 'station' and the derived time columns
# ('month', 'weekday', 'intHour') removed.
droppedColumns = ['train', 'station', 'month', 'weekday', 'intHour']
model = build_model(droppedColumns)
train_model_svm(model, "linear")

r2 socre is:            0.89    ->  %88.61
Mean Squard Error is:     0.0    ->  %0.26


In [26]:
# RBF-kernel SVR with no columns removed.
droppedColumns = []
model = build_model(droppedColumns)
train_model_svm(model, "rbf")

r2 socre is:            0.86    ->  %85.59
Mean Squard Error is:     0.0    ->  %0.33


In [27]:
# Polynomial-kernel SVR with no columns removed.
droppedColumns = []
model = build_model(droppedColumns)
train_model_svm(model, "poly")

r2 socre is:            0.72    ->  %71.76
Mean Squard Error is:     0.01    ->  %0.64


In [28]:
# Sigmoid-kernel SVR with no columns removed.
# NOTE(review): train_model_svm feeds unscaled features to SVR; the very poor
# score recorded for this kernel may be a consequence of that - confirm by
# standardising the inputs.
droppedColumns = []
model = build_model(droppedColumns)
train_model_svm(model, "sigmoid")

r2 socre is:            -7915342.54    ->  %-791534254.14
Mean Squard Error is:     179224.32    ->  %17922431.66
