In [100]:
import pandas as pd
import seaborn as sns
from pathlib import Path as path
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

Training Data

In [101]:
# Load the feature table as-is.
Xtrain = pd.read_csv(path('data/Xtrain.csv'))

# Load the target table, keeping only the columns whose name does not start
# with "Unnamed".
Ytrain = pd.read_csv(path('data/Ytrain.csv')).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]

# Pair features and targets on their index labels (merge defaults to an inner join).
TrainData = Xtrain.merge(Ytrain, left_index=True, right_index=True)

Data Cleaning

In [102]:
def _fill_missing_triplet(df, first, second, third):
    """Fill the gaps in one group of three related columns, modifying ``df`` in place.

    Rules, applied in this order:
      1. all three values are missing      -> all three become 0
      2. only ``second`` and ``third`` are missing -> both become 0
      3. ``third`` is still missing        -> it becomes the smaller of
         ``first`` and ``second``; if either of those is missing as well,
         ``third`` stays NaN.
    """
    first_missing = df[first].isna()
    tail_missing = df[second].isna() & df[third].isna()

    # The two masks are disjoint, so computing both up front is equivalent to
    # applying the assignments one after the other.
    df.loc[first_missing & tail_missing, [first, second, third]] = 0
    df.loc[~first_missing & tail_missing, [second, third]] = 0

    third_missing = df[third].isna()
    if third_missing.any():
        # np.minimum propagates NaN, so a row that lacks `first` or `second`
        # keeps a NaN in `third` instead of silently borrowing the other value.
        smaller = np.minimum(df[first].to_numpy(), df[second].to_numpy())
        # Positional selection keeps this correct even with duplicate index labels.
        df.loc[third_missing, third] = smaller[third_missing.to_numpy()]


def cleaning(dirty_df):
    """Return a cleaned copy of ``dirty_df``, sorted by ``station``.

    Steps:
      * drop the rows whose ``hour`` is missing;
      * fill the gaps in the ``p1q0``/``p2q0``/``p3q0`` group and in the
        ``p0q1``/``p0q2``/``p0q3`` group (see ``_fill_missing_triplet``);
      * sort the rows by ``station``.

    Parameters
    ----------
    dirty_df : pandas.DataFrame
        Must contain the columns ``hour``, ``station``, ``p1q0``, ``p2q0``,
        ``p3q0``, ``p0q1``, ``p0q2`` and ``p0q3``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``dirty_df`` itself is left untouched.
    """
    # Work on an explicit copy: assigning through .loc on a filtered frame
    # triggers SettingWithCopyWarning on pandas versions without copy-on-write.
    cleaned = dirty_df[dirty_df['hour'].notna()].copy()

    _fill_missing_triplet(cleaned, 'p1q0', 'p2q0', 'p3q0')
    _fill_missing_triplet(cleaned, 'p0q1', 'p0q2', 'p0q3')

    return cleaned.sort_values('station')

In [103]:
# Run the cleaning step on the merged training frame, then display the result.
TrainData = TrainData.pipe(cleaning)
TrainData

Unnamed: 0,date,train,way,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0
23213,2019-02-01,48,0,AA,07:00:00,2,0.04400,0.074,0.137000,0.035,0.0390,0.076,0.098
23961,2019-05-16,49,0,AA,08:00:00,2,0.08100,0.123,0.118253,0.056,0.0630,0.107,0.130
23962,2019-05-20,49,0,AA,08:00:00,2,0.07900,0.124,0.111000,0.024,0.0270,0.052,0.065
23963,2019-03-21,49,0,AA,08:00:00,2,0.08800,0.119,0.119000,0.026,0.0290,0.062,0.076
23257,2019-03-21,48,0,AA,07:00:00,2,0.04700,0.088,0.119000,0.045,0.0500,0.088,0.119
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23172,2019-03-18,47,0,BJ,07:00:00,2,0.05400,0.082,0.054000,0.142,0.0550,0.113,0.161
23171,2019-03-15,47,0,BJ,07:00:00,2,0.05300,0.064,0.053000,0.119,0.0470,0.087,0.134
23170,2019-03-13,47,0,BJ,07:00:00,2,0.05296,0.087,0.052960,0.125,0.0470,0.093,0.144
23168,2019-02-22,47,0,BJ,07:00:00,2,0.05500,0.076,0.055000,0.133,0.0612,0.106,0.144


Label Encoding Station

In [104]:
def label_enc(param):
    """Encode the values of ``param`` as integer labels.

    A fresh ``LabelEncoder`` is fitted on ``param``; the name of ``param`` and
    the classes the encoder learned are printed, and the encoded values are
    returned. The encoder itself is not kept.
    """
    encoder = LabelEncoder().fit(param)
    print(param.name,encoder.classes_)
    encoded = encoder.transform(param)
    return encoded

In [105]:
# Replace the station codes with their integer labels (label_enc also prints
# the classes it found), then display the result.
TrainData["station"] = TrainData["station"].pipe(label_enc)
TrainData

station ['AA' 'AB' 'AC' 'AD' 'AE' 'AF' 'AG' 'AH' 'AI' 'AJ' 'AK' 'AL' 'AM' 'AN'
 'AO' 'AP' 'AQ' 'AS' 'AT' 'AU' 'AW' 'AY' 'AZ' 'BA' 'BB' 'BC' 'BD' 'BE'
 'BF' 'BG' 'BH' 'BI' 'BJ']


Unnamed: 0,date,train,way,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0
23213,2019-02-01,48,0,0,07:00:00,2,0.04400,0.074,0.137000,0.035,0.0390,0.076,0.098
23961,2019-05-16,49,0,0,08:00:00,2,0.08100,0.123,0.118253,0.056,0.0630,0.107,0.130
23962,2019-05-20,49,0,0,08:00:00,2,0.07900,0.124,0.111000,0.024,0.0270,0.052,0.065
23963,2019-03-21,49,0,0,08:00:00,2,0.08800,0.119,0.119000,0.026,0.0290,0.062,0.076
23257,2019-03-21,48,0,0,07:00:00,2,0.04700,0.088,0.119000,0.045,0.0500,0.088,0.119
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23172,2019-03-18,47,0,32,07:00:00,2,0.05400,0.082,0.054000,0.142,0.0550,0.113,0.161
23171,2019-03-15,47,0,32,07:00:00,2,0.05300,0.064,0.053000,0.119,0.0470,0.087,0.134
23170,2019-03-13,47,0,32,07:00:00,2,0.05296,0.087,0.052960,0.125,0.0470,0.093,0.144
23168,2019-02-22,47,0,32,07:00:00,2,0.05500,0.076,0.055000,0.133,0.0612,0.106,0.144


Model P*Q* with Stations

In [106]:
def build_model(drop_columns, data=None):
    """Return an independent copy of a frame with ``drop_columns`` removed.

    Parameters
    ----------
    drop_columns : list of str
        Column labels to remove. A label that is not present raises ``KeyError``.
    data : pandas.DataFrame, optional
        Frame to start from. When omitted, the notebook-level ``TrainData`` is
        used, which keeps existing ``build_model(columns)`` calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame; modifying it never affects the source frame.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook global.
        data = TrainData
    # Drop first, copy second: the copy guarantees independence from the
    # source frame and only has to duplicate the columns that are kept.
    return data.drop(columns=drop_columns).copy()

In [107]:
def train_model(model, test_size=0.2, random_state=42):
    """Fit a linear regression predicting ``p0q0`` and print hold-out metrics.

    Parameters
    ----------
    model : pandas.DataFrame
        Frame containing the target column ``p0q0``; every other column is
        used as a feature.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the share of rows held out.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    None
        The R2 score and the mean squared error on the held-out rows are
        printed, not returned.
    """
    # Local names deliberately differ from the notebook-level Xtrain / Ytrain
    # so the function does not shadow them.
    features = model.drop(columns=['p0q0'])
    target = model['p0q0']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    regressor = LinearRegression()
    regressor.fit(x_train, y_train)
    y_prediction = regressor.predict(x_test)

    score = r2_score(y_test, y_prediction)
    mse = mean_squared_error(y_test, y_prediction)

    print(f'R2 score is:            {score:.2f}    ->  {score * 100:.2f}%')
    # MSE is in squared target units, so it is not shown as a percentage;
    # enough decimals are printed that small errors do not collapse to 0.0,
    # and the RMSE is added because it is in the same units as the target.
    print(f'Mean Squared Error is:  {mse:.6f}    (RMSE: {mse ** 0.5:.4f})')

In [108]:
# Keep the station label and the p*q* columns as features; drop everything else.
droppedColumns = ['date', 'train', 'hour', 'way', 'composition']
model = build_model(drop_columns=droppedColumns)
train_model(model=model)

r2 socre is:            0.91    ->  %90.93
Mean Squard Error is:     0.0    ->  %0.21


In [109]:
# Same run as the previous cell, but with the station label dropped as well.
droppedColumns = ['date', 'train', 'hour', 'way', 'composition', 'station']
model = build_model(drop_columns=droppedColumns)
train_model(model=model)

r2 socre is:            0.91    ->  %90.76
Mean Squard Error is:     0.0    ->  %0.22
