In [1]:
import pandas as pd
import seaborn as sns
from pathlib import Path as path
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings('ignore')

Training Data

In [2]:
Xtrain = pd.read_csv(path('data/Xtrain.csv'))
Ytrain = pd.read_csv(path('data/Ytrain.csv'))
Ytrain = Ytrain.loc[:, ~Ytrain.columns.str.contains('^Unnamed')]
TrainData = Xtrain
TrainData = TrainData.merge(Ytrain, left_index= True, right_index= True)

Data Cleaning

In [3]:
def cleaning(dirty_df):
    # Removing rows with 0 hour
    dirty_df = dirty_df[dirty_df['hour'].isna() == False]
    
    # Add month, day, weekday, and intHour
    dirty_df['date'] = pd.to_datetime(dirty_df['date'], format='%Y/%m/%d')
    weekday = dirty_df['date'].dt.weekday
    dirty_df.insert(1,'weekday',weekday)
    month = dirty_df['date'].dt.month
    dirty_df.insert(2,'month',month)
    hour = pd.to_datetime(dirty_df['hour'], format='%H:%M:%S').dt.hour
    dirty_df.insert(1,'intHour',hour)
    
    # Add lines
    line = []
    for i in dirty_df["train"]:
        if i <= 11:
            line.append(0)
        elif i <= 21:
            line.append(1)
        elif i <= 27:
            line.append(2)
        elif i == 28:
            line.append(3)
        elif i <= 33:
            line.append(4)
        elif i <= 44:
            line.append(5)
        elif i <= 50:
            line.append(6)
        else:
            line.append(7)

    dirty_df.insert(1,'line',line)
    
    # P*q0
    dirty_df.loc[(dirty_df['p1q0'].isna()) & (dirty_df['p2q0'].isna()) & (dirty_df['p3q0'].isna()), ['p1q0','p2q0','p3q0']] = 0
    dirty_df.loc[(dirty_df['p1q0'].isna() == False) & (dirty_df['p2q0'].isna()) & (dirty_df['p3q0'].isna()), ['p2q0','p3q0']] = 0
    for i, row in dirty_df.iterrows():
        if math.isnan(row['p3q0']):
            if row['p1q0'] > row['p2q0']:
                diff = row['p1q0'] - (row['p1q0'] - row['p2q0'])
            else:
                diff = row['p2q0'] - (row['p2q0'] - row['p1q0'])
            dirty_df.at[i,'p3q0'] = diff
    # p0Q*
    dirty_df.loc[(dirty_df['p0q1'].isna()) & (dirty_df['p0q2'].isna()) & (dirty_df['p0q3'].isna()), ['p0q1','p0q2','p0q3']] = 0
    dirty_df.loc[(dirty_df['p0q1'].isna() == False) & (dirty_df['p0q2'].isna()) & (dirty_df['p0q3'].isna()), ['p0q2','p0q3']] = 0
    for i, row in dirty_df.iterrows():
        if math.isnan(row['p0q3']):
            if row['p0q1'] > row['p0q2']:
                diff = row['p0q1'] - (row['p0q1'] - row['p0q2'])
            else:
                diff = row['p0q2'] - (row['p0q2'] - row['p0q1'])
            dirty_df.at[i,'p0q3'] = diff
            
    # Drop values
    dirty_df = dirty_df.drop('date', axis=1)
    dirty_df = dirty_df.drop('hour', axis=1)
    dirty_df = dirty_df.drop('way', axis=1)
    dirty_df = dirty_df.drop('composition', axis=1)
    dirty_df = dirty_df[dirty_df['station'] != 'AR']
    dirty_df = dirty_df[dirty_df['station'] != 'AZ']

    # Sort by station
    dirty_df = dirty_df.sort_values('station')

    return dirty_df

In [4]:
TrainData = cleaning(TrainData)
TrainData

Unnamed: 0,line,intHour,weekday,month,train,station,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0
23231,6,8,4,3,48,AA,0.049000,0.05800,0.1190,0.036,0.046,0.087,0.112
21880,6,6,0,5,46,AA,0.045000,0.00000,0.0000,0.017,0.021,0.057,0.079
21881,6,6,3,3,46,AA,0.047000,0.00000,0.0000,0.025,0.030,0.069,0.088
21882,6,6,2,1,46,AA,0.047000,0.00000,0.0000,0.025,0.030,0.045,0.065
22493,6,7,0,1,47,AA,0.044000,0.08235,0.0440,0.048,0.057,0.105,0.139
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23848,6,8,1,1,48,BJ,0.050000,0.10900,0.1660,0.129,0.049,0.099,0.146
23849,6,8,2,1,48,BJ,0.045333,0.08400,0.1755,0.124,0.042,0.094,0.143
23850,6,8,3,1,48,BJ,0.048000,0.16800,0.1450,0.114,0.061,0.087,0.132
23852,6,8,0,1,48,BJ,0.062000,0.08000,0.1520,0.134,0.052,0.102,0.153


Label Encoding Station

In [5]:
def label_enc(param):
    LabelE = LabelEncoder()
    LabelE.fit(param)
    print(param.name,LabelE.classes_)
    return LabelE.transform(param)

In [6]:
TrainData["station"] = label_enc(TrainData["station"])
TrainData

station ['AA' 'AB' 'AC' 'AD' 'AE' 'AF' 'AG' 'AH' 'AI' 'AJ' 'AK' 'AL' 'AM' 'AN'
 'AO' 'AP' 'AQ' 'AS' 'AT' 'AU' 'AW' 'AY' 'BA' 'BB' 'BC' 'BD' 'BE' 'BF'
 'BG' 'BH' 'BI' 'BJ']


Unnamed: 0,line,intHour,weekday,month,train,station,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0
23231,6,8,4,3,48,0,0.049000,0.05800,0.1190,0.036,0.046,0.087,0.112
21880,6,6,0,5,46,0,0.045000,0.00000,0.0000,0.017,0.021,0.057,0.079
21881,6,6,3,3,46,0,0.047000,0.00000,0.0000,0.025,0.030,0.069,0.088
21882,6,6,2,1,46,0,0.047000,0.00000,0.0000,0.025,0.030,0.045,0.065
22493,6,7,0,1,47,0,0.044000,0.08235,0.0440,0.048,0.057,0.105,0.139
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23848,6,8,1,1,48,31,0.050000,0.10900,0.1660,0.129,0.049,0.099,0.146
23849,6,8,2,1,48,31,0.045333,0.08400,0.1755,0.124,0.042,0.094,0.143
23850,6,8,3,1,48,31,0.048000,0.16800,0.1450,0.114,0.061,0.087,0.132
23852,6,8,0,1,48,31,0.062000,0.08000,0.1520,0.134,0.052,0.102,0.153


Model P*Q* with Stations

In [7]:
def build_model(drop_columns):
    model = TrainData.copy()
    model = model.drop(columns=drop_columns, axis= 1)
    return model

In [8]:
def train_lr(model, export):
    Xtrain = model.drop(columns=['p0q0'], axis=1)
    Ytrain = model['p0q0']

    x_train, x_test, y_train, y_test = train_test_split(Xtrain, Ytrain, test_size = 0.2, random_state = 42)
    LR = LinearRegression()
    LR.fit(x_train,y_train)
    y_prediction =  LR.predict(x_test)
    
    if (export):
        exportToCSV(y_prediction)
    
    score = r2_score(y_test,y_prediction)
    mse = mean_squared_error(y_test, y_prediction)

    print(f'r2 socre is:            {round(score,2)}    ->  %{round(score*100,2)}')
    print(f'Mean Squard Error is:     {round(mse,2)}    ->  %{round(mse*100,2)}')

In [9]:
def train_dnn(model, epochs, export):
    Xtrain = model.drop(columns=['p0q0'], axis=1)
    Ytrain = model['p0q0']
    x_train, x_test, y_train, y_test = train_test_split(Xtrain, Ytrain, test_size = 0.2, random_state = 42)
    
    tf.keras.backend.clear_session()
    tf.random.set_seed(42)
    model=keras.models.Sequential([
        keras.layers.Dense(512, input_dim = x_train.shape[1], activation='relu'),  
        keras.layers.Dense(512, input_dim = x_train.shape[1], activation='relu'),  
        keras.layers.Dense(units=256,activation='relu'),  
        keras.layers.Dense(units=256,activation='relu'),    
        keras.layers.Dense(units=128,activation='relu'),
        keras.layers.Dense(units=1, activation="linear"),
    ],name="Initial_model",)
    
    optimizer = keras.optimizers.Adam()
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    
    checkpoint_name = 'Weights\Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
    checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
    callbacks_list = [checkpoint]

    history = model.fit(x_train, y_train,
                    epochs=epochs, batch_size=1024,
                    validation_data=(x_test, y_test), 
                    callbacks=callbacks_list, 
                    verbose=1)
    
    y_prediction = model.predict(x_test)
    
    if (export):
        exportToCSV(y_prediction)
    
    score = r2_score(y_test,y_prediction)
    mse = mean_squared_error(y_test, y_prediction)

    print(f'r2 socre is:            {round(score,2)}    ->  %{round(score*100,2)}')
    print(f'Mean Squard Error is:     {round(mse,2)}    ->  %{round(mse*100,2)}')

In [10]:
def exportToCSV(data):
    data = pd.DataFrame(data).copy()
    data.rename(columns = {0:'p0q0'}, inplace = True)
    data.index = np.arange(1, len(data) + 1)
    data.to_csv("y_prediction.csv")

# LR

In [11]:
droppedColumns = []
model = build_model(droppedColumns)
train_lr(model, False)

r2 socre is:            0.92    ->  %91.72
Mean Squard Error is:     0.0    ->  %0.19


# DNN

In [12]:
droppedColumns = []
model = build_model(droppedColumns)
train_dnn(model, 250, True)

Epoch 1/250
Epoch 1: val_loss improved from inf to 0.02072, saving model to Weights\Weights-001--0.02072.hdf5
Epoch 2/250
Epoch 2: val_loss improved from 0.02072 to 0.01686, saving model to Weights\Weights-002--0.01686.hdf5
Epoch 3/250
Epoch 3: val_loss improved from 0.01686 to 0.01250, saving model to Weights\Weights-003--0.01250.hdf5
Epoch 4/250
Epoch 4: val_loss improved from 0.01250 to 0.00936, saving model to Weights\Weights-004--0.00936.hdf5
Epoch 5/250
Epoch 5: val_loss improved from 0.00936 to 0.00602, saving model to Weights\Weights-005--0.00602.hdf5
Epoch 6/250
Epoch 6: val_loss improved from 0.00602 to 0.00342, saving model to Weights\Weights-006--0.00342.hdf5
Epoch 7/250
Epoch 7: val_loss improved from 0.00342 to 0.00257, saving model to Weights\Weights-007--0.00257.hdf5
Epoch 8/250
Epoch 8: val_loss improved from 0.00257 to 0.00212, saving model to Weights\Weights-008--0.00212.hdf5
Epoch 9/250
Epoch 9: val_loss improved from 0.00212 to 0.00194, saving model to Weights\Weig

Epoch 35: val_loss did not improve from 0.00110
Epoch 36/250
Epoch 36: val_loss did not improve from 0.00110
Epoch 37/250
Epoch 37: val_loss did not improve from 0.00110
Epoch 38/250
Epoch 38: val_loss improved from 0.00110 to 0.00108, saving model to Weights\Weights-038--0.00108.hdf5
Epoch 39/250
Epoch 39: val_loss did not improve from 0.00108
Epoch 40/250
Epoch 40: val_loss did not improve from 0.00108
Epoch 41/250
Epoch 41: val_loss improved from 0.00108 to 0.00105, saving model to Weights\Weights-041--0.00105.hdf5
Epoch 42/250
Epoch 42: val_loss did not improve from 0.00105
Epoch 43/250
Epoch 43: val_loss did not improve from 0.00105
Epoch 44/250
Epoch 44: val_loss improved from 0.00105 to 0.00101, saving model to Weights\Weights-044--0.00101.hdf5
Epoch 45/250
Epoch 45: val_loss did not improve from 0.00101
Epoch 46/250
Epoch 46: val_loss did not improve from 0.00101
Epoch 47/250
Epoch 47: val_loss did not improve from 0.00101
Epoch 48/250
Epoch 48: val_loss did not improve from 0.

Epoch 72/250
Epoch 72: val_loss improved from 0.00092 to 0.00090, saving model to Weights\Weights-072--0.00090.hdf5
Epoch 73/250
Epoch 73: val_loss did not improve from 0.00090
Epoch 74/250
Epoch 74: val_loss did not improve from 0.00090
Epoch 75/250
Epoch 75: val_loss did not improve from 0.00090
Epoch 76/250
Epoch 76: val_loss did not improve from 0.00090
Epoch 77/250
Epoch 77: val_loss did not improve from 0.00090
Epoch 78/250
Epoch 78: val_loss improved from 0.00090 to 0.00088, saving model to Weights\Weights-078--0.00088.hdf5
Epoch 79/250
Epoch 79: val_loss did not improve from 0.00088
Epoch 80/250
Epoch 80: val_loss did not improve from 0.00088
Epoch 81/250
Epoch 81: val_loss did not improve from 0.00088
Epoch 82/250
Epoch 82: val_loss did not improve from 0.00088
Epoch 83/250
Epoch 83: val_loss did not improve from 0.00088
Epoch 84/250
Epoch 84: val_loss did not improve from 0.00088
Epoch 85/250
Epoch 85: val_loss did not improve from 0.00088
Epoch 86/250
Epoch 86: val_loss did 

Epoch 108/250
Epoch 108: val_loss did not improve from 0.00081
Epoch 109/250
Epoch 109: val_loss did not improve from 0.00081
Epoch 110/250
Epoch 110: val_loss did not improve from 0.00081
Epoch 111/250
Epoch 111: val_loss did not improve from 0.00081
Epoch 112/250
Epoch 112: val_loss did not improve from 0.00081
Epoch 113/250
Epoch 113: val_loss did not improve from 0.00081
Epoch 114/250
Epoch 114: val_loss did not improve from 0.00081
Epoch 115/250
Epoch 115: val_loss did not improve from 0.00081
Epoch 116/250
Epoch 116: val_loss improved from 0.00081 to 0.00081, saving model to Weights\Weights-116--0.00081.hdf5
Epoch 117/250
Epoch 117: val_loss did not improve from 0.00081
Epoch 118/250
Epoch 118: val_loss did not improve from 0.00081
Epoch 119/250
Epoch 119: val_loss improved from 0.00081 to 0.00080, saving model to Weights\Weights-119--0.00080.hdf5
Epoch 120/250
Epoch 120: val_loss improved from 0.00080 to 0.00078, saving model to Weights\Weights-120--0.00078.hdf5
Epoch 121/250
Ep

Epoch 144/250
Epoch 144: val_loss improved from 0.00076 to 0.00074, saving model to Weights\Weights-144--0.00074.hdf5
Epoch 145/250
Epoch 145: val_loss did not improve from 0.00074
Epoch 146/250
Epoch 146: val_loss improved from 0.00074 to 0.00074, saving model to Weights\Weights-146--0.00074.hdf5
Epoch 147/250
Epoch 147: val_loss did not improve from 0.00074
Epoch 148/250
Epoch 148: val_loss did not improve from 0.00074
Epoch 149/250
Epoch 149: val_loss improved from 0.00074 to 0.00073, saving model to Weights\Weights-149--0.00073.hdf5
Epoch 150/250
Epoch 150: val_loss did not improve from 0.00073
Epoch 151/250
Epoch 151: val_loss did not improve from 0.00073
Epoch 152/250
Epoch 152: val_loss improved from 0.00073 to 0.00072, saving model to Weights\Weights-152--0.00072.hdf5
Epoch 153/250
Epoch 153: val_loss did not improve from 0.00072
Epoch 154/250
Epoch 154: val_loss did not improve from 0.00072
Epoch 155/250
Epoch 155: val_loss did not improve from 0.00072
Epoch 156/250
Epoch 156:

Epoch 179/250
Epoch 179: val_loss did not improve from 0.00068
Epoch 180/250
Epoch 180: val_loss did not improve from 0.00068
Epoch 181/250
Epoch 181: val_loss did not improve from 0.00068
Epoch 182/250
Epoch 182: val_loss did not improve from 0.00068
Epoch 183/250
Epoch 183: val_loss did not improve from 0.00068
Epoch 184/250
Epoch 184: val_loss did not improve from 0.00068
Epoch 185/250
Epoch 185: val_loss did not improve from 0.00068
Epoch 186/250
Epoch 186: val_loss improved from 0.00068 to 0.00065, saving model to Weights\Weights-186--0.00065.hdf5
Epoch 187/250
Epoch 187: val_loss did not improve from 0.00065
Epoch 188/250
Epoch 188: val_loss did not improve from 0.00065
Epoch 189/250
Epoch 189: val_loss did not improve from 0.00065
Epoch 190/250
Epoch 190: val_loss did not improve from 0.00065
Epoch 191/250
Epoch 191: val_loss did not improve from 0.00065
Epoch 192/250
Epoch 192: val_loss did not improve from 0.00065
Epoch 193/250
Epoch 193: val_loss did not improve from 0.00065


Epoch 214/250
Epoch 214: val_loss did not improve from 0.00062
Epoch 215/250
Epoch 215: val_loss did not improve from 0.00062
Epoch 216/250
Epoch 216: val_loss did not improve from 0.00062
Epoch 217/250
Epoch 217: val_loss did not improve from 0.00062
Epoch 218/250
Epoch 218: val_loss did not improve from 0.00062
Epoch 219/250
Epoch 219: val_loss did not improve from 0.00062
Epoch 220/250
Epoch 220: val_loss did not improve from 0.00062
Epoch 221/250
Epoch 221: val_loss did not improve from 0.00062
Epoch 222/250
Epoch 222: val_loss did not improve from 0.00062
Epoch 223/250
Epoch 223: val_loss did not improve from 0.00062
Epoch 224/250
Epoch 224: val_loss improved from 0.00062 to 0.00061, saving model to Weights\Weights-224--0.00061.hdf5
Epoch 225/250
Epoch 225: val_loss did not improve from 0.00061
Epoch 226/250
Epoch 226: val_loss did not improve from 0.00061
Epoch 227/250
Epoch 227: val_loss did not improve from 0.00061
Epoch 228/250
Epoch 228: val_loss did not improve from 0.00061


Epoch 249/250
Epoch 249: val_loss did not improve from 0.00056
Epoch 250/250
Epoch 250: val_loss did not improve from 0.00056
r2 socre is:            0.97    ->  %97.09
Mean Squard Error is:     0.0    ->  %0.07
