# Import Libraries

In [1]:
import pandas as pd
import seaborn as sns
from pathlib import Path as path
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings('ignore')

# Import Data

In [2]:
# Read the three CSV tables from the local data/ directory.
Xtrain = pd.read_csv(path('data/Xtrain.csv'))
Xtest = pd.read_csv(path('data/Xtest.csv'))
Ytrain = pd.read_csv(path('data/Ytrain.csv'))

# Discard every Ytrain column whose name starts with "Unnamed".
Ytrain = Ytrain.drop(columns=Ytrain.columns[Ytrain.columns.str.contains('^Unnamed')])

# Attach the targets to the features by joining the two frames on their row index.
TrainData = Xtrain.merge(Ytrain, left_index=True, right_index=True)

# Data Cleaning

In [3]:
def _impute_measure_triplet(df, first, second, third):
    """Fill gaps in three related measurement columns of ``df`` in place.

    The steps run in this order:

    1. Rows where all three columns are missing get each column's mean.
    2. Rows where only ``first`` is present get the column mean for
       ``second`` and ``third`` (means are re-read after step 1).
    3. Any row still missing ``third`` gets a value derived from ``first``
       and ``second``. If either of those is missing the result is NaN, so
       the gap stays.

    Rows where ``third`` is present but ``first`` or ``second`` is missing
    are not touched.

    Means are taken per measurement column rather than over the whole frame,
    so non-numeric columns elsewhere in ``df`` cannot break the computation.
    """
    none_known = df[first].isna() & df[second].isna() & df[third].isna()
    for col in (first, second, third):
        df.loc[none_known, col] = df[col].mean()

    only_first_known = df[first].notna() & df[second].isna() & df[third].isna()
    for col in (second, third):
        df.loc[only_first_known, col] = df[col].mean()

    a = df[first].to_numpy()
    b = df[second].to_numpy()
    # Algebraically this is the smaller of the two values; the arithmetic form
    # is kept so results match the previous row-by-row implementation exactly.
    derived = np.where(a > b, a - (a - b), b - (b - a))
    df[third] = np.where(df[third].isna(), derived, df[third])


def cleaning(dirty_df):
    """Return a cleaned copy of a raw feature frame.

    The input frame is left untouched. The returned frame:

    * gains ``weekday`` and ``month`` columns parsed from ``date``
      (expected in ``%Y/%m/%d`` format),
    * gains a ``line`` column (0-7) derived from thresholds on ``train``,
    * has missing values in ``p1q0``/``p2q0``/``p3q0`` and
      ``p0q1``/``p0q2``/``p0q3`` imputed (see ``_impute_measure_triplet``),
    * loses the ``date``, ``hour``, ``way``, ``composition`` and ``station``
      columns.

    Parameters
    ----------
    dirty_df : pandas.DataFrame
        Raw frame containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame with the added, imputed and dropped columns.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = dirty_df.copy()

    # Add weekday and month. Positions 1 and 2 keep the original column order
    # once `line` is inserted at position 1 below.
    df['date'] = pd.to_datetime(df['date'], format='%Y/%m/%d')
    df.insert(1, 'weekday', df['date'].dt.weekday)
    df.insert(2, 'month', df['date'].dt.month)

    # Add line: first matching condition wins; anything unmatched maps to 7.
    train_no = df['train'].to_numpy()
    line_conditions = [
        train_no <= 11,
        train_no <= 21,
        train_no <= 27,
        train_no == 28,
        train_no <= 33,
        train_no <= 44,
        train_no <= 50,
    ]
    line = np.select(line_conditions, list(range(7)), default=7).astype('int64')
    df.insert(1, 'line', line)

    # P*q0
    _impute_measure_triplet(df, 'p1q0', 'p2q0', 'p3q0')
    # p0Q*
    _impute_measure_triplet(df, 'p0q1', 'p0q2', 'p0q3')

    # Drop columns that are not used as features.
    return df.drop(columns=['date', 'hour', 'way', 'composition', 'station'])

In [4]:
# Clean the merged training frame and display the result.
# Not re-runnable on its own: cleaning() reads the 'date' column and returns a
# frame without it, so a second pass over the already-cleaned TrainData fails.
TrainData = cleaning(TrainData)
TrainData

Unnamed: 0,line,weekday,month,train,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0
0,0,0,1,1,0.234769,0.251392,0.316867,0.201,0.138,0.091,0.216
1,0,1,1,1,0.234769,0.251392,0.316867,0.204,0.152,0.106,0.216
2,0,3,1,1,0.234769,0.251392,0.316867,0.213,0.153,0.111,0.227
3,0,4,1,1,0.234769,0.251392,0.316867,0.213,0.152,0.108,0.229
4,0,0,1,1,0.234769,0.251392,0.316867,0.210,0.147,0.096,0.225
...,...,...,...,...,...,...,...,...,...,...,...
31114,0,0,5,9,0.152000,0.188600,0.157000,0.080,0.100,0.080,0.111
31115,0,1,5,9,0.153000,0.180400,0.191000,0.089,0.121,0.089,0.143
31116,0,2,5,9,0.166000,0.149000,0.168000,0.099,0.129,0.099,0.139
31117,0,3,3,9,0.182000,0.193000,0.162000,0.074,0.101,0.074,0.117


In [5]:
# Apply the same cleaning to the test features and display the result.
# Not re-runnable on its own: cleaning() reads the 'date' column and returns a
# frame without it, so a second pass over the already-cleaned Xtest fails.
Xtest = cleaning(Xtest)
Xtest

Unnamed: 0,line,weekday,month,train,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3
0,0,1,5,1,0.212763,0.225824,0.281484,0.226,0.165,0.128
1,0,2,5,1,0.212763,0.225824,0.281484,0.221,0.159,0.114
2,0,0,5,1,0.212763,0.225824,0.281484,0.227,0.175,0.133
3,0,2,5,1,0.212763,0.225824,0.281484,0.200,0.148,0.105
4,0,0,6,1,0.212763,0.225824,0.281484,0.221,0.169,0.126
...,...,...,...,...,...,...,...,...,...,...
13747,0,2,7,9,0.121000,0.145000,0.157150,0.071,0.095,0.071
13748,0,3,7,9,0.144000,0.139000,0.153000,0.079,0.102,0.079
13749,0,4,7,9,0.128000,0.137000,0.127000,0.072,0.089,0.072
13750,0,1,5,9,0.164000,0.187000,0.155000,0.100,0.130,0.100


# Models

In [6]:
def build_model(drop_columns, data=None):
    """Return a new frame holding the training data without ``drop_columns``.

    Parameters
    ----------
    drop_columns : list
        Column labels to remove.
    data : pandas.DataFrame, optional
        Frame to take the columns from. Defaults to the notebook-level
        ``TrainData`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame; the source frame is not modified.
    """
    source = TrainData if data is None else data
    return source.drop(columns=drop_columns)

In [7]:
def dropXTest(drop_columns, data=None):
    """Return a new frame holding the test features without ``drop_columns``.

    Parameters
    ----------
    drop_columns : list
        Column labels to remove.
    data : pandas.DataFrame, optional
        Frame to take the columns from. Defaults to the notebook-level
        ``Xtest`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame; the source frame is not modified.
    """
    source = Xtest if data is None else data
    return source.drop(columns=drop_columns)

In [8]:
def train_lr(model, export):
    """Fit a linear regression on ``model`` and print hold-out metrics.

    Parameters
    ----------
    model : pandas.DataFrame
        Training frame; the ``p0q0`` column is the target and every other
        column is used as a feature.
    export : bool
        When true, predictions for the notebook-level ``Xtest`` frame are
        written through ``exportToCSV``.

    The frame is split 80/20 with a fixed random state; r2 and mean squared
    error on the 20% split are printed. Nothing is returned.
    """
    features = model.drop(columns=['p0q0'])
    target = model['p0q0']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    regressor = LinearRegression()
    regressor.fit(x_train, y_train)

    if export:
        # Pick the prediction features by the columns the regressor was fitted
        # on, so they always match `model` (same set, same order) instead of
        # depending on a separately maintained global list of dropped columns.
        submission_features = Xtest[list(features.columns)]
        exportToCSV(regressor.predict(submission_features))

    y_prediction = regressor.predict(x_test)

    score = r2_score(y_test, y_prediction)
    mse = mean_squared_error(y_test, y_prediction)

    print(f'r2 socre is:            {round(score,2)}    ->  %{round(score*100,2)}')
    print(f'Mean Squard Error is:     {round(mse,2)}    ->  %{round(mse*100,2)}')

In [9]:
def train_dnn(model, epochs, export):
    """Train a dense neural network on ``model`` and print hold-out metrics.

    Parameters
    ----------
    model : pandas.DataFrame
        Training frame; the ``p0q0`` column is the target and every other
        column is used as a feature.
    epochs : int
        Number of training epochs.
    export : bool
        When true, predictions for the notebook-level ``Xtest`` frame are
        written through ``exportToCSV``.

    The frame is split 80/20 with a fixed random state. The weights with the
    lowest validation loss are checkpointed to ``Weights/checkpoint.hdf5`` and
    reloaded before predicting. r2 and mean squared error on the 20% split are
    printed. Nothing is returned.

    NOTE(review): the same 20% split drives checkpoint selection (val_loss)
    and the printed metrics, so those metrics are not from data unseen during
    model selection.
    """
    features = model.drop(columns=['p0q0'])
    target = model['p0q0']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    tf.keras.backend.clear_session()
    tf.random.set_seed(42)
    # Named `network` so the `model` argument (the data frame) is not shadowed.
    network = keras.models.Sequential([
        keras.layers.Dense(512, input_dim=x_train.shape[1], activation='relu'),
        keras.layers.Dense(512, activation='relu'),
        keras.layers.Dense(units=256, activation='relu'),
        keras.layers.Dense(units=256, activation='relu'),
        keras.layers.Dense(units=128, activation='relu'),
        keras.layers.Dense(units=1, activation="linear"),
    ], name="Initial_model")

    network.compile(optimizer=keras.optimizers.Adam(), loss='mean_squared_error')

    # Build the checkpoint path with pathlib: the previous literal relied on a
    # backslash (an invalid "\c" escape, and a directory separator on Windows
    # only). The directory is created up front so a fresh checkout works.
    checkpoint_path = path('Weights') / 'checkpoint.hdf5'
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        str(checkpoint_path),
        monitor='val_loss',
        verbose=1,
        save_weights_only=True,
        save_best_only=True,
        mode='auto',
    )

    network.fit(
        x_train, y_train,
        epochs=epochs, batch_size=1024,
        validation_data=(x_test, y_test),
        callbacks=[checkpoint],
        verbose=1,
    )

    # Restore the best weights seen during training before predicting.
    network.load_weights(str(checkpoint_path))

    if export:
        # Pick the prediction features by the columns the network was trained
        # on, so they always match `model` (same set, same order) instead of
        # depending on a separately maintained global list of dropped columns.
        submission_features = Xtest[list(features.columns)]
        exportToCSV(network.predict(submission_features))

    y_prediction = network.predict(x_test)

    score = r2_score(y_test, y_prediction)
    mse = mean_squared_error(y_test, y_prediction)

    print(f'r2 socre is:            {round(score,2)}    ->  %{round(score*100,2)}')
    print(f'Mean Squard Error is:     {round(mse,2)}    ->  %{round(mse*100,2)}')

In [10]:
def exportToCSV(data, filename="y_prediction.csv"):
    """Write predictions to a CSV file with a 1-based index.

    Parameters
    ----------
    data : array-like
        Predictions; converted with ``pd.DataFrame(data)``. A column labelled
        ``0`` is renamed to ``p0q0``.
    filename : str or path-like, optional
        Destination file. Defaults to ``"y_prediction.csv"`` in the current
        working directory, which is overwritten if it exists.

    Values are written with three decimal places.
    """
    frame = pd.DataFrame(data).rename(columns={0: 'p0q0'})
    # Rows are numbered from 1 rather than pandas' default 0.
    frame.index = np.arange(1, len(frame) + 1)
    frame.to_csv(filename, float_format='%.3f')

# Linear Regression

In [16]:
# Columns excluded from the feature set; build_model drops them from the training frame.
droppedColumns = ['line', 'weekday', 'month', 'train']
model = build_model(droppedColumns)
# export=True: besides printing the hold-out metrics, predictions for Xtest are
# written through exportToCSV. The DNN cell exports to the same y_prediction.csv,
# so whichever of the two cells runs last determines the file's contents.
train_lr(model, True)

r2 socre is:            0.75    ->  %75.13
Mean Squard Error is:     0.01    ->  %0.62


# Deep Neural Network

In [15]:
# Columns excluded from the feature set; build_model drops them from the training frame.
droppedColumns = ['line', 'weekday', 'month', 'train']
model = build_model(droppedColumns)
# Train for 250 epochs. export=True: besides printing the hold-out metrics,
# predictions for Xtest are written through exportToCSV. The linear-regression
# cell exports to the same y_prediction.csv, so whichever cell runs last wins.
train_dnn(model, 250, True)

Epoch 1/250
Epoch 1: val_loss improved from inf to 0.00657, saving model to Weights\checkpoint.hdf5
Epoch 2/250
Epoch 2: val_loss improved from 0.00657 to 0.00451, saving model to Weights\checkpoint.hdf5
Epoch 3/250
Epoch 3: val_loss improved from 0.00451 to 0.00363, saving model to Weights\checkpoint.hdf5
Epoch 4/250
Epoch 4: val_loss improved from 0.00363 to 0.00312, saving model to Weights\checkpoint.hdf5
Epoch 5/250
Epoch 5: val_loss improved from 0.00312 to 0.00288, saving model to Weights\checkpoint.hdf5
Epoch 6/250
Epoch 6: val_loss improved from 0.00288 to 0.00225, saving model to Weights\checkpoint.hdf5
Epoch 7/250
Epoch 7: val_loss improved from 0.00225 to 0.00185, saving model to Weights\checkpoint.hdf5
Epoch 8/250
Epoch 8: val_loss did not improve from 0.00185
Epoch 9/250
Epoch 9: val_loss improved from 0.00185 to 0.00176, saving model to Weights\checkpoint.hdf5
Epoch 10/250
Epoch 10: val_loss improved from 0.00176 to 0.00144, saving model to Weights\checkpoint.hdf5
Epoch 1

Epoch 35/250
Epoch 35: val_loss did not improve from 0.00074
Epoch 36/250
Epoch 36: val_loss did not improve from 0.00074
Epoch 37/250
Epoch 37: val_loss did not improve from 0.00074
Epoch 38/250
Epoch 38: val_loss did not improve from 0.00074
Epoch 39/250
Epoch 39: val_loss did not improve from 0.00074
Epoch 40/250
Epoch 40: val_loss did not improve from 0.00074
Epoch 41/250
Epoch 41: val_loss did not improve from 0.00074
Epoch 42/250
Epoch 42: val_loss did not improve from 0.00074
Epoch 43/250
Epoch 43: val_loss did not improve from 0.00074
Epoch 44/250
Epoch 44: val_loss improved from 0.00074 to 0.00074, saving model to Weights\checkpoint.hdf5
Epoch 45/250
Epoch 45: val_loss did not improve from 0.00074
Epoch 46/250
Epoch 46: val_loss did not improve from 0.00074
Epoch 47/250
Epoch 47: val_loss did not improve from 0.00074
Epoch 48/250
Epoch 48: val_loss did not improve from 0.00074
Epoch 49/250
Epoch 49: val_loss did not improve from 0.00074
Epoch 50/250
Epoch 50: val_loss did not 

Epoch 70: val_loss improved from 0.00068 to 0.00067, saving model to Weights\checkpoint.hdf5
Epoch 71/250
Epoch 71: val_loss did not improve from 0.00067
Epoch 72/250
Epoch 72: val_loss did not improve from 0.00067
Epoch 73/250
Epoch 73: val_loss did not improve from 0.00067
Epoch 74/250
Epoch 74: val_loss did not improve from 0.00067
Epoch 75/250
Epoch 75: val_loss did not improve from 0.00067
Epoch 76/250
Epoch 76: val_loss did not improve from 0.00067
Epoch 77/250
Epoch 77: val_loss did not improve from 0.00067
Epoch 78/250
Epoch 78: val_loss did not improve from 0.00067
Epoch 79/250
Epoch 79: val_loss did not improve from 0.00067
Epoch 80/250
Epoch 80: val_loss did not improve from 0.00067
Epoch 81/250
Epoch 81: val_loss did not improve from 0.00067
Epoch 82/250
Epoch 82: val_loss did not improve from 0.00067
Epoch 83/250
Epoch 83: val_loss did not improve from 0.00067
Epoch 84/250
Epoch 84: val_loss did not improve from 0.00067
Epoch 85/250
Epoch 85: val_loss did not improve from 

Epoch 106/250
Epoch 106: val_loss did not improve from 0.00065
Epoch 107/250
Epoch 107: val_loss did not improve from 0.00065
Epoch 108/250
Epoch 108: val_loss did not improve from 0.00065
Epoch 109/250
Epoch 109: val_loss did not improve from 0.00065
Epoch 110/250
Epoch 110: val_loss did not improve from 0.00065
Epoch 111/250
Epoch 111: val_loss did not improve from 0.00065
Epoch 112/250
Epoch 112: val_loss did not improve from 0.00065
Epoch 113/250
Epoch 113: val_loss did not improve from 0.00065
Epoch 114/250
Epoch 114: val_loss did not improve from 0.00065
Epoch 115/250
Epoch 115: val_loss did not improve from 0.00065
Epoch 116/250
Epoch 116: val_loss improved from 0.00065 to 0.00063, saving model to Weights\checkpoint.hdf5
Epoch 117/250
Epoch 117: val_loss did not improve from 0.00063
Epoch 118/250
Epoch 118: val_loss did not improve from 0.00063
Epoch 119/250
Epoch 119: val_loss did not improve from 0.00063
Epoch 120/250
Epoch 120: val_loss did not improve from 0.00063
Epoch 121/

Epoch 142/250
Epoch 142: val_loss did not improve from 0.00060
Epoch 143/250
Epoch 143: val_loss did not improve from 0.00060
Epoch 144/250
Epoch 144: val_loss did not improve from 0.00060
Epoch 145/250
Epoch 145: val_loss did not improve from 0.00060
Epoch 146/250
Epoch 146: val_loss did not improve from 0.00060
Epoch 147/250
Epoch 147: val_loss did not improve from 0.00060
Epoch 148/250
Epoch 148: val_loss did not improve from 0.00060
Epoch 149/250
Epoch 149: val_loss did not improve from 0.00060
Epoch 150/250
Epoch 150: val_loss did not improve from 0.00060
Epoch 151/250
Epoch 151: val_loss did not improve from 0.00060
Epoch 152/250
Epoch 152: val_loss did not improve from 0.00060
Epoch 153/250
Epoch 153: val_loss improved from 0.00060 to 0.00058, saving model to Weights\checkpoint.hdf5
Epoch 154/250
Epoch 154: val_loss did not improve from 0.00058
Epoch 155/250
Epoch 155: val_loss did not improve from 0.00058
Epoch 156/250
Epoch 156: val_loss did not improve from 0.00058
Epoch 157/

Epoch 178/250
Epoch 178: val_loss did not improve from 0.00058
Epoch 179/250
Epoch 179: val_loss did not improve from 0.00058
Epoch 180/250
Epoch 180: val_loss improved from 0.00058 to 0.00057, saving model to Weights\checkpoint.hdf5
Epoch 181/250
Epoch 181: val_loss did not improve from 0.00057
Epoch 182/250
Epoch 182: val_loss did not improve from 0.00057
Epoch 183/250
Epoch 183: val_loss did not improve from 0.00057
Epoch 184/250
Epoch 184: val_loss did not improve from 0.00057
Epoch 185/250
Epoch 185: val_loss did not improve from 0.00057
Epoch 186/250
Epoch 186: val_loss did not improve from 0.00057
Epoch 187/250
Epoch 187: val_loss did not improve from 0.00057
Epoch 188/250
Epoch 188: val_loss did not improve from 0.00057
Epoch 189/250
Epoch 189: val_loss did not improve from 0.00057
Epoch 190/250
Epoch 190: val_loss did not improve from 0.00057
Epoch 191/250
Epoch 191: val_loss did not improve from 0.00057
Epoch 192/250
Epoch 192: val_loss did not improve from 0.00057
Epoch 193/

Epoch 214/250
Epoch 214: val_loss did not improve from 0.00057
Epoch 215/250
Epoch 215: val_loss did not improve from 0.00057
Epoch 216/250
Epoch 216: val_loss did not improve from 0.00057
Epoch 217/250
Epoch 217: val_loss did not improve from 0.00057
Epoch 218/250
Epoch 218: val_loss did not improve from 0.00057
Epoch 219/250
Epoch 219: val_loss did not improve from 0.00057
Epoch 220/250
Epoch 220: val_loss did not improve from 0.00057
Epoch 221/250
Epoch 221: val_loss did not improve from 0.00057
Epoch 222/250
Epoch 222: val_loss did not improve from 0.00057
Epoch 223/250
Epoch 223: val_loss did not improve from 0.00057
Epoch 224/250
Epoch 224: val_loss did not improve from 0.00057
Epoch 225/250
Epoch 225: val_loss did not improve from 0.00057
Epoch 226/250
Epoch 226: val_loss did not improve from 0.00057
Epoch 227/250
Epoch 227: val_loss did not improve from 0.00057
Epoch 228/250
Epoch 228: val_loss did not improve from 0.00057
Epoch 229/250
Epoch 229: val_loss did not improve from 

Epoch 250/250
Epoch 250: val_loss did not improve from 0.00057
r2 socre is:            0.98    ->  %97.7
Mean Squard Error is:     0.0    ->  %0.06
