# Import Libraries

In [1]:
import pandas as pd
import seaborn as sns
from pathlib import Path as path
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import math
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings('ignore')

# Import Data

In [2]:
# Read the raw feature/target tables from the local ``data`` directory.
Xtrain = pd.read_csv(path('data/Xtrain.csv'))
Xtest = pd.read_csv(path('data/Xtest.csv'))
# Keep only the real target column(s): any column whose name starts with
# "Unnamed" (typically a saved index) is discarded.
Ytrain = pd.read_csv(path('data/Ytrain.csv')).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]
# Features and targets are joined row-by-row on their index.
TrainData = Xtrain.merge(Ytrain, left_index=True, right_index=True)

# Data Cleaning

In [3]:
def _train_to_line(train_numbers):
    """Map train numbers to line ids 0-7 using fixed train-number ranges."""
    trains = np.asarray(train_numbers)
    # np.select evaluates the conditions in order and the first match wins,
    # i.e. it behaves like an if/elif chain. Anything that matches none of
    # them (values above 50, or NaN) falls through to line 7.
    conditions = [
        trains <= 11,
        trains <= 21,
        trains <= 27,
        trains == 28,
        trains <= 33,
        trains <= 44,
        trains <= 50,
    ]
    return np.select(conditions, list(range(7)), default=7)


def _fill_missing_with_jittered_mean(frame, columns, jitter):
    """Replace every NaN cell in ``columns`` by column mean + uniform noise (in place)."""
    low, high = jitter
    # Means are computed once, before any cell is filled, so imputed values
    # never feed back into later imputations.
    means = [frame[col].mean() for col in columns]
    positions = [frame.columns.get_loc(col) for col in columns]
    missing = frame[columns].isna().to_numpy()
    # Only rows that actually contain a gap are visited; cells are filled in
    # row-major order so the sequence of random draws is stable for a given
    # ``random.seed``.
    for row in np.flatnonzero(missing.any(axis=1)):
        for k, col_pos in enumerate(positions):
            if missing[row, k]:
                frame.iat[row, col_pos] = means[k] + random.uniform(low, high)


def cleaning(dirty_df, jitter=(-0.01, 0.02)):
    """Prepare a raw feature table for modelling.

    Steps, in order:
      1. Derive a ``line`` id (0-7) from ``train`` and insert it at column 1.
      2. Fill missing values in ``p1q0/p2q0/p3q0`` and ``p0q1/p0q2/p0q3`` with
         the column mean plus a small uniform random offset.
      3. Parse ``date`` (``%Y/%m/%d``) and insert ``weekday`` (column 1) and
         ``month`` (column 2).
      4. Drop ``date``, ``hour``, ``way`` and ``composition``.

    Parameters
    ----------
    dirty_df : pandas.DataFrame
        Must contain ``train``, ``date``, ``hour``, ``way``, ``composition``
        and the six ``p*q*`` columns listed above.
    jitter : tuple of (float, float), optional
        ``(low, high)`` bounds of the uniform noise added to the mean when
        imputing. The default is the (asymmetric) range the notebook has
        always used.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. ``dirty_df`` itself is not modified.

    Notes
    -----
    Every missing cell is imputed independently. Earlier revisions only
    handled three specific missing-value patterns and, because one condition
    tested ``p2q0`` twice instead of ``p3q0``, could overwrite a valid
    ``p3q0`` value; both problems are avoided here.
    The noise comes from the ``random`` module, so call ``random.seed(...)``
    beforehand for reproducible output.
    """
    # Work on a copy so re-running a cell never sees a half-cleaned input.
    df = dirty_df.copy()

    df.insert(1, 'line', _train_to_line(df['train']))

    _fill_missing_with_jittered_mean(df, ['p1q0', 'p2q0', 'p3q0'], jitter)
    _fill_missing_with_jittered_mean(df, ['p0q1', 'p0q2', 'p0q3'], jitter)

    # Calendar features; the raw date column is dropped afterwards.
    dates = pd.to_datetime(df['date'], format='%Y/%m/%d')
    df.insert(1, 'weekday', dates.dt.weekday)
    df.insert(2, 'month', dates.dt.month)

    # ``hour`` is dropped without deriving a feature from it.
    return df.drop(columns=['date', 'hour', 'way', 'composition'])

In [4]:
def addDummies(smart_df):
    """One-hot encode ``line``, ``train`` and ``station``.

    Indicator columns (0/1 integers) are appended after the remaining
    original columns, in this order:

    * ``line<i>`` for every integer ``i`` between the smallest and largest
      ``line`` value in the frame,
    * ``train <i>`` (note the space) for every integer between the smallest
      and largest ``train`` value,
    * one column per distinct ``station`` value, named ``str(value)``.

    The source columns ``line``, ``train``, ``station`` as well as ``weekday``
    and ``month`` are removed from the result.

    Parameters
    ----------
    smart_df : pandas.DataFrame
        Frame containing integer ``line`` and ``train`` columns plus
        ``station``, ``weekday`` and ``month``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``smart_df`` itself is not modified.

    Notes
    -----
    NOTE(review): the set of indicator columns is derived from the values
    present in ``smart_df``. Two frames (e.g. train and test) only end up with
    identical columns if they cover the same line/train ranges and stations —
    verify before feeding both to the same model.
    """
    indicators = {}

    lines = smart_df["line"]
    for value in range(lines.min(), lines.max() + 1):
        indicators["line" + str(value)] = (lines == value).to_numpy().astype(int)

    trains = smart_df["train"]
    for value in range(trains.min(), trains.max() + 1):
        indicators["train " + str(value)] = (trains == value).to_numpy().astype(int)

    stations = smart_df["station"]
    for value in stations.unique():
        indicators[str(value)] = (stations == value).to_numpy().astype(int)

    kept = smart_df.drop(columns=['line', 'train', 'station', 'weekday', 'month'])
    # Attach all indicators in a single concat instead of inserting columns
    # one at a time, which fragments the frame and mutates the caller's data.
    dummies = pd.DataFrame(indicators, index=smart_df.index)
    return pd.concat([kept, dummies], axis=1)

In [5]:
# Clean the merged training table, one-hot encode it, then preview the result.
TrainData = addDummies(cleaning(TrainData))
TrainData

Unnamed: 0,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0,line0,line1,line2,...,AA,AC,AG,AH,AR,AU,BA,BI,BJ,AY
0,0.234597,0.259206,0.316861,0.201,0.138,0.091000,0.216,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.237137,0.245509,0.307697,0.204,0.152,0.106000,0.216,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.245308,0.268931,0.322180,0.213,0.153,0.111000,0.227,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.253190,0.257679,0.319053,0.213,0.152,0.108000,0.229,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.240139,0.264709,0.308701,0.210,0.147,0.096000,0.225,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31114,0.152000,0.188600,0.157000,0.080,0.100,0.205224,0.111,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31115,0.153000,0.180400,0.191000,0.089,0.121,0.227967,0.143,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31116,0.166000,0.149000,0.168000,0.099,0.129,0.225256,0.139,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31117,0.182000,0.193000,0.162000,0.074,0.101,0.213067,0.117,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Apply the same cleaning + one-hot encoding to the test features and preview.
Xtest = addDummies(cleaning(Xtest))
Xtest

Unnamed: 0,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,line0,line1,line2,line3,...,AA,AC,AG,AH,AR,AU,BA,BI,BJ,AY
0,0.224146,0.245719,0.281729,0.226,0.165,0.128000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.226335,0.229157,0.286127,0.221,0.159,0.114000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.230541,0.226618,0.293455,0.227,0.175,0.133000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.221024,0.232284,0.291673,0.200,0.148,0.105000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.223783,0.244164,0.290601,0.221,0.169,0.126000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13747,0.121000,0.145000,0.157150,0.071,0.095,0.198016,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13748,0.144000,0.139000,0.153000,0.079,0.102,0.179191,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13749,0.128000,0.137000,0.127000,0.072,0.089,0.189374,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13750,0.164000,0.187000,0.155000,0.100,0.130,0.180171,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Models

In [7]:
def build_model(drop_columns):
    """Return the notebook-level ``TrainData`` frame without ``drop_columns``.

    Despite the name, the result is a DataFrame (features plus target), not a
    fitted estimator. ``TrainData`` itself is left unchanged.
    """
    return TrainData.drop(columns=drop_columns)

In [8]:
def dropXTest(drop_columns):
    """Return the notebook-level ``Xtest`` frame without ``drop_columns``.

    ``Xtest`` itself is left unchanged; a new frame is returned.
    """
    return Xtest.drop(columns=drop_columns)

In [9]:
def train_lr(model, export, drop_columns=None):
    """Fit a linear regression on an 80/20 split and print hold-out metrics.

    Parameters
    ----------
    model : pandas.DataFrame
        Training table as returned by ``build_model``; must contain the
        target column ``p0q0``. All other columns are used as features.
    export : bool
        When true, predictions for the notebook-level test set are written
        via ``exportToCSV``.
    drop_columns : list of str, optional
        Columns to remove from the test set before predicting (should match
        what was passed to ``build_model``). When omitted, the notebook-level
        ``droppedColumns`` variable is used, as before.

    Returns
    -------
    None
        R², MSE and MAE on the 20 % hold-out split are printed.
    """
    features = model.drop(columns=['p0q0'])
    target = model['p0q0']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    LR = LinearRegression()
    LR.fit(x_train, y_train)

    if export:
        # The test features are only needed (and the global only consulted)
        # when an export was actually requested.
        if drop_columns is None:
            drop_columns = droppedColumns
        exportToCSV(LR.predict(dropXTest(drop_columns)))

    y_prediction = LR.predict(x_test)

    score = r2_score(y_test, y_prediction)
    mse = mean_squared_error(y_test, y_prediction)
    mae = mean_absolute_error(y_test, y_prediction)

    print(f'r2 score is:            {round(score,2)}    ->  %{round(score*100,2)}')
    print(f'Mean Squared Error is:    {round(mse,2)}    ->  %{round(mse*100,2)}')
    print(f'Mean Absolute Error is:     {round(mae,2)}    ->  %{round(mae*100,2)}')

In [10]:
def train_dnn(model, epochs, export, drop_columns=None):
    """Train a dense neural network on an 80/20 split and print hold-out metrics.

    The network is five ReLU layers (512-512-256-256-128) followed by a single
    linear output unit, optimised with Adam on mean absolute error. After
    every epoch the weights are checkpointed if ``val_loss`` improved, and the
    best checkpoint is restored before predicting.

    Parameters
    ----------
    model : pandas.DataFrame
        Training table as returned by ``build_model``; must contain the
        target column ``p0q0``. All other columns are used as features.
    epochs : int
        Number of training epochs.
    export : bool
        When true, predictions for the notebook-level test set are written
        via ``exportToCSV``.
    drop_columns : list of str, optional
        Columns to remove from the test set before predicting (should match
        what was passed to ``build_model``). When omitted, the notebook-level
        ``droppedColumns`` variable is used, as before.

    Returns
    -------
    None
        R², MSE and MAE on the 20 % hold-out split are printed.

    Notes
    -----
    NOTE(review): the same hold-out split is used both to select the best
    checkpoint and to report the final metrics, so the printed scores are
    likely optimistic.
    """
    features = model.drop(columns=['p0q0'])
    target = model['p0q0']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    tf.keras.backend.clear_session()
    tf.random.set_seed(42)
    # A separate name keeps the ``model`` argument (the training frame) from
    # being shadowed by the Keras network. Only the first layer needs the
    # input dimension.
    network = keras.models.Sequential([
        keras.layers.Dense(512, input_dim=x_train.shape[1], activation='relu'),
        keras.layers.Dense(512, activation='relu'),
        keras.layers.Dense(units=256, activation='relu'),
        keras.layers.Dense(units=256, activation='relu'),
        keras.layers.Dense(units=128, activation='relu'),
        keras.layers.Dense(units=1, activation="linear"),
    ], name="Initial_model")

    network.compile(optimizer=keras.optimizers.Adam(), loss='mean_absolute_error')

    # Build the checkpoint path with pathlib: the previous literal
    # 'Weights\checkpoint.hdf5' relied on an invalid "\c" escape and only
    # meant "file inside Weights/" on Windows.
    checkpoint_file = path('Weights') / 'checkpoint.hdf5'
    checkpoint_file.parent.mkdir(parents=True, exist_ok=True)
    checkpoint_name = str(checkpoint_file)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        checkpoint_name, monitor='val_loss', verbose=1,
        save_weights_only=True, save_best_only=True, mode='auto'
    )

    network.fit(
        x_train, y_train,
        epochs=epochs, batch_size=1024,
        validation_data=(x_test, y_test),
        callbacks=[checkpoint],
        verbose=1,
    )

    # Restore the weights from the epoch with the lowest validation loss.
    network.load_weights(checkpoint_name)

    if export:
        # The test features are only needed (and the global only consulted)
        # when an export was actually requested.
        if drop_columns is None:
            drop_columns = droppedColumns
        exportToCSV(network.predict(dropXTest(drop_columns)))

    y_prediction = network.predict(x_test)

    score = r2_score(y_test, y_prediction)
    mse = mean_squared_error(y_test, y_prediction)
    mae = mean_absolute_error(y_test, y_prediction)

    print(f'r2 score is:             {round(score,5)}  ->  %{round(score*100,5)}')
    print(f'Mean Squared Error is:   {round(mse,5)}  ->  %{round(mse*100,5)}')
    print(f'Mean Absolute Error is:  {round(mae,5)}  ->  %{round(mae*100,5)}')

In [11]:
def exportToCSV(data):
    """Write predictions to ``y_prediction.csv`` in the working directory.

    ``data`` is wrapped in a DataFrame, its column ``0`` is renamed to
    ``p0q0``, rows are numbered starting at 1, and values are written with
    three decimal places.
    """
    predictions = pd.DataFrame(data).rename(columns={0: 'p0q0'})
    # Row labels in the output file start at 1 rather than 0.
    predictions.index = np.arange(1, len(predictions) + 1)
    predictions.to_csv("y_prediction.csv", float_format='%.3f')

# Linear Regression

In [12]:
#droppedColumns = ['line', 'weekday', 'month', 'train']
#model = build_model(droppedColumns)
#train_lr(model, True)

# Deep Neural Network

In [13]:
# Train on every available column (nothing dropped) and export predictions.
droppedColumns = []
model = build_model(droppedColumns)
train_dnn(model, epochs=250, export=True)

Epoch 1/250
Epoch 1: val_loss improved from inf to 0.02741, saving model to Weights\checkpoint.hdf5
Epoch 2/250
Epoch 2: val_loss improved from 0.02741 to 0.01853, saving model to Weights\checkpoint.hdf5
Epoch 3/250
Epoch 3: val_loss improved from 0.01853 to 0.01637, saving model to Weights\checkpoint.hdf5
Epoch 4/250
Epoch 4: val_loss did not improve from 0.01637
Epoch 5/250
Epoch 5: val_loss improved from 0.01637 to 0.01478, saving model to Weights\checkpoint.hdf5
Epoch 6/250
Epoch 6: val_loss did not improve from 0.01478
Epoch 7/250
Epoch 7: val_loss improved from 0.01478 to 0.01417, saving model to Weights\checkpoint.hdf5
Epoch 8/250
Epoch 8: val_loss improved from 0.01417 to 0.01389, saving model to Weights\checkpoint.hdf5
Epoch 9/250
Epoch 9: val_loss did not improve from 0.01389
Epoch 10/250
Epoch 10: val_loss improved from 0.01389 to 0.01276, saving model to Weights\checkpoint.hdf5
Epoch 11/250
Epoch 11: val_loss did not improve from 0.01276
Epoch 12/250
Epoch 12: val_loss did 

Epoch 37/250
Epoch 37: val_loss improved from 0.01195 to 0.01185, saving model to Weights\checkpoint.hdf5
Epoch 38/250
Epoch 38: val_loss improved from 0.01185 to 0.01168, saving model to Weights\checkpoint.hdf5
Epoch 39/250
Epoch 39: val_loss did not improve from 0.01168
Epoch 40/250
Epoch 40: val_loss did not improve from 0.01168
Epoch 41/250
Epoch 41: val_loss did not improve from 0.01168
Epoch 42/250
Epoch 42: val_loss did not improve from 0.01168
Epoch 43/250
Epoch 43: val_loss did not improve from 0.01168
Epoch 44/250
Epoch 44: val_loss did not improve from 0.01168
Epoch 45/250
Epoch 45: val_loss did not improve from 0.01168
Epoch 46/250
Epoch 46: val_loss did not improve from 0.01168
Epoch 47/250
Epoch 47: val_loss did not improve from 0.01168
Epoch 48/250
Epoch 48: val_loss did not improve from 0.01168
Epoch 49/250
Epoch 49: val_loss improved from 0.01168 to 0.01158, saving model to Weights\checkpoint.hdf5
Epoch 50/250
Epoch 50: val_loss did not improve from 0.01158
Epoch 51/25

Epoch 74/250
Epoch 74: val_loss did not improve from 0.01116
Epoch 75/250
Epoch 75: val_loss did not improve from 0.01116
Epoch 76/250
Epoch 76: val_loss did not improve from 0.01116
Epoch 77/250
Epoch 77: val_loss did not improve from 0.01116
Epoch 78/250
Epoch 78: val_loss did not improve from 0.01116
Epoch 79/250
Epoch 79: val_loss improved from 0.01116 to 0.01112, saving model to Weights\checkpoint.hdf5
Epoch 80/250
Epoch 80: val_loss did not improve from 0.01112
Epoch 81/250
Epoch 81: val_loss did not improve from 0.01112
Epoch 82/250
Epoch 82: val_loss improved from 0.01112 to 0.01111, saving model to Weights\checkpoint.hdf5
Epoch 83/250
Epoch 83: val_loss did not improve from 0.01111
Epoch 84/250
Epoch 84: val_loss did not improve from 0.01111
Epoch 85/250
Epoch 85: val_loss did not improve from 0.01111
Epoch 86/250
Epoch 86: val_loss did not improve from 0.01111
Epoch 87/250
Epoch 87: val_loss improved from 0.01111 to 0.01101, saving model to Weights\checkpoint.hdf5
Epoch 88/25

Epoch 111: val_loss did not improve from 0.01074
Epoch 112/250
Epoch 112: val_loss did not improve from 0.01074
Epoch 113/250
Epoch 113: val_loss did not improve from 0.01074
Epoch 114/250
Epoch 114: val_loss did not improve from 0.01074
Epoch 115/250
Epoch 115: val_loss did not improve from 0.01074
Epoch 116/250
Epoch 116: val_loss improved from 0.01074 to 0.01069, saving model to Weights\checkpoint.hdf5
Epoch 117/250
Epoch 117: val_loss improved from 0.01069 to 0.01061, saving model to Weights\checkpoint.hdf5
Epoch 118/250
Epoch 118: val_loss improved from 0.01061 to 0.01048, saving model to Weights\checkpoint.hdf5
Epoch 119/250
Epoch 119: val_loss did not improve from 0.01048
Epoch 120/250
Epoch 120: val_loss did not improve from 0.01048
Epoch 121/250
Epoch 121: val_loss did not improve from 0.01048
Epoch 122/250
Epoch 122: val_loss did not improve from 0.01048
Epoch 123/250
Epoch 123: val_loss did not improve from 0.01048
Epoch 124/250
Epoch 124: val_loss did not improve from 0.010

Epoch 148/250
Epoch 148: val_loss did not improve from 0.01027
Epoch 149/250
Epoch 149: val_loss did not improve from 0.01027
Epoch 150/250
Epoch 150: val_loss did not improve from 0.01027
Epoch 151/250
Epoch 151: val_loss did not improve from 0.01027
Epoch 152/250
Epoch 152: val_loss did not improve from 0.01027
Epoch 153/250
Epoch 153: val_loss did not improve from 0.01027
Epoch 154/250
Epoch 154: val_loss did not improve from 0.01027
Epoch 155/250
Epoch 155: val_loss improved from 0.01027 to 0.01026, saving model to Weights\checkpoint.hdf5
Epoch 156/250
Epoch 156: val_loss did not improve from 0.01026
Epoch 157/250
Epoch 157: val_loss did not improve from 0.01026
Epoch 158/250
Epoch 158: val_loss did not improve from 0.01026
Epoch 159/250
Epoch 159: val_loss did not improve from 0.01026
Epoch 160/250
Epoch 160: val_loss did not improve from 0.01026
Epoch 161/250
Epoch 161: val_loss did not improve from 0.01026
Epoch 162/250
Epoch 162: val_loss improved from 0.01026 to 0.01011, savin

Epoch 185: val_loss did not improve from 0.01004
Epoch 186/250
Epoch 186: val_loss did not improve from 0.01004
Epoch 187/250
Epoch 187: val_loss did not improve from 0.01004
Epoch 188/250
Epoch 188: val_loss did not improve from 0.01004
Epoch 189/250
Epoch 189: val_loss improved from 0.01004 to 0.00998, saving model to Weights\checkpoint.hdf5
Epoch 190/250
Epoch 190: val_loss did not improve from 0.00998
Epoch 191/250
Epoch 191: val_loss did not improve from 0.00998
Epoch 192/250
Epoch 192: val_loss did not improve from 0.00998
Epoch 193/250
Epoch 193: val_loss did not improve from 0.00998
Epoch 194/250
Epoch 194: val_loss did not improve from 0.00998
Epoch 195/250
Epoch 195: val_loss did not improve from 0.00998
Epoch 196/250
Epoch 196: val_loss did not improve from 0.00998
Epoch 197/250
Epoch 197: val_loss did not improve from 0.00998
Epoch 198/250
Epoch 198: val_loss did not improve from 0.00998
Epoch 199/250
Epoch 199: val_loss did not improve from 0.00998
Epoch 200/250
Epoch 200:

Epoch 223/250
Epoch 223: val_loss did not improve from 0.00988
Epoch 224/250
Epoch 224: val_loss did not improve from 0.00988
Epoch 225/250
Epoch 225: val_loss did not improve from 0.00988
Epoch 226/250
Epoch 226: val_loss did not improve from 0.00988
Epoch 227/250
Epoch 227: val_loss did not improve from 0.00988
Epoch 228/250
Epoch 228: val_loss did not improve from 0.00988
Epoch 229/250
Epoch 229: val_loss did not improve from 0.00988
Epoch 230/250
Epoch 230: val_loss did not improve from 0.00988
Epoch 231/250
Epoch 231: val_loss did not improve from 0.00988
Epoch 232/250
Epoch 232: val_loss did not improve from 0.00988
Epoch 233/250
Epoch 233: val_loss did not improve from 0.00988
Epoch 234/250
Epoch 234: val_loss did not improve from 0.00988
Epoch 235/250
Epoch 235: val_loss did not improve from 0.00988
Epoch 236/250
Epoch 236: val_loss did not improve from 0.00988
Epoch 237/250
Epoch 237: val_loss did not improve from 0.00988
Epoch 238/250
Epoch 238: val_loss did not improve from 