# Import Libraries

In [14]:
import pandas as pd
import seaborn as sns
from pathlib import Path as path
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import math
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.impute import KNNImputer

import warnings
warnings.filterwarnings('ignore')

# Import Data

In [15]:
# Read the three CSV files that sit in the sibling ``data`` folder.
Xtrain = pd.read_csv(path('../data') / 'Xtrain.csv')
Ytrain = pd.read_csv(path('../data') / 'Ytrain.csv')
Xtest = pd.read_csv(path('../data') / 'Xtest.csv')

# Keep only the target columns whose header does not start with "Unnamed"
# (presumably a stray index column written when the CSV was exported).
Ytrain = Ytrain[Ytrain.columns[~Ytrain.columns.str.contains('^Unnamed')]]

# Attach the targets to the features, pairing rows by index label.
TrainData = Xtrain.merge(Ytrain, left_index=True, right_index=True)

# Data Cleaning

In [16]:
def _line_for_train(train_id):
    """Map a train id to its line index (0-7) using fixed id ranges."""
    if train_id <= 11:
        return 0
    if train_id <= 21:
        return 1
    if train_id <= 27:
        return 2
    if train_id == 28:
        return 3
    if train_id <= 33:
        return 4
    if train_id <= 44:
        return 5
    if train_id <= 50:
        return 6
    # Anything above 50 (and anything that fails every comparison, e.g. NaN).
    return 7


def _impute_trailing_gaps(frame, columns, jitter=(-0.01, 0.02)):
    """Fill, in place, the trailing all-NaN run of ``columns`` in each row.

    For every row the longest suffix of ``columns`` that is entirely NaN is
    replaced by ``column mean + random.uniform(*jitter)``. With three columns
    this yields the three cases "all missing", "last two missing" and
    "last one missing". Rows whose last column holds a value are not touched.
    """
    # Snapshot the means before writing anything so that imputed values never
    # feed back into later imputations.
    means = {name: frame[name].mean() for name in columns}
    low, high = jitter

    # Every case requires the last column to be NaN, so only those rows need
    # to be visited; row order (and thus the random draw order) is preserved.
    needs_fill = frame[columns[-1]].isna()
    for idx, row in frame.loc[needs_fill, columns].iterrows():
        for start in range(len(columns)):
            suffix = columns[start:]
            if all(math.isnan(row[name]) for name in suffix):
                for name in suffix:
                    frame.at[idx, name] = means[name] + random.uniform(low, high)
                break


def cleaning(dirty_df):
    """Return a cleaned copy of a raw feature frame.

    Steps, in order:
      1. add a ``line`` column derived from the ``train`` id ranges;
      2. impute trailing gaps in ``p1q0, p2q0, p3q0`` and in
         ``p0q1, p0q2, p0q3`` with the column mean plus a small random jitter;
      3. parse ``date`` (``%Y/%m/%d``) and add ``weekday`` and ``month``;
      4. drop ``date``, ``hour``, ``way`` and ``composition``.

    Parameters
    ----------
    dirty_df : pandas.DataFrame
        Must contain the columns ``train``, ``date``, ``hour``, ``way``,
        ``composition`` and the six ``p*q*`` columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``dirty_df`` itself is left unmodified, so the function
        can safely be re-run on the same input.

    Notes
    -----
    The jitter comes from the unseeded ``random`` module, so imputed values
    differ between runs unless the caller seeds it first.
    """
    # Work on a copy: the inserts and in-place fills below must not leak into
    # the caller's frame.
    df = dirty_df.copy()

    df.insert(1, 'line', [_line_for_train(train_id) for train_id in df["train"]])

    # Each group is imputed only where its last column is missing; a row with
    # a valid last value is never overwritten.
    _impute_trailing_gaps(df, ['p1q0', 'p2q0', 'p3q0'])
    _impute_trailing_gaps(df, ['p0q1', 'p0q2', 'p0q3'])

    # Calendar features; ``hour`` is dropped without deriving anything from it.
    df['date'] = pd.to_datetime(df['date'], format='%Y/%m/%d')
    df.insert(1, 'weekday', df['date'].dt.weekday)
    df.insert(2, 'month', df['date'].dt.month)

    return df.drop(columns=['date', 'hour', 'way', 'composition'])

In [17]:
def addDummies(smart_df):
    """Return a copy of ``smart_df`` with ``line``, ``train`` and ``station`` one-hot encoded.

    New 0/1 integer columns are appended in this order:
      * ``line<i>``   for every integer between the smallest and largest ``line``;
      * ``train <i>`` for every integer between the smallest and largest ``train``;
      * ``<station>`` for every distinct ``station`` value, in order of appearance.

    The source columns ``line``, ``train`` and ``station`` are then dropped,
    together with ``weekday`` and ``month`` (which are not encoded).

    Parameters
    ----------
    smart_df : pandas.DataFrame
        Must contain ``line``, ``train``, ``station``, ``weekday`` and ``month``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``smart_df`` itself is left unmodified.

    Notes
    -----
    The set of generated columns depends on the values present in the frame
    passed in, so two frames with different train ranges or station sets end
    up with different columns.
    """
    # Work on a copy so the dummy columns are not added to the caller's frame.
    encoded = smart_df.copy()

    for line_id in range(encoded["line"].min(), encoded["line"].max() + 1):
        encoded["line" + str(line_id)] = (encoded["line"] == line_id).astype(int)

    for train_id in range(encoded["train"].min(), encoded["train"].max() + 1):
        encoded["train " + str(train_id)] = (encoded["train"] == train_id).astype(int)

    for station in encoded["station"].unique():
        encoded[str(station)] = (encoded["station"] == station).astype(int)

    return encoded.drop(columns=['line', 'train', 'station', 'weekday', 'month'])

In [18]:
# Clean the merged training frame, then one-hot encode its categorical columns.
TrainData = addDummies(cleaning(TrainData))
TrainData

Unnamed: 0,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0,line0,line1,line2,...,AA,AC,AG,AH,AR,AU,BA,BI,BJ,AY
0,0.226379,0.249926,0.314965,0.201,0.138,0.091000,0.216,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.232087,0.257172,0.330540,0.204,0.152,0.106000,0.216,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.233098,0.255601,0.312062,0.213,0.153,0.111000,0.227,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.242666,0.270174,0.333192,0.213,0.152,0.108000,0.229,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.231095,0.246977,0.321468,0.210,0.147,0.096000,0.225,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31114,0.152000,0.188600,0.157000,0.080,0.100,0.219684,0.111,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31115,0.153000,0.180400,0.191000,0.089,0.121,0.200888,0.143,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31116,0.166000,0.149000,0.168000,0.099,0.129,0.207931,0.139,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31117,0.182000,0.193000,0.162000,0.074,0.101,0.207596,0.117,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0,line0,line1,line2,...,AA,AC,AG,AH,AR,AU,BA,BI,BJ,AY
0,0.226379,0.249926,0.314965,0.201,0.138,0.091000,0.216,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.232087,0.257172,0.330540,0.204,0.152,0.106000,0.216,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.233098,0.255601,0.312062,0.213,0.153,0.111000,0.227,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.242666,0.270174,0.333192,0.213,0.152,0.108000,0.229,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.231095,0.246977,0.321468,0.210,0.147,0.096000,0.225,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31114,0.152000,0.188600,0.157000,0.080,0.100,0.219684,0.111,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31115,0.153000,0.180400,0.191000,0.089,0.121,0.200888,0.143,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31116,0.166000,0.149000,0.168000,0.099,0.129,0.207931,0.139,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31117,0.182000,0.193000,0.162000,0.074,0.101,0.207596,0.117,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0,line0,line1,line2,...,AA,AC,AG,AH,AR,AU,BA,BI,BJ,AY
0,0.226379,0.249926,0.314965,0.201,0.138,0.091000,0.216,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.232087,0.257172,0.330540,0.204,0.152,0.106000,0.216,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.233098,0.255601,0.312062,0.213,0.153,0.111000,0.227,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.242666,0.270174,0.333192,0.213,0.152,0.108000,0.229,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.231095,0.246977,0.321468,0.210,0.147,0.096000,0.225,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31114,0.152000,0.188600,0.157000,0.080,0.100,0.219684,0.111,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31115,0.153000,0.180400,0.191000,0.089,0.121,0.200888,0.143,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31116,0.166000,0.149000,0.168000,0.099,0.129,0.207931,0.139,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31117,0.182000,0.193000,0.162000,0.074,0.101,0.207596,0.117,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3,p0q0,line0,line1,line2,...,AA,AC,AG,AH,AR,AU,BA,BI,BJ,AY
0,0.226379,0.249926,0.314965,0.201,0.138,0.091000,0.216,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.232087,0.257172,0.330540,0.204,0.152,0.106000,0.216,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.233098,0.255601,0.312062,0.213,0.153,0.111000,0.227,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.242666,0.270174,0.333192,0.213,0.152,0.108000,0.229,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.231095,0.246977,0.321468,0.210,0.147,0.096000,0.225,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31114,0.152000,0.188600,0.157000,0.080,0.100,0.219684,0.111,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31115,0.153000,0.180400,0.191000,0.089,0.121,0.200888,0.143,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31116,0.166000,0.149000,0.168000,0.099,0.129,0.207931,0.139,1,0,0,...,0,0,0,0,0,0,0,0,0,0
31117,0.182000,0.193000,0.162000,0.074,0.101,0.207596,0.117,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Run the test features through the same cleaning and encoding steps.
Xtest = addDummies(cleaning(Xtest))
Xtest

# Models

In [None]:
def build_model(drop_columns):
    """Return a copy of the global ``TrainData`` without ``drop_columns``.

    ``drop_columns`` is a list of column labels; an empty list yields a plain
    copy. ``TrainData`` itself is not modified.
    """
    training_frame = TrainData.copy()
    return training_frame.drop(columns=drop_columns)

In [None]:
def dropXTest(drop_columns):
    """Return a copy of the global ``Xtest`` without ``drop_columns``.

    ``drop_columns`` is a list of column labels; an empty list yields a plain
    copy. ``Xtest`` itself is not modified.
    """
    test_frame = Xtest.copy()
    return test_frame.drop(columns=drop_columns)

In [None]:
def train_lr(model, export):
    """Fit a linear regression on ``model`` and print hold-out metrics.

    Parameters
    ----------
    model : pandas.DataFrame
        Training frame containing the target column ``p0q0``; every other
        column is used as a feature.
    export : bool
        When truthy, predictions for the global test set are written through
        ``exportToCSV``.

    Notes
    -----
    Reads the notebook-level ``droppedColumns`` list to prepare the test
    features, so that global must be defined before calling. 20% of the rows
    (``random_state=42``) are held out for the printed R2 / MSE / MAE.
    """
    features = model.drop(columns=['p0q0'])
    target = model['p0q0']

    # Test features with the same columns removed as the training frame.
    submission_features = dropXTest(droppedColumns)

    x_fit, x_holdout, y_fit, y_holdout = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    regressor = LinearRegression()
    regressor.fit(x_fit, y_fit)

    if export:
        exportToCSV(regressor.predict(submission_features))

    holdout_prediction = regressor.predict(x_holdout)

    score = r2_score(y_holdout, holdout_prediction)
    mse = mean_squared_error(y_holdout, holdout_prediction)
    mae = mean_absolute_error(y_holdout, holdout_prediction)

    print(f'r2 socre is:            {round(score,2)}    ->  %{round(score*100,2)}')
    print(f'Mean Squard Error is:     {round(mse,2)}    ->  %{round(mse*100,2)}')
    print(f'Mean Absolute Error is:     {round(mae,2)}    ->  %{round(mae*100,2)}')

In [None]:
def train_dnn(model, epochs, export):
    """Train a dense neural network on ``model`` and print hold-out metrics.

    Parameters
    ----------
    model : pandas.DataFrame
        Training frame containing the target column ``p0q0``; every other
        column is used as a feature.
    epochs : int
        Number of training epochs.
    export : bool
        When truthy, predictions for the global test set are written through
        ``exportToCSV``.

    Notes
    -----
    * Reads the notebook-level ``droppedColumns`` list to prepare the test
      features, so that global must be defined before calling.
    * The best weights (lowest ``val_loss``) are checkpointed to
      ``Weights/checkpoint.hdf5`` and reloaded before predicting.
    * The 20% hold-out split is used both to pick the checkpoint and to
      compute the printed metrics, so those metrics are optimistic.
    """
    features = model.drop(columns=['p0q0'])
    target = model['p0q0']

    DroppedXTest = dropXTest(droppedColumns)

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    tf.keras.backend.clear_session()
    tf.random.set_seed(42)

    # Kept under its own name so the ``model`` argument (the training frame)
    # is not rebound to the network. Only the first layer needs the input
    # size; later layers take it from the layer before them.
    network = keras.models.Sequential([
        keras.layers.Dense(512, input_dim=x_train.shape[1], activation='relu'),
        keras.layers.Dense(512, activation='relu'),
        keras.layers.Dense(units=256, activation='relu'),
        keras.layers.Dense(units=256, activation='relu'),
        keras.layers.Dense(units=128, activation='relu'),
        keras.layers.Dense(units=1, activation="linear"),
    ], name="Initial_model")

    network.compile(optimizer=keras.optimizers.Adam(), loss='mean_absolute_error')

    # Build the path portably: a literal 'Weights\checkpoint.hdf5' contains an
    # invalid escape sequence and only denotes a sub-folder on Windows.
    checkpoint_path = path('Weights') / 'checkpoint.hdf5'
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
    checkpoint_name = str(checkpoint_path)

    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        checkpoint_name,
        monitor='val_loss',
        verbose=1,
        save_weights_only=True,
        save_best_only=True,
        mode='auto',
    )

    network.fit(
        x_train, y_train,
        epochs=epochs, batch_size=1024,
        validation_data=(x_test, y_test),
        callbacks=[checkpoint],
        verbose=1,
    )

    # Restore the best epoch rather than keeping the weights of the last one.
    network.load_weights(checkpoint_name)

    if export:
        exportToCSV(network.predict(DroppedXTest))

    y_prediction = network.predict(x_test)

    score = r2_score(y_test, y_prediction)
    mse = mean_squared_error(y_test, y_prediction)
    mae = mean_absolute_error(y_test, y_prediction)

    print(f'r2 socre is:             {round(score,5)}  ->  %{round(score*100,5)}')
    print(f'Mean Squard Error is:    {round(mse,5)}  ->  %{round(mse*100,5)}')
    print(f'Mean Absolute Error is:  {round(mae,5)}  ->  %{round(mae*100,5)}')

In [None]:
def exportToCSV(data):
    """Write predictions to ``y_prediction.csv`` in the working directory.

    ``data`` is wrapped in a DataFrame, its column ``0`` is renamed to
    ``p0q0``, rows are numbered from 1, and floats are written with three
    decimals. An existing file of the same name is overwritten.
    """
    predictions = pd.DataFrame(data).copy().rename(columns={0: 'p0q0'})
    row_count = len(predictions)
    predictions.index = np.arange(1, row_count + 1)
    predictions.to_csv("y_prediction.csv", float_format='%.3f')

# Linear Regression

In [None]:
# Disabled linear-regression baseline, kept for reference.
# NOTE(review): addDummies already drops 'line', 'weekday', 'month' and
# 'train', so build_model would presumably raise KeyError with this list;
# verify before re-enabling.
# NOTE: train_lr(model, True) exports through exportToCSV, which always writes
# "y_prediction.csv"; running it as well as the DNN cell below means the later
# run overwrites the earlier file.
#droppedColumns = ['line', 'weekday', 'month', 'train']
#model = build_model(droppedColumns)
#train_lr(model, True)

# Deep Neural Network

In [None]:
# No extra columns are removed; train_dnn reads this global to prepare Xtest.
droppedColumns = list()
model = build_model(drop_columns=droppedColumns)
# 250 epochs, and export the test-set predictions to CSV.
train_dnn(model, epochs=250, export=True)