In [109]:
import numpy as np 
import pandas as pd
import torch 

# 1. Loading Data

In [110]:
X = pd.read_csv("./Xtrain_hgcGIrA.csv")
X.head()

Unnamed: 0,date,train,way,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3
0,2019-01-07,1,0,AD,06:00:00,2,,,,0.201,0.138,0.091
1,2019-01-08,1,0,AD,06:00:00,2,,,,0.204,0.152,0.106
2,2019-01-10,1,0,AD,06:00:00,2,,,,0.213,0.153,0.111
3,2019-01-11,1,0,AD,06:00:00,2,,,,0.213,0.152,0.108
4,2019-01-14,1,0,AD,06:00:00,2,,,,0.21,0.147,0.096


In [111]:
# shape of the data 
X.shape

(31119, 12)

# 2. Cleaning and Preparing data

In [113]:
C = True

for i in range(X["composition"].shape[0]):
    if not X["composition"][i] == 2:
        C = False
        break

C

False

In [114]:
D = True

for i in range(X["way"].shape[0]):
    if not X["way"][i] == 0:
        D = False
        break

D

True

In [115]:
X = X.drop( ["way"], axis=1)
X.head()

Unnamed: 0,date,train,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3
0,2019-01-07,1,AD,06:00:00,2,,,,0.201,0.138,0.091
1,2019-01-08,1,AD,06:00:00,2,,,,0.204,0.152,0.106
2,2019-01-10,1,AD,06:00:00,2,,,,0.213,0.153,0.111
3,2019-01-11,1,AD,06:00:00,2,,,,0.213,0.152,0.108
4,2019-01-14,1,AD,06:00:00,2,,,,0.21,0.147,0.096


In [116]:
X["train"].unique()

array([ 1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,  2, 20, 21, 22, 23, 24,
       25, 26, 27, 28, 29,  3, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,  4,
       40, 41, 42, 43, 44, 45, 46, 47, 48, 49,  5, 50, 51, 52, 53, 54, 55,
        6,  7,  8,  9], dtype=int64)

In [117]:
X["station"].unique()

array(['AD', 'AI', 'AJ', 'AK', 'AM', 'AT', 'AW', 'AX', 'BB', 'BD', 'BE',
       'AE', 'AL', 'AO', 'AQ', 'BC', 'AB', 'AN', 'AS', 'BF', 'BG', 'BH',
       'AV', 'AF', 'AP', 'AZ', 'AA', 'AC', 'AG', 'AH', 'AR', 'AU', 'BA',
       'BI', 'BJ', 'AY'], dtype=object)

In [118]:
X['date'] = pd.to_datetime(X['date'])

In [119]:
type(X['date'][1])

pandas._libs.tslibs.timestamps.Timestamp

In [120]:
X.dtypes

date           datetime64[ns]
train                   int64
station                object
hour                   object
composition             int64
p1q0                  float64
p2q0                  float64
p3q0                  float64
p0q1                  float64
p0q2                  float64
p0q3                  float64
dtype: object

In [121]:
X['station'] = pd.factorize(X['station'])[0]
X["station"] = X['station'].astype(float)

In [122]:
X["station"].unique()

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30., 31., 32., 33., 34., 35.])

In [123]:
X["train"] = X["train"].astype(float)
X["composition"] = X["composition"].astype(float)

In [124]:
X.dtypes

date           datetime64[ns]
train                 float64
station               float64
hour                   object
composition           float64
p1q0                  float64
p2q0                  float64
p3q0                  float64
p0q1                  float64
p0q2                  float64
p0q3                  float64
dtype: object

In [125]:
tensor_dates = torch.tensor(X['date'].view('float').values)
tensor_dates

tensor([2.9161e-205, 2.9544e-205, 3.0308e-205,  ..., 1.5298e-204,
        7.4289e-205, 3.7957e-205], dtype=torch.float64)

In [126]:
(X.isna().sum()/X.shape[0]).sort_values(ascending=True)

date           0.000000
train          0.000000
station        0.000000
composition    0.000000
p1q0           0.065940
p0q1           0.102895
hour           0.102960
p2q0           0.133198
p3q0           0.198721
p0q2           0.205630
p0q3           0.308236
dtype: float64

In [128]:
mean_by_train1 = X.groupby('train')['p1q0'].mean()
X['p1q0'] = X['p1q0'].fillna(X['train'].map(mean_by_train1))

mean_by_train2 = X.groupby('train')['p2q0'].mean()
X['p2q0'] = X['p2q0'].fillna(X['train'].map(mean_by_train2))

mean_by_train3 = X.groupby('train')['p3q0'].mean()
X['p3q0'] = X['p3q0'].fillna(X['train'].map(mean_by_train3))

mean_by_train4 = X.groupby('train')['p0q1'].mean()
X['p0q1'] = X['p0q1'].fillna(X['train'].map(mean_by_train4))

mean_by_train5 = X.groupby('train')['p0q2'].mean()
X['p0q2'] = X['p0q2'].fillna(X['train'].map(mean_by_train5))

mean_by_train6 = X.groupby('train')['p0q3'].mean()
X['p0q3'] = X['p0q3'].fillna(X['train'].map(mean_by_train6))

X = X.fillna( value = 0)


In [129]:
(X.isna().sum()/X.shape[0]).sort_values(ascending=True)

date           0.0
train          0.0
station        0.0
hour           0.0
composition    0.0
p1q0           0.0
p2q0           0.0
p3q0           0.0
p0q1           0.0
p0q2           0.0
p0q3           0.0
dtype: float64

In [130]:
X

Unnamed: 0,date,train,station,hour,composition,p1q0,p2q0,p3q0,p0q1,p0q2,p0q3
0,2019-01-07,1.0,0.0,06:00:00,2.0,0.127765,0.173479,0.175463,0.201,0.138,0.091000
1,2019-01-08,1.0,0.0,06:00:00,2.0,0.127765,0.173479,0.175463,0.204,0.152,0.106000
2,2019-01-10,1.0,0.0,06:00:00,2.0,0.127765,0.173479,0.175463,0.213,0.153,0.111000
3,2019-01-11,1.0,0.0,06:00:00,2.0,0.127765,0.173479,0.175463,0.213,0.152,0.108000
4,2019-01-14,1.0,0.0,06:00:00,2.0,0.127765,0.173479,0.175463,0.210,0.147,0.096000
...,...,...,...,...,...,...,...,...,...,...,...
31114,2019-05-13,9.0,10.0,08:00:00,2.0,0.152000,0.188600,0.157000,0.080,0.100,0.259469
31115,2019-05-14,9.0,10.0,08:00:00,2.0,0.153000,0.180400,0.191000,0.089,0.121,0.259469
31116,2019-05-15,9.0,10.0,08:00:00,2.0,0.166000,0.149000,0.168000,0.099,0.129,0.259469
31117,2019-03-21,9.0,10.0,08:00:00,2.0,0.182000,0.193000,0.162000,0.074,0.101,0.259469


# 3. Train a ML model