In [77]:
import numpy as np 
import pandas as pd 
import plotly.express as px
import enlighten
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from scipy.spatial import distance


## let's try use pytorch 

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

import torchvision.transforms as transforms
import torchvision.datasets as datasets

from sklearn import metrics
from sklearn import decomposition
from sklearn import manifold
import matplotlib.pyplot as plt
import numpy as np

import copy
import random
import time

In [81]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## first we preprocess the data, we separate by hour and tokenize the stations

In [159]:
demand=pd.read_csv('./../42Hacks-FlexTrains/generated_demand/demand_1000.csv')
demand['Lat']
demand['Lon']
demand['time_for_walk']=pd.to_datetime(demand['time_for_walk'])
demand['time_for_walk'].map(lambda x: x.hour)

X=np.zeros((demand.shape[0],3))
X[:,0]=demand['Lat'].to_numpy()
X[:,1]=demand['Lon'].to_numpy()
X[:,2]=demand['time_for_walk'].map(lambda x: x.hour)
#now we tokenize 
vectorizer = CountVectorizer()
vectorizer2 = CountVectorizer()
Tokens = vectorizer.fit_transform(demand['id_busStop'])
d = dict([(y,x+1) for x,y in enumerate(sorted(set(demand['id_TrainStation'])))])
Y=demand['id_TrainStation'].map(lambda x: d[x]).to_numpy()
#to know the tokenization 
#features = vectorizer.get_feature_names()
X_encoded=np.concatenate((X, Tokens.toarray()), axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,random_state=0)
x_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(Y_train).type(torch.LongTensor)
x_test = torch.FloatTensor(X_test)
y_test = torch.FloatTensor(Y_test).type(torch.LongTensor)
# Now in the format they want 
train_data=[]
for i in range(x_train.shape[0]):
    train_data.append((x_train[i],y_train[i]))

#test data
test_data=[]
for i in range(x_test.shape[0]):
    test_data.append((x_test[i],y_test[i]))


In [160]:
BATCH_SIZE = 64

train_iterator = data.DataLoader(train_data, 
                                 shuffle = True, 
                                 batch_size = BATCH_SIZE)

valid_iterator = data.DataLoader(test_data, 
                                 batch_size = BATCH_SIZE)

test_iterator = data.DataLoader(test_data, 
                                batch_size = BATCH_SIZE)

## Now we train the NN mlp

In [94]:
class MLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
                
        self.input_fc = nn.Linear(input_dim, 250)
        self.hidden_fc = nn.Linear(250, 100)
        self.output_fc = nn.Linear(100, output_dim)
        
    def forward(self, x):
        
        #x = [batch size, height, width]
        
        batch_size = x.shape[0]

        x = x.view(batch_size, -1)
        
        #x = [batch size, height * width]
        
        h_1 = F.relu(self.input_fc(x))
        
        #h_1 = [batch size, 250]

        h_2 = F.relu(self.hidden_fc(h_1))

        #h_2 = [batch size, 100]

        y_pred = self.output_fc(h_2)
        
        #y_pred = [batch size, output dim]
        
        return y_pred, h_2

In [97]:
INPUT_DIM = X_train.shape[1]
OUTPUT_DIM = Y_train.shape[1]
print(INPUT_DIM,OUTPUT_DIM )
model = MLP(INPUT_DIM, OUTPUT_DIM)

3 148


In [98]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 41,048 trainable parameters


In [101]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

In [102]:
def calculate_accuracy(y_pred, y):
    top_pred = y_pred.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [123]:
def train(model, iterator, optimizer, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for (x, y) in iterator:
        
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
                
        y_pred, _ = model(x)
        
        loss = criterion(y_pred, y)
        
        acc = calculate_accuracy(y_pred, y)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [119]:
def evaluate(model, iterator, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        
        for (x, y) in iterator:

            x = x.to(device)
            y = y.to(device)
            
            y_pred, _ = model(x)

            loss = criterion(y_pred, y)

            acc = calculate_accuracy(y_pred, y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [120]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [161]:
EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    
    start_time = time.monotonic()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    end_time = time.monotonic()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 4.681 | Train Acc: 3.12%
	 Val. Loss: 4.826 |  Val. Acc: 3.17%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 4.667 | Train Acc: 3.26%
	 Val. Loss: 4.817 |  Val. Acc: 1.17%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 4.627 | Train Acc: 2.97%
	 Val. Loss: 4.776 |  Val. Acc: 3.17%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 4.615 | Train Acc: 3.75%
	 Val. Loss: 4.773 |  Val. Acc: 1.17%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 4.577 | Train Acc: 3.65%
	 Val. Loss: 4.830 |  Val. Acc: 1.17%
Epoch: 06 | Epoch Time: 0m 0s
	Train Loss: 4.624 | Train Acc: 2.52%
	 Val. Loss: 4.899 |  Val. Acc: 1.56%
Epoch: 07 | Epoch Time: 0m 0s
	Train Loss: 4.585 | Train Acc: 2.86%
	 Val. Loss: 4.823 |  Val. Acc: 1.99%
Epoch: 08 | Epoch Time: 0m 0s
	Train Loss: 4.582 | Train Acc: 3.05%
	 Val. Loss: 4.788 |  Val. Acc: 1.17%
Epoch: 09 | Epoch Time: 0m 0s
	Train Loss: 4.538 | Train Acc: 3.20%
	 Val. Loss: 4.861 |  Val. Acc: 3.56%
Epoch: 10 | Epoch Time: 0m 0s
	Train Loss: 4.5

## Now we test the predictions 

In [93]:
pipe.score(X_test,y_test)

0.0

In [18]:
demand['time_for_walk'].map(lambda x: x.hour)

0       0
1       0
2       0
3       0
4       0
       ..
995    10
996    10
997    10
998    10
999    10
Name: time_for_walk, Length: 1000, dtype: int64

In [30]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(demand['id_busStop'])
features = vectorizer.get_feature_names()
print(X.toarray().shape)

(1000, 26)


In [8]:
demand.head()

Unnamed: 0.1,Unnamed: 0,index,Date/Time,Lat,Lon,Base,id_busStop,id_TrainStation,time_for_walk,time_for_bus
0,0,0,2014-08-01 00:03:00,40.7366,-73.9906,B02512,152_bus_M15-SBS:1,136_subway_C:0,2014-08-01 00:03:00,2014-08-01 00:06:58.175473380
1,1,3,2014-08-01 00:12:00,40.7387,-73.9856,B02512,152_bus_M15-SBS:1,39_subway_N:1,2014-08-01 00:12:00,2014-08-01 00:17:50.227169220
2,2,9,2014-08-01 00:20:00,40.7448,-73.9799,B02512,122_bus_M15:0,41_subway_Q:0,2014-08-01 00:20:00,2014-08-01 00:28:29.057943900
3,3,10,2014-08-01 00:21:00,40.7399,-74.0057,B02512,127_bus_M11:1,29_subway_A:1,2014-08-01 00:21:00,2014-08-01 00:28:40.143447780
4,4,11,2014-08-01 00:25:00,40.7651,-73.9683,B02512,86_bus_Q32:0,154_subway_1:1,2014-08-01 00:25:00,2014-08-01 00:29:55.362038280


array([[ 40.7585, -73.9869,  15.    , ...,   0.    ,   0.    ,   0.    ],
       [ 40.7359, -74.0053,  13.    , ...,   0.    ,   0.    ,   0.    ],
       [ 40.7472, -73.9898,   9.    , ...,   0.    ,   0.    ,   0.    ],
       ...,
       [ 40.7654, -73.9831,  10.    , ...,   0.    ,   0.    ,   0.    ],
       [ 40.756 , -73.9871,   0.    , ...,   0.    ,   0.    ,   0.    ],
       [ 40.7416, -74.0036,  14.    , ...,   0.    ,   0.    ,   0.    ]])