In [128]:
import pandas as pd
import numpy as np
import datamanager
from datetime import datetime
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# from importlib import reload
# reload(datamanager)

<module 'datamanager' from '/Users/ericasuh/530project/datamanager.py'>

# Import data

In [129]:
# Preprocessed game/tweet table that the DataManager turns into padded
# token sequences (`data`) and their targets (`labels`).
TWEETS_CSV = "data/games_with_preprocessed_tweets_all.csv"

dm = datamanager.DataManager()
# dm.combine_tweets()
data, labels = dm.preprocess(TWEETS_CSV)

2021-12-09 21:13:28.511330: Tokenizing tweets
2021-12-09 21:14:05.282829: Fitting tokenizer
2021-12-09 21:14:11.780694: Texts to sequences
2021-12-09 21:14:15.969340: Padding sequences
2021-12-09 21:14:16.395689: Done


In [135]:
# Hold out 20% of the samples for test, then 20% of the remainder for dev
# (roughly 64% / 16% / 20% overall). Both splits share the same settings.
data1 = np.array(data)
labels1 = np.array(labels)
split_kwargs = dict(test_size=0.2, random_state=0)

X, X_test, y, y_test = train_test_split(data1, labels1, **split_kwargs)
X_train, X_dev, y_train, y_dev = train_test_split(X, y, **split_kwargs)

print(
    f"Train: {len(y_train)} samples\n"
    f"Dev: {len(y_dev)} samples\n"
    f"Test: {len(y_test)} samples\n"
)

Train: 2582 samples
Dev: 646 samples
Test: 807 samples



In [136]:
class DataHandler(Dataset):
    """Map-style dataset that pairs each entry of ``x`` with the entry of ``y`` at the same index."""

    def __init__(self, x, y):
        # Both containers are stored as given; no copy or conversion is made.
        self.x, self.y = x, y

    def __len__(self):
        # The number of samples is defined by the feature container.
        return len(self.x)

    def __getitem__(self, idx):
        features = self.x[idx]
        target = self.y[idx]
        return features, target

# Model

In [137]:
class NBAModel(torch.nn.Module):
    """Embedding -> stacked LSTM -> two-layer head emitting one sigmoid probability per sequence.

    Args:
        args: object exposing ``batch_size``, ``hidden_dim``, ``lstm_layers``
            and ``max_words`` (used as the embedding vocabulary size).
    """

    def __init__(self, args):
        super().__init__()

        # Hyperparameters
        self.batch_size  = args.batch_size
        self.hidden_dim  = args.hidden_dim
        self.LSTM_layers = args.lstm_layers
        self.input_size  = args.max_words

        self.dropout = nn.Dropout(0.5)
        # Index 0 is the padding token, so its embedding stays at zero.
        self.embedding = nn.Embedding(self.input_size, self.hidden_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.LSTM_layers, batch_first=True)
        self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim*2)
        self.fc2 = nn.Linear(self.hidden_dim*2, 1)

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of probabilities for a ``(batch, seq_len)`` tensor of token ids.

        The initial hidden and cell states are Xavier-initialised while the
        module is in training mode and left at zero in eval mode, so that
        inference on the same input always yields the same output.
        """
        # Allocate the states next to the weights (same device and dtype);
        # otherwise the LSTM call fails as soon as the model is moved off the CPU.
        weight = self.embedding.weight
        state_shape = (self.LSTM_layers, x.shape[0], self.hidden_dim)
        h = torch.zeros(state_shape, device=weight.device, dtype=weight.dtype)
        c = torch.zeros(state_shape, device=weight.device, dtype=weight.dtype)

        # Random initial states are only applied during training.
        if self.training:
            torch.nn.init.xavier_normal_(h)
            torch.nn.init.xavier_normal_(c)

        # Each sequence "x" is passed through an embedding layer
        out = self.embedding(x)
        # Feed LSTMs
        out, _ = self.lstm(out, (h, c))
        out = self.dropout(out)
        # The output at the last time step is taken
        out = torch.relu(self.fc1(out[:, -1, :]))
        out = self.dropout(out)
        out = torch.sigmoid(self.fc2(out))
        return out

## Model arguments

In [138]:
class Args:
    """Plain container for the model, optimiser and run settings used in this notebook."""

    def __init__(self):
        # Model / data shape
        self.batch_size = 64
        self.hidden_dim = 256
        self.lstm_layers = 2
        self.max_words = 200000

        # Optimiser settings
        self.learning_rate = 1e-3
        self.weight_decay = 1e-5
        self.beta1 = 0.9
        self.beta2 = 0.99
        self.adam_epsilon = 1e-8

        # Run control
        self.logging_interval = 10
        self.num_epochs = 10

        # Device: only the name is stored here; `device` starts out unset.
        self.device_name = 'cuda:0'
        self.device = None

## Train

In [None]:
def load_data(X, y, batch_size=64):
    """Wrap ``X`` and ``y`` in a ``DataHandler`` and return a ``DataLoader`` yielding batches of ``batch_size``."""
    return DataLoader(DataHandler(X, y), batch_size=batch_size)

def predict(model, inputs, device):
    """Move ``inputs`` to ``device``, run ``model`` on them and softmax the result over its last dimension."""
    on_device = inputs.to(device)
    return F.softmax(model(on_device), dim=-1)

def accuracy(outputs, labels):
    """Return the fraction of predictions that equal ``labels`` as a scalar tensor.

    Args:
        outputs: model outputs, as a tensor or array-like.
            * shape ``(N, C)`` with ``C > 1``: the predicted class is the
              argmax over dimension 1 (the original behaviour);
            * shape ``(N,)`` or ``(N, 1)``: values are treated as
              probabilities of the positive class and thresholded at 0.5.
        labels: ground-truth class ids (0/1 in the single-column case), as a
            tensor or array-like.
    """
    outputs = torch.as_tensor(outputs)
    labels = torch.as_tensor(labels)

    if outputs.dim() >= 2 and outputs.shape[1] > 1:
        predicted = outputs.max(1)[1]
    else:
        # A single column has no meaningful argmax (it is always 0), so
        # threshold instead; flatten both sides to avoid (N, 1) vs (N,)
        # broadcasting into an (N, N) comparison.
        predicted = outputs.reshape(-1) >= 0.5
        labels = labels.reshape(-1)

    return predicted.type_as(labels).eq(labels).float().mean()

def validate(dev_data, model):
    """Run ``model`` in eval mode over every batch of ``dev_data`` and collect its outputs.

    Args:
        dev_data: iterable of ``(x_batch, y_batch)`` pairs; only ``x_batch``
            is used.
        model: the module to evaluate. It is switched to eval mode and left
            that way.

    Returns:
        A list with one NumPy array per sample (one row of the model output
        each), in iteration order.
    """
    # Run on whatever device the model lives on; fall back to the CPU for a
    # module without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    model.eval()
    with torch.no_grad():
        for x_batch, _ in dev_data:
            x = torch.as_tensor(x_batch).to(device=device, dtype=torch.long)
            y_pred = model(x)
            # .cpu() is required before .numpy() when the model is on a GPU.
            predictions.extend(y_pred.detach().cpu().numpy())
    return predictions

def _binary_accuracy(probabilities, labels, threshold=0.5):
    """Return the fraction of thresholded probabilities that equal the 0/1 ``labels`` (NaN if empty)."""
    probs = np.asarray(probabilities, dtype=np.float32).reshape(-1)
    targets = np.asarray(labels).reshape(-1)
    if probs.size == 0:
        return float("nan")
    predicted = (probs >= threshold).astype(np.int64)
    return float((predicted == targets).mean())


def train(args, X_train, y_train, X_dev, y_dev):
    """Train an ``NBAModel`` and report loss and accuracy after every epoch.

    Args:
        args: settings object. Reads ``device``, ``learning_rate``,
            ``batch_size``, ``num_epochs`` and, when present, ``beta1``,
            ``beta2``, ``adam_epsilon``, ``weight_decay`` and
            ``logging_interval``.
        X_train, y_train: training token-id sequences and 0/1 labels.
        X_dev, y_dev: held-out sequences and 0/1 labels, evaluated after each
            epoch.

    Returns:
        The trained model.
    """
    model = NBAModel(args)
    model = model.to(getattr(args, "device", None))
    # Batches must follow the model onto whichever device it ended up on.
    device = next(model.parameters()).device

    # Use the optimiser settings carried by `args`; fall back to Adam's own
    # defaults for any that are missing.
    optimizer = optim.Adam(
        model.parameters(),
        lr=args.learning_rate,
        betas=(getattr(args, "beta1", 0.9), getattr(args, "beta2", 0.999)),
        eps=getattr(args, "adam_epsilon", 1e-8),
        weight_decay=getattr(args, "weight_decay", 0.0),
    )

    train_data = load_data(X_train, y_train, batch_size=args.batch_size)
    dev_data = load_data(X_dev, y_dev, batch_size=args.batch_size)
    log_every = getattr(args, "logging_interval", 0)

    for epoch in range(args.num_epochs):
        # Probabilities and labels are collected together so they stay aligned
        # batch by batch.
        train_predictions, train_labels = [], []
        loss_sum, samples_seen = 0.0, 0

        # train
        model.train()
        for step, (X_batch, y_batch) in enumerate(train_data, start=1):
            x = X_batch.to(device=device, dtype=torch.long)
            # (batch,) -> (batch, 1) to match the model's output shape.
            y = y_batch.to(device=device, dtype=torch.float).reshape(-1, 1)

            y_pred = model(x)
            loss = F.binary_cross_entropy(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_len = y.shape[0]
            loss_sum += loss.item() * batch_len
            samples_seen += batch_len
            # reshape(-1) rather than squeeze() so a final batch of one sample
            # still yields a 1-D sequence.
            train_predictions.extend(y_pred.detach().cpu().reshape(-1).tolist())
            train_labels.extend(y.detach().cpu().reshape(-1).tolist())

            if log_every and step % log_every == 0:
                print(f"{datetime.now()}: Epoch {epoch + 1}, batch {step}/{len(train_data)}, loss {loss.item():.8f}")

        # dev
        print(f"{datetime.now()}: Validating")
        dev_predictions = validate(dev_data, model)
        train_acc = _binary_accuracy(train_predictions, train_labels)
        # The dev loader is not shuffled, so predictions line up with y_dev.
        dev_acc = _binary_accuracy(dev_predictions, y_dev)
        mean_loss = loss_sum / samples_seen if samples_seen else float("nan")

        print(f"\nEpoch: {epoch + 1}/{args.num_epochs}, Loss: {mean_loss:.8f}, Train accuracy: {train_acc:.8f}, Val accuracy: {dev_acc:.8f}")

    return model


In [145]:
# Build the default settings and start a training run on the train/dev splits.
args = Args()
train(
    args,
    X_train=X_train,
    y_train=y_train,
    X_dev=X_dev,
    y_dev=y_dev,
)

KeyboardInterrupt: 