In [3]:
import numpy as np
import pandas as pd

In [1]:
import time
import torch
import random
from torch import nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

In [198]:
class LSTMNet(nn.Module):
    """LSTM encoder followed by a linear head applied to the last time step.

    Inputs are batch-first sequences of shape ``(batch, seq_len, input_size)``.
    The output has shape ``(batch, output_dim)`` and contains raw scores
    (no activation is applied).

    Args:
        input_size: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of values produced per sequence.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size=1, hidden_dim=100, output_dim=1, n_layers=1):
        super(LSTMNet, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.output_dim = output_dim
        # Kept for backward compatibility; forward() follows the input's device.
        self.device = torch.device('cpu')
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)

        self.fc = nn.Linear(hidden_dim, output_dim)
        # Not applied in forward(); retained so existing attribute access keeps working.
        self.relu = nn.ReLU()

    def forward(self, x):
        """Encode ``x`` and project the last time step to ``output_dim`` scores.

        Args:
            x: Float tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        # Zero initial hidden and cell states, created with the dtype and device
        # of the input. Drawing fresh random states on every call (as was done
        # before) makes predictions non-deterministic, even in eval() mode.
        state_shape = (self.n_layers, x.size(0), self.hidden_dim)
        hidden = x.new_zeros(state_shape)
        cell = x.new_zeros(state_shape)

        out, _ = self.lstm(x, (hidden, cell))
        # Only the output at the final time step feeds the classifier head.
        return self.fc(out[:, -1, :])

In [131]:
# input

import pandas as pd
from tqdm import tqdm 
import numpy as np



# Load the train and development splits from paths relative to the notebook.
# NOTE(review): the training file has an .xls extension but is parsed with
# read_csv — confirm it really is delimited text and not an Excel workbook.
train_df = pd.read_csv('../data/training.xls')
test_df = pd.read_csv('../data/development.csv')

def aggregate_users(df):
    """Collapse a tweet-level frame into one row per user.

    A user is identified by the combination of the ``label``, ``gender``,
    ``profession``, ``ideology_binary`` and ``ideology_multiclass`` columns.
    For each distinct combination (in order of first appearance, missing
    values included) the tweets of every row sharing that ``label`` are
    concatenated with ``' [SEP] '``; missing tweets contribute an empty string.

    Args:
        df: DataFrame holding the five user columns plus a ``tweet`` column.

    Returns:
        A new DataFrame with one row per user: the five user columns and the
        merged ``tweet`` text. The input frame is not modified.
    """
    columns_to_group_by_user = ['label', 'gender', 'profession', 'ideology_binary', 'ideology_multiclass']

    # Distinct user rows, first-appearance order, NaN treated as a regular value.
    # drop_duplicates gives this directly; the previous groupby/agg detour
    # passed groupby options (as_index, observed) to agg(), which is fragile
    # across pandas versions.
    df_users = df[columns_to_group_by_user].drop_duplicates().reset_index(drop = True)

    merged_fields = []

    pbar = tqdm(df_users.to_dict('records'), total = df_users.shape[0], desc = "merging users")

    for user in pbar:
        # Tweets are matched on `label` alone, as before.
        df_user = df[df['label'] == user['label']]
        merged_fields.append({**user, 'tweet': ' [SEP] '.join(df_user['tweet'].fillna(''))})

    return pd.DataFrame(merged_fields)

# Collapse both splits to one row per user. The tweet-level frames are replaced
# under the same names, so re-running this cell without reloading the data
# would aggregate an already aggregated frame.
train_df = aggregate_users(train_df)
test_df = aggregate_users(test_df)

merging users: 100%|██████████| 314/314 [00:00<00:00, 478.81it/s]
merging users: 100%|██████████| 101/101 [00:00<00:00, 1305.77it/s]


In [132]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

# Previously saved Word2Vec embeddings, read from the notebook's working directory.
# NOTE(review): this global `model` is the embedding model; train() and
# evaluate() use `model` as a parameter name for the classifier instead.
model = Word2Vec.load("word2vec.model")

In [146]:
def return_wvecs(train_df, max_len=6000, pad_token='[PAD]'):
    """Turn each user's merged tweets into a fixed-length matrix of word vectors.

    Every entry of ``train_df.tweet`` is split on ``'[SEP]'`` and then on single
    spaces, truncated or right-padded with ``pad_token`` to exactly ``max_len``
    tokens, and each token is looked up in the global Word2Vec ``model``.
    Tokens missing from the vocabulary map to an all-zero vector.

    Args:
        train_df: DataFrame with a ``tweet`` column of strings.
        max_len: Number of tokens kept per user (default 6000).
        pad_token: Token appended to sequences shorter than ``max_len``.

    Returns:
        float32 array of shape ``(len(train_df), max_len, vector_size)``.
        float32 is enforced so that the zero fallback no longer upcasts the
        whole array to float64.
    """
    keyed_vectors = model.wv
    # One shared zero vector for unknown tokens, sized from the embedding model
    # instead of a hard-coded 100. np.asarray below copies it into the result.
    unknown_vec = np.zeros(keyed_vectors.vector_size, dtype=np.float32)

    x_train = []
    for sen in tqdm(train_df.tweet):

        sens = []
        for ss in sen.split('[SEP]'):
            sens.extend(ss.split(' '))

        # Truncate to max_len, then right-pad up to max_len.
        sens = sens[:max_len]
        sens.extend([pad_token] * (max_len - len(sens)))

        wvs = []
        for w in sens:
            try:
                wvs.append(keyed_vectors[w])
            except KeyError:
                # Out-of-vocabulary token. Only KeyError is swallowed so real
                # failures (and Ctrl-C) still surface.
                wvs.append(unknown_vec)

        x_train.append(np.asarray(wvs, dtype=np.float32))

    x_train = np.asarray(x_train)
    return x_train

# Embed both splits into dense per-user arrays (one matrix of word vectors per user).
xtrain = return_wvecs(train_df)
xtest = return_wvecs(test_df)

100%|██████████| 314/314 [00:02<00:00, 116.77it/s]
100%|██████████| 101/101 [00:01<00:00, 100.37it/s]


In [147]:
# Sanity check: (users, tokens per user, embedding size)
xtrain.shape

(314, 6000, 100)

In [163]:
from sklearn import preprocessing

# Integer-encode the gender labels. The encoder is fitted on the training
# split only and then applied to both splits.
le = preprocessing.LabelEncoder()
le.fit(train_df.gender)

ytrain, ytest = (le.transform(split_df.gender) for split_df in (train_df, test_df))


In [244]:
class taskdata(Dataset):
    """Map-style dataset pairing one feature matrix with one label per sample."""

    def __init__(self, x_train, y_train):
        # Both containers are stored as given: no copy, no tensor conversion.
        self.xtrain, self.ytrain = x_train, y_train

    def __len__(self):
        """Number of samples, taken from the feature container."""
        n_samples = len(self.xtrain)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``.

        The feature container is indexed along three axes, so it must be a
        3-D array-like.
        """
        features = self.xtrain[idx, :, :]
        target = self.ytrain[idx]
        return features, target

In [245]:
# Shuffled mini-batches of 32 users for training.
traindataloader = DataLoader(
    taskdata(xtrain, ytrain),
    batch_size=32,
    shuffle=True,
)
# One user at a time, in file order, for evaluation.
testdataloader = DataLoader(
    taskdata(xtest, ytest),
    batch_size=1,
    shuffle=False,
)

In [277]:
def train(model, trainloader, params, num=None):
    """Train ``model`` with Adam and cross-entropy loss.

    Args:
        model: ``nn.Module`` mapping a float batch to ``(batch, n_classes)`` logits.
        trainloader: Iterable yielding ``(features, label)`` batches.
        params: Dict with ``'lr'`` (learning rate) and ``'epochs'`` (number of
            passes over ``trainloader``).
        num: Unused; kept for backward compatibility.

    Returns:
        The same ``model`` object, trained in place and left in training mode.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])

    # Run on whatever device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()

    for epoch in range(params['epochs']):

        # Sum of the per-batch losses seen so far in this epoch.
        loss_sum = 0
        counter = 0

        for x, label in tqdm(trainloader, desc='Training'):
            counter += 1

            optimizer.zero_grad()

            # Logits stay (batch, n_classes). Squeezing dim 0 would drop the
            # batch axis for a batch of size 1 and break the loss call.
            out = model(x.to(device).float())

            label = label.to(device)
            if not torch.is_floating_point(label):
                # Class-index targets must be int64 for CrossEntropyLoss.
                label = label.long()

            loss = criterion(out, label)
            loss.backward()

            optimizer.step()

            loss_sum += loss.item()

            if counter % 200 == 0:
                print(f"Epoch : {epoch}, Step: {counter}/{len(trainloader)} ==> Avg Loss for epoch: {loss_sum/counter}")
        # Reports the summed (not averaged) batch loss of the epoch.
        print(f"epoch {epoch+1}, loss {loss_sum}")

    return model

In [278]:
# Training hyper-parameters read by train(): number of epochs and Adam learning rate.
params = dict(epochs=10, lr=0.001)

In [279]:
# Two-class classifier over 100-dimensional word vectors; the remaining
# LSTMNet arguments keep their defaults. train() returns the same module.
net = LSTMNet(input_size=100, output_dim=2)
net = train(net, traindataloader, params)

Training: 100%|██████████| 10/10 [00:34<00:00,  3.42s/it]
Training:   0%|          | 0/10 [00:00<?, ?it/s]

epoch 1, loss 6.815458416938782


Training: 100%|██████████| 10/10 [00:34<00:00,  3.49s/it]
Training:   0%|          | 0/10 [00:00<?, ?it/s]

epoch 2, loss 6.66422826051712


Training: 100%|██████████| 10/10 [00:35<00:00,  3.58s/it]
Training:   0%|          | 0/10 [00:00<?, ?it/s]

epoch 3, loss 6.604955196380615


Training: 100%|██████████| 10/10 [00:36<00:00,  3.65s/it]
Training:   0%|          | 0/10 [00:00<?, ?it/s]

epoch 4, loss 6.5525548458099365


Training: 100%|██████████| 10/10 [00:38<00:00,  3.89s/it]
Training:   0%|          | 0/10 [00:00<?, ?it/s]

epoch 5, loss 6.4270671010017395


Training: 100%|██████████| 10/10 [00:39<00:00,  3.93s/it]
Training:   0%|          | 0/10 [00:00<?, ?it/s]

epoch 6, loss 6.256490409374237


Training: 100%|██████████| 10/10 [00:41<00:00,  4.12s/it]
Training:   0%|          | 0/10 [00:00<?, ?it/s]

epoch 7, loss 6.157841324806213


Training: 100%|██████████| 10/10 [00:43<00:00,  4.34s/it]
Training:   0%|          | 0/10 [00:00<?, ?it/s]

epoch 8, loss 5.892782688140869


Training: 100%|██████████| 10/10 [00:45<00:00,  4.51s/it]
Training:   0%|          | 0/10 [00:00<?, ?it/s]

epoch 9, loss 5.7959787249565125


Training: 100%|██████████| 10/10 [00:47<00:00,  4.75s/it]

epoch 10, loss 5.651024222373962





In [273]:
def evaluate(model, testdataloader, params):
    """Predict a class for every sample in ``testdataloader``.

    Args:
        model: ``nn.Module`` mapping a float batch to ``(batch, n_classes)`` scores.
        testdataloader: Iterable yielding ``(features, label)`` batches of any size.
        params: Unused; kept for backward compatibility.

    Returns:
        Tuple ``(outputs, targets)`` of 1-D numpy arrays holding the predicted
        and the true class ids, in loader order.
    """
    # Run on whatever device the model lives on (CPU if it has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    outputs = []
    targets = []

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for x, y in tqdm(testdataloader, desc = 'Evaluating'):
            out = model(x.to(device).float())
            # argmax over the class axis gives one prediction per sample; a
            # global argmax is only correct when the batch holds one sample.
            predictions = out.argmax(dim=-1).reshape(-1)
            outputs.extend(predictions.cpu().numpy())
            targets.extend(y.numpy())

    return np.asarray(outputs), np.asarray(targets)

In [280]:
# o: predicted class ids, t: true class ids (both returned by evaluate()).
o,t = evaluate(net, testdataloader, params)

Evaluating: 100%|██████████| 101/101 [00:43<00:00,  2.31it/s]


In [281]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score
# Micro-averaged F1 over the development predictions.
# NOTE(review): `score` is computed but never displayed, and confusion_matrix
# is imported but not used in this cell.
score = f1_score(t, o, average='micro')
# Per-class precision / recall / F1 as a text table.
cr = classification_report(t, o)

print(cr)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        48
           1       0.52      1.00      0.69        53

    accuracy                           0.52       101
   macro avg       0.26      0.50      0.34       101
weighted avg       0.28      0.52      0.36       101



In [None]:
References:

- https://stackoverflow.com/questions/61632584/understanding-input-shape-to-pytorch-lstm
- https://towardsdatascience.com/text-classification-with-pytorch-7111dae111a6