<a href="https://colab.research.google.com/github/Alam710/Deep-Learning-and-Its-Applications/blob/main/word_embedding_fastText_dan_layer_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset and embedding paths
# configured in a later cell live under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import io
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 
import os, re, csv, math, codecs
from sklearn import model_selection
from sklearn import metrics
import torch
import torch.nn as nn
import tensorflow as tf

# Seed PyTorch's global random number generator (trailing ';' suppresses the cell output).
# NOTE(review): this call does not seed NumPy/pandas, so any pandas sampling is unaffected by it.
torch.manual_seed(2301978431);

In [None]:
# Input locations on the mounted Google Drive.
# dataset_path: CSV read with pandas below (the code uses its `review` and `sentiment` columns).
dataset_path = '/content/gdrive/MyDrive/fastt/IMDB Dataset.csv'
# w2v_path: text file of pretrained word vectors, parsed line by line further down.
w2v_path = '/content/gdrive/MyDrive/fastt/wiki-news-300d-1M.vec'

In [None]:
# Load the raw reviews into a DataFrame.
df = pd.read_csv(dataset_path)

In [None]:
# Encode the labels as integers: 'positive' -> 1, anything else -> 0.
# Guarded so that re-running this cell does not map the already-encoded
# integers to 0 (none of them equals the string 'positive').
if not pd.api.types.is_numeric_dtype(df.sentiment):
    df.sentiment = df.sentiment.apply(lambda x: 1 if x=='positive' else 0)

# Fixed seed for the row shuffle: torch.manual_seed() does not affect pandas,
# so without it the fold membership would differ on every run.
SHUFFLE_SEED = 42

df['kfold'] = -1
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
y = df.sentiment.values
# Stratified splitting keeps the positive/negative ratio the same in every fold.
kf = model_selection.StratifiedKFold(n_splits=5)

# Tag each row with the index of the fold in which it is used for validation.
for fold, (train_, valid_) in enumerate(kf.split(X=df, y=y)):
    df.loc[valid_, 'kfold'] = fold

In [None]:
# Display the shuffled frame with its encoded `sentiment` and new `kfold` columns.
df

Unnamed: 0,review,sentiment,kfold
0,I found out about this film because Jewish Ben...,0,0
1,I saw this at my in-laws' house one night when...,0,0
2,The idea of making a miniseries about the Berl...,0,0
3,This movie was recommended to me so we went to...,0,0
4,A 1957 Roger Corman non epic in which a sundry...,0,0
...,...,...,...
49995,"Though predictable and contrived, not a bad mo...",0,4
49996,"First off, this is the worst movie I've ever s...",0,4
49997,Definitely the worst movie I have ever seen......,0,4
49998,A bit quirky and bordering bad taste; but inte...,1,4


In [None]:
# Parse the pretrained vectors into a {word: float32 vector} dictionary.
fasttext_embedding = {}
# The context manager closes the file even if a row fails to parse.
# newline='\n' makes '\n' the only record separator, so unusual Unicode
# line-break characters inside a token cannot split a row in two.
with open(w2v_path, encoding='utf-8', newline='\n') as f:
  for line_number, line in enumerate(tqdm(f)):
    values = line.rstrip().split(' ')
    # A .vec file starts with a "<vocab_size> <dimension>" header. Skip it,
    # otherwise it is stored as a "word" with a one-element vector, which
    # NumPy would later broadcast across a whole embedding row.
    if line_number == 0 and len(values) == 2:
      continue
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    fasttext_embedding[word] = coefs

999995it [01:09, 14368.30it/s]


In [None]:
class IMDBDataset:
  """Indexable dataset that pairs rows of token ids with their labels.

  `reviews` is indexed as `reviews[index, :]`, so it must support
  two-dimensional indexing (e.g. a 2-D NumPy array); `targets` is indexed
  with a single position.
  """

  def __init__(self, reviews, targets):
    """Keep references to the inputs; nothing is copied or converted here."""
    self.reviews = reviews
    self.target = targets

  def __len__(self):
    """Number of samples, taken from the length of `reviews`."""
    return len(self.reviews)

  def __getitem__(self, index):
    """Return one sample as `{'review': long tensor, 'target': float tensor}`."""
    token_row = self.reviews[index, :]
    label = self.target[index]
    return dict(
      review=torch.tensor(token_row, dtype=torch.long),
      target=torch.tensor(label, dtype=torch.float),
    )

In [None]:
def create_embedding_matrix(word_index, embedding_dict=None, d_model=100):
  """Build a (len(word_index) + 1, d_model) matrix of pretrained vectors.

  Args:
    word_index: mapping word -> integer row index. Row 0 is never written by
      this function and stays all zeros.
    embedding_dict: mapping word -> vector with `d_model` elements. `None`
      (the default) is treated as an empty mapping, giving an all-zero matrix.
    d_model: number of columns, i.e. the expected vector size.

  Returns:
    A float64 NumPy array; rows of words missing from `embedding_dict` are zero.

  Raises:
    ValueError: if a matched vector does not contain exactly `d_model` values.
  """
  embedding_matrix = np.zeros((len(word_index) + 1, d_model))
  if embedding_dict is None:
    # The declared default used to crash on `word in None`.
    return embedding_matrix
  for word, index in word_index.items():
    if word in embedding_dict:
      vector = np.asarray(embedding_dict[word])
      # Check the size explicitly: NumPy would silently broadcast a
      # one-element vector across the whole row instead of failing.
      if vector.size != d_model:
        raise ValueError(
          f'embedding for {word!r} has {vector.size} values, expected {d_model}')
      embedding_matrix[index] = vector.reshape(d_model)
  return embedding_matrix

In [None]:
class LSTM(nn.Module):
  """Bidirectional LSTM classifier on top of frozen pretrained embeddings.

  The LSTM outputs are mean-pooled and max-pooled over the sequence axis,
  concatenated, and projected to a single raw logit per sample (no sigmoid
  is applied here).

  Args:
    embedding_matrix: 2-D array of shape (num_words, embedding_dim) holding
      the pretrained vectors; copied into a non-trainable embedding layer.
    hidden_size: hidden units per LSTM direction (default 128, which gives
      the 512 input features of the output layer).
  """
  def __init__(self, embedding_matrix, hidden_size=128):
    super(LSTM, self).__init__()
    embedding_dim = embedding_matrix.shape[1]
    pretrained = torch.tensor(embedding_matrix, dtype = torch.float32)
    # from_pretrained builds the layer directly from the given weights, so no
    # throw-away random initialisation of a (num_words, embedding_dim) matrix.
    self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)
    self.lstm = nn.LSTM(embedding_dim, hidden_size, bidirectional=True, batch_first=True)
    # 2 directions x 2 pooled summaries (mean and max) per sample.
    self.out = nn.Linear(4 * hidden_size, 1)
  def forward(self, x):
    """Map a (batch, seq_len) tensor of token ids to (batch, 1) logits."""
    x = self.embedding(x)
    # batch_first=True: hidden is (batch, seq_len, 2 * hidden_size).
    hidden, _ = self.lstm(x)
    avg_pool = torch.mean(hidden, 1)
    max_pool, _ = torch.max(hidden, 1)
    out = torch.cat((avg_pool, max_pool), 1)
    out = self.out(out)
    return out

In [None]:
def train(data_loader, model, optimizer, device):
  """Run one optimisation pass over every batch in `data_loader`.

  Each batch is a dict with a 'review' entry (model input) and a 'target'
  entry (labels). The model output is scored with binary cross-entropy on
  logits against the labels reshaped to a column. Returns nothing; `model`
  and `optimizer` are updated in place.
  """
  criterion = nn.BCEWithLogitsLoss()
  model.train()
  for batch in data_loader:
    token_ids = batch['review'].to(device, dtype=torch.long)
    labels = batch['target'].to(device, dtype=torch.float)

    # Clear gradients accumulated by the previous step before the new backward pass.
    optimizer.zero_grad()
    logits = model(token_ids)
    batch_loss = criterion(logits, labels.view(-1, 1))
    batch_loss.backward()
    optimizer.step()

In [None]:
def evaluate(data_loader, model, device):
  """Collect model outputs and labels for every batch in `data_loader`.

  The model is switched to eval mode and run without gradient tracking.

  Returns:
    (final_predictions, final_targets): two Python lists. Predictions are
    the model outputs converted with `.tolist()` and are NOT post-processed;
    for a model that emits logits the caller must apply the sigmoid or
    threshold at 0. Targets are read straight from each batch.
  """
  final_predictions = []
  final_targets = []
  model.eval()
  with torch.no_grad():
    for data in data_loader:
      reviews = data['review'].to(device, dtype = torch.long)
      predictions = model(reviews)
      final_predictions.extend(predictions.cpu().numpy().tolist())
      # Targets are only collected, never fed to the model, so they are not
      # moved to `device` first.
      final_targets.extend(data['target'].cpu().numpy().tolist())
  return final_predictions, final_targets

In [None]:
# Training configuration.
MAX_LEN = 128          # length every tokenised review is padded/truncated to
TRAIN_BATCH_SIZE = 16  # batch size of the training DataLoader
VALID_BATCH_SIZE = 8   # batch size of the validation DataLoader
EPOCHS = 5             # training passes per fold

In [None]:
# Build the word -> integer index vocabulary from every review in the frame
# (training and validation folds alike); tokenizer.word_index is later used
# to build the embedding matrix.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df.review.values.tolist())

In [None]:
# One row of pretrained 300-d vectors per word in the tokenizer vocabulary.
embedding_matrix = create_embedding_matrix(tokenizer.word_index, embedding_dict=fasttext_embedding, d_model=300)

# Use the GPU when one is attached, otherwise fall back to the CPU instead of
# crashing on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 5-fold cross-validation: each fold is held out once while a fresh model is
# trained on the remaining four.
for fold in range(5):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    
    # Words -> integer ids, then pad/truncate every review to MAX_LEN ids.
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)
    
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(xtrain, maxlen=MAX_LEN)
    xtest = tf.keras.preprocessing.sequence.pad_sequences(xtest, maxlen=MAX_LEN)
    
    train_dataset = IMDBDataset(reviews=xtrain, targets=train_df.sentiment.values)
    
    train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size = TRAIN_BATCH_SIZE, num_workers=2)
    valid_dataset = IMDBDataset(reviews=xtest, targets=valid_df.sentiment.values)
    valid_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size = VALID_BATCH_SIZE, num_workers=1)
    
    # A new model and optimizer per fold so no weights leak between folds.
    model_fasttext = LSTM(embedding_matrix)
    model_fasttext.to(device)
    optimizer = torch.optim.Adam(model_fasttext.parameters(), lr=1e-3)
    
    print('training model')
   
    for epoch in range(EPOCHS):
        train(train_data_loader, model_fasttext, optimizer, device)
        outputs, targets = evaluate(valid_data_loader, model_fasttext, device)
        # evaluate() returns the model's raw logits (the model is trained with
        # BCEWithLogitsLoss and applies no sigmoid). Probability 0.5 corresponds
        # to logit 0, so the decision threshold on logits is 0, not 0.5.
        outputs = np.array(outputs) >= 0.0
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f'FOLD:{fold}, epoch: {epoch}, accuracy_score: {accuracy}')

training model
FOLD:0, epoch: 0, accuracy_score: 0.8667
FOLD:0, epoch: 1, accuracy_score: 0.8818
FOLD:0, epoch: 2, accuracy_score: 0.8875
FOLD:0, epoch: 3, accuracy_score: 0.8923
FOLD:0, epoch: 4, accuracy_score: 0.8967
training model
FOLD:1, epoch: 0, accuracy_score: 0.8596
FOLD:1, epoch: 1, accuracy_score: 0.8847
FOLD:1, epoch: 2, accuracy_score: 0.892
FOLD:1, epoch: 3, accuracy_score: 0.897
FOLD:1, epoch: 4, accuracy_score: 0.8928
training model
FOLD:2, epoch: 0, accuracy_score: 0.874
FOLD:2, epoch: 1, accuracy_score: 0.8924
FOLD:2, epoch: 2, accuracy_score: 0.8994
FOLD:2, epoch: 3, accuracy_score: 0.8998
FOLD:2, epoch: 4, accuracy_score: 0.8971
training model
FOLD:3, epoch: 0, accuracy_score: 0.862
FOLD:3, epoch: 1, accuracy_score: 0.886
FOLD:3, epoch: 2, accuracy_score: 0.8909
FOLD:3, epoch: 3, accuracy_score: 0.8963
FOLD:3, epoch: 4, accuracy_score: 0.8946
training model
FOLD:4, epoch: 0, accuracy_score: 0.8676
FOLD:4, epoch: 1, accuracy_score: 0.8836
FOLD:4, epoch: 2, accuracy_s