# Preparing data
## Data download:
https://drive.google.com/file/d/1zvc4_mKBpEhFWVju91KRIM0uQc6kR-S0/view?usp=sharing

# Note, add some commets

In [1]:
# Reading data

import pandas as pd
from tqdm import tqdm
import numpy as np

data = pd.read_csv('./IMDB_data/labeledTrain.csv')

FileNotFoundError: File b'./IMDB_data/labeledTrain.csv' does not exist

In [2]:
# For quick experiments, we only pick 100 data

train_data = data['review'].tolist()[:100]
train_target = data['sentiment'].tolist()[:100]

test_data = data['review'].tolist()[100:150]
test_target = data['sentiment'].tolist()[100:150]

In [3]:
# Removing stopwords, using NLTK
from nltk.corpus import stopwords

sw = stopwords.words("english")
stopWordDict = dict(zip(sw, list(range(len(sw)))))

In [4]:
# Checking the SW is in sentnece or not

train_remove_sw = []
for review in tqdm(train_data):
    temp = ''
    for word in review.split():
        if word not in stopWordDict:
            temp += word+' '
    train_remove_sw.append(temp.strip())
    
test_remove_sw = []
for review in tqdm(test_data):
    temp = ''
    for word in review.split():
        if word not in stopWordDict:
            temp += word+' '
    test_remove_sw.append(temp.strip())

100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 12450.81it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 8356.52it/s]


# One-hot encoding

In [8]:
import numpy as np

#Building the dictionary(Vocabulary)
token_index = {}

for sample in tqdm(train_remove_sw):
    for word in sample.split():
        if word not in token_index:
            token_index[word] = len(token_index)+ 1

#looking for max_length (All data time step follow this one)
max_length = 0
for s in train_remove_sw:
    if len(s)>max_length:
        max_length = len(s)

train_results = np.zeros((len(train_remove_sw), max_length, max(token_index.values())+1)) #(data_size, time_step, word_dim)
for i,smaple in tqdm(enumerate(train_remove_sw)):
    for j,word in list(enumerate(sample.split())):
        train_index = token_index.get(word)
        train_results[i,j,train_index] = 1

100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 25048.10it/s]
100it [00:00, 7161.43it/s]


In [9]:
train_results.shape

(100, 3938, 4704)

In [17]:
test_results = np.zeros((len(test_remove_sw), max_length, max(token_index.values())+1))
for i,smaple in tqdm(enumerate(test_remove_sw)):
    for j,word in list(enumerate(sample.split())):
        test_index = token_index.get(word)
        test_results[i,j,test_index] = 1

50it [00:00, 6267.08it/s]


In [19]:
test_results.shape

(50, 3938, 4704)

In [20]:
# Do "mean" to represent the sentence

X_train = train_results.mean(1)
y_train = train_target

X_test = test_results.mean(1)
y_test = test_target

In [21]:
#Simple Linear
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions = clf.predict(X_test)

#Metric for CLS task
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.56      1.00      0.72        28
           1       0.00      0.00      0.00        22

    accuracy                           0.56        50
   macro avg       0.28      0.50      0.36        50
weighted avg       0.31      0.56      0.40        50



  _warn_prf(average, modifier, msg_start, len(result))


# TF-IDF

In [23]:
# From sklearn to use TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_remove_sw)
X_test = tfidf_vectorizer.transform(test_remove_sw)

In [24]:
X_train.shape

(100, 4550)

In [25]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions = clf.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.59      0.96      0.73        28
           1       0.75      0.14      0.23        22

    accuracy                           0.60        50
   macro avg       0.67      0.55      0.48        50
weighted avg       0.66      0.60      0.51        50



# N-gram

In [36]:
context_size = 1
EMBEDDING_DIM = 64

In [37]:
# Build Bi-gram data

bigram=[]
for sentence in train_remove_sw:
    tmp = sentence.split()
    for i in range(len(tmp)-1):
        bigram.append((tmp[i], tmp[i+1]))

In [38]:
bigram[1]

('going', 'moment')

In [39]:
#Build vocabulary

words=[]
for s in train_remove_sw:
    tmp = s.split()
    for w in tmp:
        words.append(w)
        
vocb = set(words)
vocb.add('unk')
word_to_idx = {word: i for i, word in enumerate(vocb)}
idx_to_word = {word_to_idx[word]: word for word in word_to_idx}

In [40]:
word_to_idx

{'dealing': 0,
 'represents': 1,
 'generator': 2,
 'rate': 3,
 'ending': 4,
 'pascow': 5,
 'hes': 6,
 'uniquely': 7,
 'clichés': 8,
 'charactersmy': 9,
 'section': 10,
 'ham': 11,
 'graduate': 12,
 'townein': 13,
 '1600': 14,
 'scenebottom': 15,
 'hisher': 16,
 'argues': 17,
 'nose-dives': 18,
 'botched': 19,
 'hammerhead:': 20,
 'accident': 21,
 'familial': 22,
 'enchanting': 23,
 'surprising': 24,
 'harm': 25,
 'nikah': 26,
 'claim': 27,
 'subplot': 28,
 'noted': 29,
 'eric': 30,
 'soul': 31,
 'ruth': 32,
 'incarnate': 33,
 '93%': 34,
 'restavoid': 35,
 'tackier': 36,
 'unpredictable': 37,
 'fiona': 38,
 'evocation': 39,
 'rent': 40,
 'prado': 41,
 'nudity!': 42,
 'anxious': 43,
 'free!': 44,
 'yummy': 45,
 'security': 46,
 'go': 47,
 'loving': 48,
 'want': 49,
 'cops': 50,
 'disown': 51,
 'choo-chooceleste': 52,
 'tried': 53,
 'persuasive': 54,
 'execution': 55,
 'thrown-together': 56,
 'carters': 57,
 'gigli': 58,
 'him;': 59,
 'rank': 60,
 'hodet': 61,
 'flashpoint': 62,
 'seenthe

In [42]:
#Build the bi-gram model

import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

class n_gram(nn.Module):
    def __init__(self, vocab_size, n_dim=EMBEDDING_DIM):
        super(n_gram, self).__init__()

        self.embed = nn.Embedding(vocab_size, n_dim)
        self.classify = nn.Sequential(
            nn.Linear(n_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, vocab_size)
        )

    def forward(self, x):
        voc_embed = self.embed(x)
        voc_embed2 = voc_embed.view(1, -1)
        out = self.classify(voc_embed2)
        return out

In [43]:
net = n_gram(len(word_to_idx))
net = net.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-4)

In [44]:
for e in range(20):
    net.train()
    train_loss = 0
    for word, label in tqdm(bigram):
        word = Variable(torch.LongTensor([word_to_idx[word]])).cuda()
        label = Variable(torch.LongTensor([word_to_idx[label]])).cuda()
   
        out = net(word)
        loss = criterion(out, label)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (e + 1) % 5 == 0:
        print('epoch: {}, Loss: {:.6f}'.format(e + 1, train_loss / len(bigram)))

100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:02<00:00, 172.31it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:26<00:00, 124.42it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:33<00:00, 114.04it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:35<00:00, 112.56it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:34<00:00, 112.82it/s]
  0%|                                                                              | 12/10702 [00:00<01:36, 110.82it/s]

epoch: 5, Loss: 7.534471


100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:34<00:00, 112.76it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:37<00:00, 110.11it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:35<00:00, 112.38it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:35<00:00, 111.95it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:35<00:00, 112.44it/s]
  0%|                                                                              | 12/10702 [00:00<01:30, 118.17it/s]

epoch: 10, Loss: 6.805270


100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:36<00:00, 110.58it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:33<00:00, 114.39it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:33<00:00, 114.20it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:34<00:00, 112.73it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:34<00:00, 112.65it/s]
  0%|▏                                                                             | 24/10702 [00:00<01:30, 118.50it/s]

epoch: 15, Loss: 6.540151


100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:32<00:00, 115.75it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:34<00:00, 112.67it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:36<00:00, 111.16it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:41<00:00, 105.32it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:39<00:00, 107.64it/s]

epoch: 20, Loss: 6.488505





In [45]:
net = net.eval()

In [46]:
word, label = bigram[10]
print('input: {}'.format(word))
print('label: {}'.format(label))
print('\n')

word = Variable(torch.LongTensor([word_to_idx[word]])).cuda()
out = net(word)
pred_label_idx = out.max(1)[1].item()
predict_word = idx_to_word[pred_label_idx]
print('real word is {}, predicted word is {}'.format(label, predict_word))

input: documentary
label: watched


real word is watched, predicted word is style


In [47]:
net.embed(word)

tensor([[-0.0423,  0.0522, -0.0436,  0.0518,  0.0422,  0.0138, -0.0937, -0.0321,
          0.0788,  0.0339, -0.0821,  0.0825,  0.0375, -0.0528,  0.0366,  0.0319,
         -0.0503, -0.0442, -0.0662,  0.0073, -0.0746, -0.0539,  0.0503,  0.0590,
          0.0476, -0.0331,  0.0760,  0.0718,  0.0796,  0.0286,  0.0114,  0.0614,
          0.0066, -0.0403, -0.0022, -0.0534,  0.0388,  0.0362,  0.0146,  0.1061,
         -0.1104, -0.0167, -0.0653, -0.0241,  0.1121, -0.0115, -0.0447,  0.0528,
          0.0389,  0.0418,  0.0164,  0.0319, -0.0150, -0.0541, -0.0345, -0.0407,
         -0.0325,  0.0569, -0.0245,  0.0286, -0.0717, -0.0570, -0.0874, -0.0314]],
       device='cuda:0', grad_fn=<EmbeddingBackward>)

In [48]:
#Now we using the embedding layer to build the dataset we need

train = []
for s in train_remove_sw:
    tmp = []
    for w in s.split():
        if w not in vocb:
            word = Variable(torch.LongTensor([word_to_idx['unk']])).cuda()
            emb = net.embed(word)
            tmp.append(emb)
        else:
            word = Variable(torch.LongTensor([word_to_idx[w]])).cuda()
            emb = net.embed(word)
            tmp.append(emb)
    train.append(np.array(tmp).sum())
    
train_vec = torch.cat(train, 0)

In [62]:
test = []
for s in test_remove_sw:
    tmp = []
    for w in s.split():
        if w not in vocb:
            word = Variable(torch.LongTensor([word_to_idx['unk']])).cuda()
            emb = net.embed(word)
            tmp.append(emb)
        else:
            word = Variable(torch.LongTensor([word_to_idx[w]])).cuda()
            emb = net.embed(word)
            tmp.append(emb)
    test.append(np.array(tmp).sum())
    
test_vec = torch.cat(test, 0)

In [63]:
train_vec.shape

torch.Size([100, 64])

In [61]:
clf = LogisticRegression(random_state=0).fit(train_vec.data.cpu().numpy(), y_train)
predictions=clf.predict(test_vec.data.cpu().numpy())

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.68      0.68      0.68        28
           1       0.59      0.59      0.59        22

    accuracy                           0.64        50
   macro avg       0.63      0.63      0.63        50
weighted avg       0.64      0.64      0.64        50



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Word2Vec

In [14]:
import torch
import torch.utils.data.dataloader as dataloader
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim
import numpy as np
from collections import Counter

In [15]:
# Build Vocabulary

raw_text = []
for x in train_remove_sw:
    raw_text.extend(x.split(' '))
raw_text = [x for x in raw_text if x != '']

vocab = set(raw_text)
vocab.add('unk')
vocab_size = len(vocab)
freqs = Counter(raw_text)

In [18]:
#Build data

def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}

data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['stuff', 'going', 'mj', 'ive'], 'moment'), (['going', 'moment', 'ive', 'started'], 'mj'), (['moment', 'mj', 'started', 'listening'], 'ive'), (['mj', 'ive', 'listening', 'music'], 'started'), (['ive', 'started', 'music', 'watching'], 'listening')]


In [19]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(len(inputs), -1)
        out = self.linear1(embeds) #F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return(log_probs)

In [20]:
CONTEXT_SIZE = 2
batch_size = 10
device = torch.device('cuda:0')
losses = []
loss_function = nn.NLLLoss()
model = CBOW(vocab_size, embedding_dim=200,
             context_size=CONTEXT_SIZE*2)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [21]:
data_iter = torch.utils.data.DataLoader(data, batch_size=batch_size,
                                        shuffle=False, num_workers=4)

In [22]:
for epoch in range(5):
    total_loss = torch.Tensor([0])
    num = 0
    for context, target in tqdm(data_iter):
        context_ids = []
        num += 1
        for i in range(len(context[0])):
            context_ids.append(make_context_vector([context[j][i] for j in range(len(context))], word_to_ix))
        context_ids = torch.stack(context_ids)
        context_ids = context_ids.to(device)
        model.zero_grad()
        log_probs = model(context_ids)
        label = make_context_vector(target, word_to_ix)
        label = label.to(device)
        loss = loss_function(log_probs, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('epoch %d loss %.4f' %(epoch, total_loss / num))

100%|██████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:19<00:00, 55.72it/s]


epoch 0 loss 8.4800


100%|██████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:15<00:00, 68.96it/s]


epoch 1 loss 5.6650


100%|██████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:15<00:00, 69.60it/s]


epoch 2 loss 1.4831


 89%|██████████████████████████████████████████████████████████████████████▍        | 963/1080 [00:14<00:01, 96.77it/s]

KeyboardInterrupt: 

# Faseterrrr- negative sampling

In [23]:
import torch
import torch.utils.data.dataloader as dataloader
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim
import numpy as np
from collections import Counter

In [24]:
raw_text = []
for x in train_remove_sw:
    raw_text.extend(x.split(' '))
raw_text = [x for x in raw_text if x != '']

In [25]:
vocab = set(raw_text)
vocab.add('unk')
vocab_size = len(vocab)
freqs = Counter(raw_text)

In [26]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['stuff', 'going', 'mj', 'ive'], 'moment'), (['going', 'moment', 'ive', 'started'], 'mj'), (['moment', 'mj', 'started', 'listening'], 'ive'), (['mj', 'ive', 'listening', 'music'], 'started'), (['ive', 'started', 'music', 'watching'], 'listening')]


In [27]:
freqs_pow = torch.Tensor([freqs[ix_to_word[i]] for i in range(vocab_size)]).pow(0.75)
dist = freqs_pow / freqs_pow.sum()

In [28]:
def neg_sample(num_samples, positives=[]):
    w = np.random.choice(len(dist), (len(positives), num_samples), p=dist.numpy())
    if positives.is_cuda:
        return torch.tensor(w).to(device)
    else:
        return torch.tensor(w)

In [29]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings.weight.data.uniform_(-0.5 / vocab_size, 0.5 / vocab_size)
    def forward(self, inputs, label):
        negs = neg_sample(5, label)
        u_embeds = self.embeddings(label)
        v_embeds_pos = self.embeddings(inputs)
        v_embeds_neg = self.embeddings(negs)
        log_pos = torch.bmm(v_embeds_pos, u_embeds.unsqueeze(2)).squeeze(2)
        log_neg = torch.bmm(v_embeds_neg, -u_embeds.unsqueeze(2)).squeeze(2)
        log_pos = F.logsigmoid(log_pos).sum(1)
        log_neg = F.logsigmoid(log_neg).sum(1)
       
        loss = log_pos + log_neg
        
        return -loss

In [30]:
batch_size = 10
device = torch.device('cuda:0')
losses = []
model = CBOW(vocab_size, embedding_dim=200)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [31]:
data_iter = torch.utils.data.DataLoader(data, batch_size=batch_size,
                                        shuffle=False, num_workers=4)

In [32]:
for epoch in range(20):
    total_loss = torch.Tensor([0])
    num = 0
    for context, target in tqdm(data_iter):
        num += 1
        context_ids = []
        for i in range(len(context[0])):
            context_ids.append(make_context_vector([context[j][i] for j in range(len(context))], word_to_ix))
        context_ids = torch.stack(context_ids)
        context_ids = context_ids.to(device)
        model.zero_grad()
        label = make_context_vector(target, word_to_ix)
        label = label.to(device)
        loss = model(context_ids, label).mean()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('epoch %d loss %.4f' %(epoch+1, total_loss / num))


  0%|                                                                                         | 0/1080 [00:00<?, ?it/s]
  0%|                                                                                 | 1/1080 [00:02<53:03,  2.95s/it]
  1%|▊                                                                               | 11/1080 [00:03<36:51,  2.07s/it]
  2%|█▊                                                                              | 24/1080 [00:03<25:31,  1.45s/it]
  4%|██▉                                                                             | 39/1080 [00:03<17:39,  1.02s/it]
  5%|████                                                                            | 55/1080 [00:03<12:11,  1.40it/s]
  6%|█████▏                                                                          | 70/1080 [00:03<08:26,  1.99it/s]
  8%|██████▎                                                                         | 86/1080 [00:03<05:51,  2.83it/s]
  9%|███████▍                          

epoch 1 loss 6.2334



  0%|                                                                                         | 0/1080 [00:00<?, ?it/s]
  0%|                                                                               | 1/1080 [00:04<1:18:21,  4.36s/it]
  1%|▊                                                                               | 11/1080 [00:04<54:23,  3.05s/it]
  2%|█▍                                                                              | 20/1080 [00:04<37:49,  2.14s/it]
  3%|██▏                                                                             | 30/1080 [00:04<26:16,  1.50s/it]
  4%|██▉                                                                             | 40/1080 [00:04<18:16,  1.05s/it]
  5%|███▋                                                                            | 50/1080 [00:04<12:43,  1.35it/s]
  6%|████▍                                                                           | 60/1080 [00:04<08:52,  1.92it/s]
  6%|█████▏                            

KeyboardInterrupt: 

In [13]:
train = []
for s in tqdm(train_remove_sw):
    tmp = []
    for w in s.split():
        if w in vocab:
            word = torch.LongTensor([word_to_ix[w]]).cuda()
            emb = model.embeddings(word)
            tmp.append(emb)
    train.append(np.array(tmp).sum())
    
train_vec = torch.cat(train, 0)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [06:45<00:00, 24.66it/s]


In [15]:
test = []
for s in tqdm(test_remove_sw):
    tmp = []
    for w in s.split():
        if w in vocab:
            word = torch.LongTensor([word_to_ix[w]]).cuda()
            emb = model.embeddings(word)
            tmp.append(emb)
    test.append(np.array(tmp).sum())
    
test_vec = torch.cat(test, 0)

100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:08<00:00, 29.16it/s]


In [18]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(train_vec.data.cpu().numpy(), np.array(train_target))
predictions = clf.predict(test_vec.data.cpu().numpy())

from sklearn.metrics import classification_report

print(classification_report(np.array(test_target), predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.69      0.71      0.70       978
           1       0.72      0.69      0.70      1022

    accuracy                           0.70      2000
   macro avg       0.70      0.70      0.70      2000
weighted avg       0.70      0.70      0.70      2000



# More FasterRRRRRRRRRRR

In [33]:
with open('./IMDB_data/for_wv_ho.txt','w',encoding='utf8') as f:
    for i in train_remove_sw:
        f.write(i+'\n')

In [34]:
import logging
import gensim
from gensim.models import word2vec

logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)

sentences = word2vec.LineSentence('./IMDB_data/for_wv_ho.txt')

model = gensim.models.Word2Vec(sentences, size = 200, sg = 0, iter = 20, window = 2, min_count = 1, hs = 0, negative = 5, ns_exponent = 0.75)  
model.wv.save_word2vec_format("./models/for_ho_word2Vec" + ".bin", binary = True) 

2020-07-13 16:31:44,899 : INFO : collecting all words and their counts
2020-07-13 16:31:44,915 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-07-13 16:31:44,915 : INFO : collected 4703 word types from a corpus of 10802 raw words and 100 sentences
2020-07-13 16:31:44,923 : INFO : Loading a fresh vocabulary
2020-07-13 16:31:44,931 : INFO : effective_min_count=1 retains 4703 unique words (100% of original 4703, drops 0)
2020-07-13 16:31:44,931 : INFO : effective_min_count=1 leaves 10802 word corpus (100% of original 10802, drops 0)
2020-07-13 16:31:44,955 : INFO : deleting the raw counts dictionary of 4703 items
2020-07-13 16:31:44,963 : INFO : sample=0.001 downsamples 18 most-common words
2020-07-13 16:31:44,963 : INFO : downsampling leaves estimated 10426 word corpus (96.5% of prior 10802)
2020-07-13 16:31:44,987 : INFO : estimated required memory for 4703 words and 200 dimensions: 9876300 bytes
2020-07-13 16:31:44,987 : INFO : resetting layer weights
20

In [21]:
train_set = []
for i in train_remove_sw:
    tmp = []
    for j in i.split():
        if j in model.wv.vocab:
            tmp.append(model.wv.get_vector(j))
    train_set.append(np.array(tmp).sum(0).reshape(1,-1))
    
train = torch.cat(train_set, 0)

In [22]:
test_set = []
for i in test_remove_sw:
    tmp = []
    for j in i.split():
        if j in model.wv.vocab:
            tmp.append(model.wv.get_vector(j))
    test_set.append(np.array(tmp).sum(0).reshape(1,-1))
    
test = torch.cat(test_set, 0)

In [25]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(train, np.array(train_target))
predictions = clf.predict(test)

from sklearn.metrics import classification_report

print(classification_report(np.array(test_target), predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.85      0.85      0.85       978
           1       0.86      0.86      0.86      1022

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



# GloVe
## Download GloVe weights:https://nlp.stanford.edu/projects/glove/

In [45]:
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = './IMDB_data/glove.6b/glove.6B.100d.txt'
word2vec_output_file = './IMDB_data/glove.6B.100d.word2vec.txt'
(count, dimensions) = glove2word2vec(glove_input_file, word2vec_output_file)
print(count, '\n', dimensions)

2020-07-03 12:03:25,865 : INFO : converting 400000 vectors from ./IMDB_data/glove.6b/glove.6B.100d.txt to ./IMDB_data/glove.6B.100d.word2vec.txt


400000 
 100


In [46]:
from gensim.models import KeyedVectors

glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

glove_model.save_word2vec_format('./IMDB_data/word2vec.6B.100d.bin.gz', binary=True)

2020-07-03 12:03:27,866 : INFO : loading projection weights from ./IMDB_data/glove.6B.100d.word2vec.txt
2020-07-03 12:04:08,960 : INFO : loaded (400000, 100) matrix from ./IMDB_data/glove.6B.100d.word2vec.txt
2020-07-03 12:04:08,962 : INFO : storing 400000x100 projection weights into ./IMDB_data/word2vec.6B.100d.bin.gz


In [47]:
g_wordVec = KeyedVectors.load_word2vec_format("./IMDB_data/word2vec.6B.100d.bin.gz", binary=True)

2020-07-03 12:04:19,458 : INFO : loading projection weights from ./IMDB_data/word2vec.6B.100d.bin.gz
2020-07-03 12:04:32,274 : INFO : loaded (400000, 100) matrix from ./IMDB_data/word2vec.6B.100d.bin.gz


In [50]:
len(g_wordVec.wv.vocab)

  """Entry point for launching an IPython kernel.


400000

In [51]:
g_train_set = []
for i in train_remove_sw:
    tmp = []
    for j in i.split():
        if j in g_wordVec.wv.vocab:
            tmp.append(g_wordVec.wv.get_vector(j))
    g_train_set.append(tmp)
    
g_test_set = []
for i in test_remove_sw:
    tmp = []
    for j in i.split():
        if j in g_wordVec.wv.vocab:
            tmp.append(g_wordVec.wv.get_vector(j))
    g_test_set.append(tmp)

  """
  
  del sys.path[0]
  


In [52]:
g_train = None
for i in range(len(g_train_set)):
    if i == 0:
        g_train = np.array(g_train_set[i]).mean(0).reshape(1,-1)
    else:
        g_train = np.concatenate((g_train, np.array(g_train_set[i]).mean(0).reshape(1,-1)), 0)
        
g_test = None
for i in range(len(g_test_set)):
    if i == 0:
        g_test = np.array(g_test_set[i]).mean(0).reshape(1,-1)
    else:
        g_test = np.concatenate((g_test, np.array(g_test_set[i]).mean(0).reshape(1,-1)), 0)

In [53]:
clf = LogisticRegression(random_state=0).fit(g_train, np.array(train_target))
predictions=clf.predict(g_test)

print(classification_report(np.array(test_target), predictions))

              precision    recall  f1-score   support

           0       0.80      0.79      0.80      2566
           1       0.78      0.79      0.79      2434

    accuracy                           0.79      5000
   macro avg       0.79      0.79      0.79      5000
weighted avg       0.79      0.79      0.79      5000

