# Preparing data
## Data download:
https://drive.google.com/file/d/1zvc4_mKBpEhFWVju91KRIM0uQc6kR-S0/view?usp=sharing

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np

data=pd.read_csv('./IMDB_data/labeledTrain.csv')

In [2]:
train_data=data['review'].tolist()[:100]
train_target=data['sentiment'].tolist()[:100]

test_data=data['review'].tolist()[100:150]
test_target=data['sentiment'].tolist()[100:150]

In [3]:
from nltk.corpus import stopwords
sw = stopwords.words("english")
stopWordDict = dict(zip(sw, list(range(len(sw)))))

In [4]:
train_remove_sw=[]
for review in tqdm(train_data):
    temp=''
    for word in review.split():
        if word not in stopWordDict:
            temp+=word+' '
    train_remove_sw.append(temp.strip())
    
test_remove_sw=[]
for review in tqdm(test_data):
    temp=''
    for word in review.split():
        if word not in stopWordDict:
            temp+=word+' '
    test_remove_sw.append(temp.strip())

100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 12456.36it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<?, ?it/s]


# One-hot encoding

In [5]:
import numpy as np

token_index={}

for sample in tqdm(train_remove_sw):
    for word in sample.split():
        if word not in token_index:
            token_index[word]=len(token_index)+ 1

max_length=0
for s in train_remove_sw:
    if len(s)>max_length:
        max_length=len(s)

train_results=np.zeros((len(train_remove_sw),max_length,max(token_index.values())+1))
for i,smaple in tqdm(enumerate(train_remove_sw)):
    for j,word in list(enumerate(sample.split())):
        train_index=token_index.get(word)
        train_results[i,j,train_index]=1

100%|████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<?, ?it/s]
100it [00:00, 12427.20it/s]


In [6]:
train_results.shape

(100, 3938, 4704)

In [7]:
test_results=np.zeros((len(test_remove_sw),max_length,max(token_index.values())+1))
for i,smaple in tqdm(enumerate(test_remove_sw)):
    for j,word in list(enumerate(sample.split())):
        test_index=token_index.get(word)
        test_results[i,j,test_index]=1

50it [00:00, ?it/s]


In [8]:
test_results.shape

(50, 3938, 4704)

In [9]:
X_train=train_results.mean(1)
y_train=train_target

X_test=test_results.mean(1)
y_test=test_target

In [10]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions=clf.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.56      1.00      0.72        28
           1       0.00      0.00      0.00        22

    accuracy                           0.56        50
   macro avg       0.28      0.50      0.36        50
weighted avg       0.31      0.56      0.40        50



# TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train=tfidf_vectorizer.fit_transform(train_remove_sw)
X_test=tfidf_vectorizer.transform(test_remove_sw)

In [12]:
X_train.shape

(100, 4550)

In [13]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions=clf.predict(X_test)

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.59      0.96      0.73        28
           1       0.75      0.14      0.23        22

    accuracy                           0.60        50
   macro avg       0.67      0.55      0.48        50
weighted avg       0.66      0.60      0.51        50



# N-gram

In [14]:
CONTEXT_SIZE = 1
EMBEDDING_DIM = 64

In [15]:
bigram=[]
for sentence in train_remove_sw:
    tmp=sentence.split()
    for i in range(len(tmp)-1):
        bigram.append((tmp[i],tmp[i+1]))

In [16]:
bigram[1]

('going', 'moment')

In [17]:
words=[]
for s in train_remove_sw:
    tmp=s.split()
    for w in tmp:
        words.append(w)

In [18]:
vocb = set(words)
vocb.add('unk')
word_to_idx = {word: i for i, word in enumerate(vocb)}
idx_to_word = {word_to_idx[word]: word for word in word_to_idx}

In [19]:
word_to_idx

{'crazy': 0,
 'disneys': 1,
 'jealousy': 2,
 'atrocious;': 3,
 'location': 4,
 'merchant': 5,
 'couldve': 6,
 'paranoid!': 7,
 'weekend': 8,
 'said': 9,
 '1980': 10,
 'veronica': 11,
 'position': 12,
 'berserk': 13,
 'earhole': 14,
 'annuls': 15,
 'water': 16,
 'eats': 17,
 'showed': 18,
 'fare': 19,
 'starting': 20,
 'question:': 21,
 '--': 22,
 'sad': 23,
 'using': 24,
 'post-production': 25,
 'overly': 26,
 'sing': 27,
 'middle-aged': 28,
 'hated': 29,
 'props': 30,
 'join': 31,
 'shake': 32,
 'intentionally': 33,
 'rule': 34,
 'crisp': 35,
 'dracula': 36,
 'drags': 37,
 'sun': 38,
 'pants': 39,
 'rudyard': 40,
 'coat': 41,
 'lacklustre': 42,
 'boss': 43,
 'horrible': 44,
 'wait': 45,
 'fight': 46,
 'police': 47,
 'theres': 48,
 'transforms': 49,
 'satisfying': 50,
 'waiting': 51,
 'splat': 52,
 'centers': 53,
 'low-budget': 54,
 'skip': 55,
 'cliff': 56,
 '1930s': 57,
 'pace': 58,
 'viewed': 59,
 'historical': 60,
 'reference': 61,
 'decide': 62,
 'yorker': 63,
 'meantime': 64,
 'p

In [20]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

class n_gram(nn.Module):
    def __init__(self, vocab_size, context_size=CONTEXT_SIZE, n_dim=EMBEDDING_DIM):
        super(n_gram, self).__init__()

        self.embed = nn.Embedding(vocab_size, n_dim)
        self.classify = nn.Sequential(
            nn.Linear(context_size * n_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, vocab_size)
        )

    def forward(self, x):
        voc_embed = self.embed(x)
        voc_embed2 = voc_embed.view(1, -1)
        out = self.classify(voc_embed2)
        return out

In [21]:
net = n_gram(len(word_to_idx))
net=net.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-4)

In [22]:
for e in range(20):
    net.train()
    train_loss = 0
    for word, label in tqdm(bigram):
        word = Variable(torch.LongTensor([word_to_idx[word]])).cuda()
        label = Variable(torch.LongTensor([word_to_idx[label]])).cuda()
   
        out = net(word)
        loss = criterion(out, label)
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if (e + 1) % 5 == 0:
        print('epoch: {}, Loss: {:.6f}'.format(e + 1, train_loss / len(bigram)))

100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:13<00:00, 144.81it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:36<00:00, 103.18it/s]
100%|████████████████████████████████████████████████████████████████████████████| 10702/10702 [01:50<00:00, 97.20it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:29<00:00, 119.54it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:30<00:00, 113.26it/s]


epoch: 5, Loss: 7.523582


100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:28<00:00, 120.90it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:30<00:00, 118.79it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:33<00:00, 114.84it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:33<00:00, 114.00it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:36<00:00, 111.43it/s]


epoch: 10, Loss: 6.790649


100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:32<00:00, 115.64it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:33<00:00, 115.07it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:31<00:00, 116.35it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:33<00:00, 113.96it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:32<00:00, 116.29it/s]


epoch: 15, Loss: 6.561274


100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:30<00:00, 117.69it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:31<00:00, 117.18it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:32<00:00, 116.09it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:31<00:00, 116.78it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10702/10702 [01:31<00:00, 117.24it/s]


epoch: 20, Loss: 6.521470


In [23]:
net = net.eval()

In [24]:
word, label = bigram[10]
print('input: {}'.format(word))
print('label: {}'.format(label))
print()
word = Variable(torch.LongTensor([word_to_idx[word]])).cuda()
out = net(word)
pred_label_idx = out.max(1)[1].item()
predict_word = idx_to_word[pred_label_idx]
print('real word is {}, predicted word is {}'.format(label, predict_word))

input: documentary
label: watched

real word is watched, predicted word is style


In [25]:
net.embed(word)

tensor([[-0.0526, -0.0787,  0.0455, -0.0671,  0.0538,  0.0146, -0.0098,  0.0116,
          0.0437,  0.0463, -0.0784, -0.0542, -0.0388, -0.0337, -0.0103,  0.0394,
         -0.0650, -0.0013,  0.0034, -0.0095, -0.0908,  0.0336, -0.0818, -0.0900,
         -0.0786,  0.0189,  0.0776,  0.0805,  0.0629,  0.0735,  0.0325, -0.0300,
         -0.0589, -0.0146,  0.0438, -0.0028,  0.0426,  0.0182,  0.0225,  0.0079,
         -0.0603, -0.0398,  0.0456,  0.0440, -0.0154, -0.0675, -0.0255, -0.0791,
         -0.0368, -0.0046, -0.0411, -0.0281,  0.0576,  0.0551,  0.1262, -0.0151,
          0.0035, -0.0096, -0.0756, -0.0364,  0.0558,  0.0564,  0.0625,  0.0292]],
       device='cuda:0', grad_fn=<EmbeddingBackward>)

In [60]:
train = []
for s in train_remove_sw:
    tmp = []
    for w in s.split():
        if w not in vocb:
            word = Variable(torch.LongTensor([word_to_idx['unk']])).cuda()
            emb = net.embed(word)
            tmp.append(emb)
        else:
            word = Variable(torch.LongTensor([word_to_idx[w]])).cuda()
            emb = net.embed(word)
            tmp.append(emb)
    train.append(np.array(tmp).sum())

In [61]:
train_vec = None
for i in range(len(train)):
    if i == 0:
        train_vec = train[i]
    else:
        train_vec = torch.cat((train_vec, train[i]), 0)

In [62]:
test = []
for s in test_remove_sw:
    tmp = []
    for w in s.split():
        if w not in vocb:
            word = Variable(torch.LongTensor([word_to_idx['unk']])).cuda()
            emb = net.embed(word)
            tmp.append(emb)
        else:
            word = Variable(torch.LongTensor([word_to_idx[w]])).cuda()
            emb = net.embed(word)
            tmp.append(emb)
    test.append(np.array(tmp).sum())

In [63]:
test_vec = None
for i in range(len(test)):
    if i == 0:
        test_vec = test[i]
    else:
        test_vec = torch.cat((test_vec, test[i]), 0)

In [88]:
train_vec.shape

torch.Size([100, 64])

In [64]:
clf = LogisticRegression(random_state=0).fit(train_vec.data.cpu().numpy(), y_train)
predictions=clf.predict(test_vec.data.cpu().numpy())

print(classification_report(y_test, predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.76      0.46      0.58        28
           1       0.55      0.82      0.65        22

    accuracy                           0.62        50
   macro avg       0.66      0.64      0.62        50
weighted avg       0.67      0.62      0.61        50



# Word2Vec

In [14]:
import torch
import torch.utils.data.dataloader as dataloader
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim
import numpy as np
from collections import Counter

In [15]:
raw_text = []
for x in train_remove_sw:
    raw_text.extend(x.split(' '))
raw_text = [x for x in raw_text if x != '']

In [17]:
vocab = set(raw_text)
vocab.add('unk')
vocab_size = len(vocab)
freqs = Counter(raw_text)

In [18]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['stuff', 'going', 'mj', 'ive'], 'moment'), (['going', 'moment', 'ive', 'started'], 'mj'), (['moment', 'mj', 'started', 'listening'], 'ive'), (['mj', 'ive', 'listening', 'music'], 'started'), (['ive', 'started', 'music', 'watching'], 'listening')]


In [19]:
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(len(inputs), -1)
        out = self.linear1(embeds) #F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return(log_probs)

In [20]:
CONTEXT_SIZE = 2
batch_size = 10
device = torch.device('cuda:0')
losses = []
loss_function = nn.NLLLoss()
model = CBOW(vocab_size, embedding_dim=200,
             context_size=CONTEXT_SIZE*2)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [21]:
data_iter = torch.utils.data.DataLoader(data, batch_size=batch_size,
                                        shuffle=False, num_workers=4)

In [22]:
for epoch in range(5):
    total_loss = torch.Tensor([0])
    num = 0
    for context, target in tqdm(data_iter):
        context_ids = []
        num += 1
        for i in range(len(context[0])):
            context_ids.append(make_context_vector([context[j][i] for j in range(len(context))], word_to_ix))
        context_ids = torch.stack(context_ids)
        context_ids = context_ids.to(device)
        model.zero_grad()
        log_probs = model(context_ids)
        label = make_context_vector(target, word_to_ix)
        label = label.to(device)
        loss = loss_function(log_probs, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('epoch %d loss %.4f' %(epoch, total_loss / num))

100%|██████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:19<00:00, 55.72it/s]


epoch 0 loss 8.4800


100%|██████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:15<00:00, 68.96it/s]


epoch 1 loss 5.6650


100%|██████████████████████████████████████████████████████████████████████████████| 1080/1080 [00:15<00:00, 69.60it/s]


epoch 2 loss 1.4831


 89%|██████████████████████████████████████████████████████████████████████▍        | 963/1080 [00:14<00:01, 96.77it/s]

KeyboardInterrupt: 

# Faseterrrr- negative sampling

In [23]:
import torch
import torch.utils.data.dataloader as dataloader
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim
import numpy as np
from collections import Counter

In [24]:
raw_text = []
for x in train_remove_sw:
    raw_text.extend(x.split(' '))
raw_text = [x for x in raw_text if x != '']

In [25]:
vocab = set(raw_text)
vocab.add('unk')
vocab_size = len(vocab)
freqs = Counter(raw_text)

In [26]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['stuff', 'going', 'mj', 'ive'], 'moment'), (['going', 'moment', 'ive', 'started'], 'mj'), (['moment', 'mj', 'started', 'listening'], 'ive'), (['mj', 'ive', 'listening', 'music'], 'started'), (['ive', 'started', 'music', 'watching'], 'listening')]


In [27]:
freqs_pow = torch.Tensor([freqs[ix_to_word[i]] for i in range(vocab_size)]).pow(0.75)
dist = freqs_pow / freqs_pow.sum()

In [28]:
def neg_sample(num_samples, positives=[]):
    w = np.random.choice(len(dist), (len(positives), num_samples), p=dist.numpy())
    if positives.is_cuda:
        return torch.tensor(w).to(device)
    else:
        return torch.tensor(w)

In [29]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings.weight.data.uniform_(-0.5 / vocab_size, 0.5 / vocab_size)
    def forward(self, inputs, label):
        negs = neg_sample(5, label)
        u_embeds = self.embeddings(label)
        v_embeds_pos = self.embeddings(inputs)
        v_embeds_neg = self.embeddings(negs)
        log_pos = torch.bmm(v_embeds_pos, u_embeds.unsqueeze(2)).squeeze(2)
        log_neg = torch.bmm(v_embeds_neg, -u_embeds.unsqueeze(2)).squeeze(2)
        log_pos = F.logsigmoid(log_pos).sum(1)
        log_neg = F.logsigmoid(log_neg).sum(1)
       
        loss = log_pos + log_neg
        
        return -loss

In [30]:
batch_size = 10
device = torch.device('cuda:0')
losses = []
model = CBOW(vocab_size, embedding_dim=200)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [31]:
data_iter = torch.utils.data.DataLoader(data, batch_size=batch_size,
                                        shuffle=False, num_workers=4)

In [32]:
for epoch in range(20):
    total_loss = torch.Tensor([0])
    num = 0
    for context, target in tqdm(data_iter):
        num += 1
        context_ids = []
        for i in range(len(context[0])):
            context_ids.append(make_context_vector([context[j][i] for j in range(len(context))], word_to_ix))
        context_ids = torch.stack(context_ids)
        context_ids = context_ids.to(device)
        model.zero_grad()
        label = make_context_vector(target, word_to_ix)
        label = label.to(device)
        loss = model(context_ids, label).mean()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('epoch %d loss %.4f' %(epoch+1, total_loss / num))


  0%|                                                                                         | 0/1080 [00:00<?, ?it/s]
  0%|                                                                                 | 1/1080 [00:02<53:03,  2.95s/it]
  1%|▊                                                                               | 11/1080 [00:03<36:51,  2.07s/it]
  2%|█▊                                                                              | 24/1080 [00:03<25:31,  1.45s/it]
  4%|██▉                                                                             | 39/1080 [00:03<17:39,  1.02s/it]
  5%|████                                                                            | 55/1080 [00:03<12:11,  1.40it/s]
  6%|█████▏                                                                          | 70/1080 [00:03<08:26,  1.99it/s]
  8%|██████▎                                                                         | 86/1080 [00:03<05:51,  2.83it/s]
  9%|███████▍                          

epoch 1 loss 6.2334



  0%|                                                                                         | 0/1080 [00:00<?, ?it/s]
  0%|                                                                               | 1/1080 [00:04<1:18:21,  4.36s/it]
  1%|▊                                                                               | 11/1080 [00:04<54:23,  3.05s/it]
  2%|█▍                                                                              | 20/1080 [00:04<37:49,  2.14s/it]
  3%|██▏                                                                             | 30/1080 [00:04<26:16,  1.50s/it]
  4%|██▉                                                                             | 40/1080 [00:04<18:16,  1.05s/it]
  5%|███▋                                                                            | 50/1080 [00:04<12:43,  1.35it/s]
  6%|████▍                                                                           | 60/1080 [00:04<08:52,  1.92it/s]
  6%|█████▏                            

KeyboardInterrupt: 

In [13]:
train = []
for s in tqdm(train_remove_sw):
    tmp = []
    for w in s.split():
        if w in vocab:
            word = torch.LongTensor([word_to_ix[w]]).cuda()
            emb = model.embeddings(word)
            tmp.append(emb)
    train.append(np.array(tmp).sum())

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [06:45<00:00, 24.66it/s]


In [14]:
train_vec = None
for i in tqdm(range(len(train))):
    if i == 0:
        train_vec = train[i]
    else:
        train_vec = torch.cat((train_vec, train[i]), 0)

100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:03<00:00, 3075.33it/s]


In [15]:
test = []
for s in tqdm(test_remove_sw):
    tmp = []
    for w in s.split():
        if w in vocab:
            word = torch.LongTensor([word_to_ix[w]]).cuda()
            emb = model.embeddings(word)
            tmp.append(emb)
    test.append(np.array(tmp).sum())

100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:08<00:00, 29.16it/s]


In [16]:
test_vec = None
for i in tqdm(range(len(test))):
    if i == 0:
        test_vec = test[i]
    else:
        test_vec = torch.cat((test_vec, test[i]), 0)

100%|████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 4799.10it/s]


In [18]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(train_vec.data.cpu().numpy(), np.array(train_target))
predictions=clf.predict(test_vec.data.cpu().numpy())
from sklearn.metrics import classification_report
print(classification_report(np.array(test_target), predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.69      0.71      0.70       978
           1       0.72      0.69      0.70      1022

    accuracy                           0.70      2000
   macro avg       0.70      0.70      0.70      2000
weighted avg       0.70      0.70      0.70      2000



# More FasterRRRRRRRRRRR

In [33]:
with open('./IMDB_data/for_wv_ho.txt','w',encoding='utf8') as f:
    for i in train_remove_sw:
        f.write(i+'\n')

In [34]:
import logging
import gensim
from gensim.models import word2vec

logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)

sentences = word2vec.LineSentence('./IMDB_data/for_wv_ho.txt')

model = gensim.models.Word2Vec(sentences, size = 200, sg = 0, iter = 20, window = 2, min_count = 1, hs = 0, negative = 5, ns_exponent = 0.75)  
model.wv.save_word2vec_format("./models/for_ho_word2Vec" + ".bin", binary = True) 

2020-07-13 16:31:44,899 : INFO : collecting all words and their counts
2020-07-13 16:31:44,915 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-07-13 16:31:44,915 : INFO : collected 4703 word types from a corpus of 10802 raw words and 100 sentences
2020-07-13 16:31:44,923 : INFO : Loading a fresh vocabulary
2020-07-13 16:31:44,931 : INFO : effective_min_count=1 retains 4703 unique words (100% of original 4703, drops 0)
2020-07-13 16:31:44,931 : INFO : effective_min_count=1 leaves 10802 word corpus (100% of original 10802, drops 0)
2020-07-13 16:31:44,955 : INFO : deleting the raw counts dictionary of 4703 items
2020-07-13 16:31:44,963 : INFO : sample=0.001 downsamples 18 most-common words
2020-07-13 16:31:44,963 : INFO : downsampling leaves estimated 10426 word corpus (96.5% of prior 10802)
2020-07-13 16:31:44,987 : INFO : estimated required memory for 4703 words and 200 dimensions: 9876300 bytes
2020-07-13 16:31:44,987 : INFO : resetting layer weights
20

In [21]:
train_set = []
for i in train_remove_sw:
    tmp = []
    for j in i.split():
        if j in model.wv.vocab:
            tmp.append(model.wv.get_vector(j))
    train_set.append(np.array(tmp).sum(0).reshape(1,-1))

In [22]:
test_set = []
for i in test_remove_sw:
    tmp = []
    for j in i.split():
        if j in model.wv.vocab:
            tmp.append(model.wv.get_vector(j))
    test_set.append(np.array(tmp).sum(0).reshape(1,-1))

In [23]:
train = None
for i in range(len(train_set)):
    if i == 0:
        train = train_set[i]
    else:
        train = np.concatenate((train, train_set[i]), 0)

In [24]:
test = None
for i in range(len(test_set)):
    if i == 0:
        test = test_set[i]
    else:
        test = np.concatenate((test, test_set[i]), 0)

In [25]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(train, np.array(train_target))
predictions=clf.predict(test)
from sklearn.metrics import classification_report
print(classification_report(np.array(test_target), predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.85      0.85      0.85       978
           1       0.86      0.86      0.86      1022

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



# GloVe
## Download GloVe weights:https://nlp.stanford.edu/projects/glove/

In [45]:
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = './IMDB_data/glove.6b/glove.6B.100d.txt'
word2vec_output_file = './IMDB_data/glove.6B.100d.word2vec.txt'
(count, dimensions) = glove2word2vec(glove_input_file, word2vec_output_file)
print(count, '\n', dimensions)

2020-07-03 12:03:25,865 : INFO : converting 400000 vectors from ./IMDB_data/glove.6b/glove.6B.100d.txt to ./IMDB_data/glove.6B.100d.word2vec.txt


400000 
 100


In [46]:
from gensim.models import KeyedVectors

glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

glove_model.save_word2vec_format('./IMDB_data/word2vec.6B.100d.bin.gz', binary=True)

2020-07-03 12:03:27,866 : INFO : loading projection weights from ./IMDB_data/glove.6B.100d.word2vec.txt
2020-07-03 12:04:08,960 : INFO : loaded (400000, 100) matrix from ./IMDB_data/glove.6B.100d.word2vec.txt
2020-07-03 12:04:08,962 : INFO : storing 400000x100 projection weights into ./IMDB_data/word2vec.6B.100d.bin.gz


In [47]:
g_wordVec = KeyedVectors.load_word2vec_format("./IMDB_data/word2vec.6B.100d.bin.gz", binary=True)

2020-07-03 12:04:19,458 : INFO : loading projection weights from ./IMDB_data/word2vec.6B.100d.bin.gz
2020-07-03 12:04:32,274 : INFO : loaded (400000, 100) matrix from ./IMDB_data/word2vec.6B.100d.bin.gz


In [50]:
len(g_wordVec.wv.vocab)

  """Entry point for launching an IPython kernel.


400000

In [51]:
g_train_set = []
for i in train_remove_sw:
    tmp = []
    for j in i.split():
        if j in g_wordVec.wv.vocab:
            tmp.append(g_wordVec.wv.get_vector(j))
    g_train_set.append(tmp)
    
g_test_set = []
for i in test_remove_sw:
    tmp = []
    for j in i.split():
        if j in g_wordVec.wv.vocab:
            tmp.append(g_wordVec.wv.get_vector(j))
    g_test_set.append(tmp)

  """
  
  del sys.path[0]
  


In [52]:
g_train = None
for i in range(len(g_train_set)):
    if i == 0:
        g_train = np.array(g_train_set[i]).mean(0).reshape(1,-1)
    else:
        g_train = np.concatenate((g_train, np.array(g_train_set[i]).mean(0).reshape(1,-1)), 0)
        
g_test = None
for i in range(len(g_test_set)):
    if i == 0:
        g_test = np.array(g_test_set[i]).mean(0).reshape(1,-1)
    else:
        g_test = np.concatenate((g_test, np.array(g_test_set[i]).mean(0).reshape(1,-1)), 0)

In [53]:
clf = LogisticRegression(random_state=0).fit(g_train, np.array(train_target))
predictions=clf.predict(g_test)

print(classification_report(np.array(test_target), predictions))

              precision    recall  f1-score   support

           0       0.80      0.79      0.80      2566
           1       0.78      0.79      0.79      2434

    accuracy                           0.79      5000
   macro avg       0.79      0.79      0.79      5000
weighted avg       0.79      0.79      0.79      5000

