# Load Lexicon and pre-trained models

In [1]:
from os import path
filepath = "F:\\Datasets\\Embeddings"
en_embedding_filename = "GoogleNews-vectors-negative300.bin"
bn_embedding_filename = "word2vec_dim300_win5_iter35.model"
lexicon_filename = "en-bn.txt"

en_embedding_path = path.join(filepath, en_embedding_filename)
bn_embedding_path = path.join(filepath, bn_embedding_filename)
lexicon_path = path.join(filepath, lexicon_filename)

print(en_embedding_path)
print(bn_embedding_path)
print(lexicon_path)

F:\Datasets\Embeddings\GoogleNews-vectors-negative300.bin
F:\Datasets\Embeddings\word2vec_dim300_win5_iter35.model
F:\Datasets\Embeddings\en-bn.txt


In [2]:
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

#Load models
en_model = KeyedVectors.load_word2vec_format(en_embedding_path, binary=True)
bn_model = Word2Vec.load(bn_embedding_path)

#Check if Models are loaded successfully
try:
    en_model.vocab["school"]
except:
    print("EN: Word not fond.")
else:
    print("EN: Loaded Successlly.")
    
try:
    bn_model.wv.vocab["আমি"]
except:
    print("BN: Word not fond.") 
else:
    print("BN: Loaded Successlly.")

EN: Loaded Successlly.
BN: Loaded Successlly.


In [3]:
#Check for English in the Translated section
import re
pattern = r"[a-zA-Z]*"

lexicon = []
with open(lexicon_path, "r", encoding="utf-8") as fr:
    for row in fr:
#         row = row.replace("#", "")
#         row = row.replace("}}", "")
        row = row.replace("\n", "")
        row = row.split()
        
        if re.match(pattern, row[1]).group(0):
            continue
        lexicon.append(row)
len(lexicon), lexicon[0], lexicon[-1] 

(23485, ['the', 'করে'], ['cyaxares', 'সিয়াক্সেরিস'])

In [4]:
#Helper function for creating embeddings with Lexicon-words
def get_embedding(en_word, bn_word):
    try:
        en_model.vocab[en_word]
        bn_model.wv.vocab[bn_word]
    except:
        return []
    else:
        return (en_model[en_word], bn_model.wv[bn_word])

In [5]:
#Create 2 different embeddings for Bangla and English
#that contain only the words available in the lexicon
import numpy as np
en_embedding_D = []
bn_embedding_D = []

for pair in lexicon:
    try:
        en,bn = get_embedding(pair[0], pair[1])
    except:
        continue
    else:
        en_embedding_D.append(en)
        bn_embedding_D.append(bn)

en_embedding_D = np.array(en_embedding_D)
bn_embedding_D = np.array(bn_embedding_D)
en_embedding_D.shape , bn_embedding_D.shape

((18822, 300), (18822, 300))

# Corpus Transformation

In [6]:
#Calculae SVD of the Lexicon-embeddings
from numpy.linalg import svd
from numpy import transpose as T
from numpy import dot

# Y_D^T * X_D
# Y_D = English Embeddings ; X_D = Bangla Embeddings
YDT_XD = en_embedding_D.T.dot(bn_embedding_D)

u, s, vh = np.linalg.svd(YDT_XD, full_matrices=True)
V = vh.T
print(u.shape, s.shape, V.shape)

(300, 300) (300,) (300, 300)


In [7]:
len(en_model.vocab), len(bn_model.wv.vocab) 

(3000000, 1337032)

In [7]:
#Convert gensim English model to numpy array
en_vocab_size = 3000000
i = 0

Y = np.zeros((en_vocab_size,300),dtype='float32')

for k,v in en_model.vocab.items():
    Y[i] = en_model[k]
    i += 1

In [8]:
#Convert gensim Bangla model to numpy array
bn_vocab_size = 1337032
i = 0

X = np.zeros((bn_vocab_size,300),dtype='float32')

for k,v in bn_model.wv.vocab.items():
    X[i] = bn_model.wv[k]
    i += 1

In [14]:
v.T

AttributeError: 'Vocab' object has no attribute 'T'

In [9]:
#Transform our Embeddings on both languages with u and vh
trans_bn_emb = X.dot(u)
trans_en_emb = Y.dot(v)

trans_bn_emb.shape, trans_en_emb.shape

TypeError: unsupported operand type(s) for *: 'float' and 'Vocab'

# Load English Sentiment Data - IMDB

In [11]:
from os import listdir
datapath = "F:\\Datasets\\Sentiment\\aclImdb\\train"
pos_path = path.join(datapath, "pos")
neg_path = path.join(datapath, "neg")
unsup_path = path.join(datapath, "unsup")

pos_files = listdir(pos_path)
neg_files = listdir(neg_path)
unsup_files = listdir(unsup_path)

In [12]:
pos_reviews = []
for pos_file in pos_files:
    pos = path.join(pos_path, pos_file)
    with open(pos, "r", encoding="utf-8") as fr:
        pos_reviews.append(fr.read())
len(pos_reviews)

12500

In [13]:
print(pos_reviews[0])

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [13]:
neg_reviews = []
for neg_file in neg_files:
    neg = path.join(neg_path, neg_file)
    with open(neg, "r", encoding="utf-8") as fr:
        neg_reviews.append(fr.read())
len(neg_reviews)

12500

In [15]:
print(neg_reviews[0])

Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.


In [16]:
unsup_reviews = []
for unsup_file in unsup_files:
    unsup = path.join(unsup_path, unsup_file)
    with open(unsup, "r", encoding="utf-8") as fr:
        unsup_reviews.append(fr.read())
len(unsup_reviews)

50000

In [16]:
print(unsup_reviews[0])

I admit, the great majority of films released before say 1933 are just not for me. Of the dozen or so "major" silents I have viewed, one I loved (The Crowd), and two were very good (The Last Command and City Lights, that latter Chaplin circa 1931).<br /><br />So I was apprehensive about this one, and humor is often difficult to appreciate (uh, enjoy) decades later. I did like the lead actors, but thought little of the film.<br /><br />One intriguing sequence. Early on, the guys are supposed to get "de-loused" and for about three minutes, fully dressed, do some schtick. In the background, perhaps three dozen men pass by, all naked, white and black (WWI ?), and for most, their butts, part or full backside, are shown. Was this an early variation of beefcake courtesy of Howard Hughes?


# Translate IMDB dataset English -> Bangla

In [22]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\akibs\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
from scipy import spatial

def find_bn_equivalent_embedding(word, en_model, trans_en_emb, trans_bn_emb):
    try:
        en_emb = trans_en_emb[en_model.vocab[word].index]
    except:
        return None, None
#     min_ind = 0
#     min_distance = 1
#     for i, bn_emb in enumerate(trans_bn_emb):
#         if spatial.distance.cosine(en_emb, bn_emb) < min_distance:
#             min_distance = spatial.distance.cosine(en_emb, bn_emb)
#             min_ind = i
#     return min_ind, trans_bn_emb[min_ind]

# def get_bn_word(ind, bn_model):
#     for key, val in bn_model.wv.vocab.items():
#         if val.index == ind:
#             return key

In [23]:
a, b = find_bn_equivalent_embedding("school", en_model, trans_en_emb, trans_bn_emb)
a

AttributeError: 'tuple' object has no attribute 'shape'

In [21]:
import nltk

tokenized_text = nltk.word_tokenize(pos_reviews[0])
for word in tokenized_text:
    i, bn_emb = find_bn_equivalent_embedding(word, en_model, trans_en_emb, trans_bn_emb)
    if i:
        bn_word = get_bn_word(i, bn_model)
        print(word, " - " ,bn_word)

KeyboardInterrupt: 

In [16]:
destination = "F:\\Datasets\\Sentiment\\aclImdb_bn"
en_model.vocab["a"]


KeyError: 'a'

en_model.vocab["school"].index