In [2]:
import warnings, numpy as np, re, json, pandas as pd, pickle, unicodedata, textblob
# try:
#     import gnumpy as gpu
# except ModuleNotFoundError:
#     pass
from TurkishStemmer import TurkishStemmer
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim, math
from gensim.models import doc2vec
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
# from KaggleWord2VecUtility import KaggleWord2VecUtility

In [3]:
# Read the movie-review dataset from CSV and preview its first five rows.
df = pd.read_csv("datasets/movie_data.csv")
df.head(5)

Unnamed: 0,Language,Movie_ID,Review,Score
0,en,-800777728,i love science fiction and i hate superheroes ...,9
1,en,-800777728,the movie is absolutely incredible all the per...,10
2,en,-1018312192,in a cinematic era dominated by reboots and mi...,8
3,en,-1018312192,movie review on rise of the planet of the apes...,4
4,en,-1018312192,during experiments to find a cure for alzheime...,7


In [4]:
# Nested review index, shaped as {language: {score: {movie_id: [rev1, rev2, ..., revn]}}}.
data_dict = {}

In [5]:
# Group every review by language, then score, then movie id.
# The first four columns are still read by position and unpacked as
# (language, movie id, review, score). itertuples is used instead of
# iterrows + row[1][0] because integer keys on a label-indexed Series are
# deprecated as positional lookups in pandas, and iterrows allocates a whole
# Series for every row.
for record in df.itertuples(index=False, name=None):
    lang, movie_id, review, score = record[:4]
    (data_dict
     .setdefault(lang, {})
     .setdefault(score, {})
     .setdefault(movie_id, [])
     .append(review))

In [6]:
# Persist the grouped reviews. The context manager guarantees the file is
# flushed and closed even if pickling fails half-way.
with open("datasets/movie_reviews_dict.pckl", "wb") as reviews_file:
    pickle.dump(data_dict, reviews_file)

In [7]:
# Reload the grouped reviews from the pickle cache and close the file afterwards.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open("datasets/movie_reviews_dict.pckl", "rb") as reviews_file:
    data_dict = pickle.load(reviews_file)

In [None]:
# Load word vectors from the binary word2vec-format file (binary=True) into en_vects.
en_vects = gensim.models.KeyedVectors.load_word2vec_format(r"GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
# Load word vectors from the text word2vec-format file (binary=False) into tr_vects.
tr_vects = gensim.models.KeyedVectors.load_word2vec_format(r"wiki.tr/wiki.tr.vec", binary=False)

In [8]:
def learn_translation_matrix(X,Y, iterations=5000, alpha=0.0001, alpha_change_rate=0.8):
    """Learn a linear map ``W`` so that ``X[i].dot(W)`` approximates ``Y[i]``.

    Runs batch gradient descent on half the summed squared error
    ``0.5 * sum_i ||X[i].dot(W) - Y[i]||**2``, whose gradient with respect to
    ``W`` is ``X.T.dot(X.dot(W) - Y)``. (``np.gradient`` is a finite-difference
    operator over array elements and is not the derivative of this loss.)

    Args:
        X: Source vectors, indexable by ``0 .. len(X) - 1`` (list, 2-D array,
            or a dict keyed by those integers).
        Y: Target vectors, indexable the same way; entry ``i`` pairs with ``X[i]``.
        iterations: ``iterations + 1`` update steps are performed.
        alpha: Initial learning rate. The gradient is summed over all pairs,
            so larger datasets need a smaller ``alpha``.
        alpha_change_rate: Factor applied to ``alpha`` every 1000 steps
            (including step 0). ``alpha`` is additionally divided by 100 at
            step 2000.

    Returns:
        numpy.ndarray: The learned matrix of shape ``(source_dim, target_dim)``
        (``(300, 300)`` when ``X`` is empty, as no size can be inferred).
    """
    n_pairs = len(X)
    if n_pairs:
        source = np.asarray([X[k] for k in range(n_pairs)], dtype=float).reshape(n_pairs, -1)
        target = np.asarray([Y[k] for k in range(n_pairs)], dtype=float).reshape(n_pairs, -1)
    else:
        # Nothing to learn from: fall back to the historical 300-d size.
        source = np.zeros((0, 300))
        target = np.zeros((0, 300))

    def squared_error(_X, _Y, matrix):
        # Summed squared residual of the current map over all pairs.
        return float(np.sum((source.dot(matrix) - target) ** 2))

    # `mikolov` is not defined in the visible notebook. Use it when the
    # session provides it, otherwise report the squared-error objective so
    # the progress print no longer raises NameError.
    report_distance = globals().get("mikolov")
    if not callable(report_distance):
        report_distance = squared_error

    W = np.random.random((source.shape[1], target.shape[1]))
    for i in range(iterations+1):
        residual = source.dot(W) - target
        # Step against the gradient (descent), with a full matrix-shaped update.
        W -= alpha * source.T.dot(residual)

        if i == 2000:
            alpha /= 100

        if i%1000 == 0:
            alpha *= alpha_change_rate
            print("Mikolov distance: {}".format(report_distance(X, Y, W)))
    return W

In [9]:
def clean(text, language="en", stem=True):
    """Normalise a raw review and return it wrapped in a ``TextBlob``.

    Steps, in order: fold to lower-case ASCII, optionally stem Turkish words,
    blank out unsupported characters, mask digits as ``#``, join "e-mail",
    expand English contractions, strip the remaining punctuation, and merge a
    few spelled-out abbreviations.

    Args:
        text: Review text (``str``).
        language: ``"en"`` enables the English contraction rules; ``"tr"``
            enables stemming. Pass the review's real language: the
            contraction rules would mangle Turkish apostrophe suffixes.
        stem: Stem every word when ``language == "tr"``.

    Returns:
        TextBlob: The cleaned text.
    """
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII encode then drops whatever cannot be represented.
    # NOTE(review): letters without a decomposition (e.g. Turkish dotless i)
    # are removed entirely by this step - confirm that loss is acceptable.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').lower().decode("ascii")

    if language == "tr" and stem:
        # clean() is a plain function, so there is no `self` to hold the
        # stemmer; keep one instance on the function object instead.
        stemmer = getattr(clean, "_turkish_stemmer", None)
        if stemmer is None:
            stemmer = clean._turkish_stemmer = TurkishStemmer()
        text = ' '.join(stemmer.stem(w) for w in text.split())

    # '-' is escaped so the class lists literal '+', '-' and '='. Unescaped,
    # "+-=" is a character range that also lets ';' and '<' through.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r'[0-9]', '#', text)

    # These rules need the "-" and "'" characters, so they must run before the
    # punctuation is stripped below (run afterwards they can never match).
    text = re.sub(r"e(\s)?-(\s)?mail", "email", text)
    if language == "en":
        contractions = (
            (r"what's", "what is "),
            (r"\'s", " "),
            (r"\'ve", " have "),
            (r"can't", "cannot "),
            (r"n't", " not "),
            (r"i'm", "i am "),
            (r"\'re", " are "),
            (r"\'d", " would "),
            (r"\'ll", " will "),
        )
        for pattern, replacement in contractions:
            text = re.sub(pattern, replacement, text)

    # Every remaining punctuation mark becomes a single space.
    text = re.sub(r"[,.!\/^+\-=':]", " ", text)

    # Abbreviations that the punctuation stripping left spelled out.
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    return TextBlob(text)


In [10]:
# Source language -> *name* of the module-level dict caching its translations.
# The last two characters of each name are the target language code.
d = dict(tr="tr2en", en="en2tr")
en2tr = {}
tr2en = {}

In [11]:
def record_translations():
    """Translate every review in ``data_dict`` and cache the result.

    ``d`` maps a source language to the name of the module-level cache dict
    (``tr2en`` / ``en2tr``); the last two characters of that name are the
    target language code. Reviews that are already cached are skipped, so
    the function can be re-run to resume after a failure. Languages without
    an entry in ``d`` are ignored.

    Translation is best-effort: a review that cannot be cleaned or translated
    is left out of the cache and counted, and a single warning summarises the
    number of failures at the end.

    NOTE(review): ``TextBlob.translate`` appears to have been deprecated and
    later removed from textblob - with a recent release every review would
    fail here. Confirm the installed version.
    """
    failures = 0
    for lang, reviews_by_score in data_dict.items():
        cache_name = d.get(lang)
        if cache_name is None:
            continue
        cache = globals()[cache_name]
        target_language = cache_name[-2:]
        for reviews_by_movie in reviews_by_score.values():
            for reviews in reviews_by_movie.values():
                for review in reviews:
                    # Check the cache first so cached reviews are not re-cleaned.
                    if review in cache:
                        continue
                    try:
                        # Never stem text that is about to be translated.
                        blob = clean(review, lang, stem=False)
                        cache[review] = str(blob.translate(to=target_language))
                    except Exception:
                        # Unlike a bare `except:`, this lets KeyboardInterrupt
                        # stop a long run.
                        failures += 1
    if failures:
        warnings.warn(
            "record_translations: {} review(s) could not be translated".format(failures),
            stacklevel=2,
        )

In [12]:
# Populate the module-level translation caches for every review in data_dict.
record_translations()

In [13]:
# Persist both translation caches; each file is closed as soon as its dump finishes.
with open("datasets/en2tr", "wb") as en2tr_file:
    pickle.dump(en2tr, en2tr_file)
with open("datasets/tr2en", "wb") as tr2en_file:
    pickle.dump(tr2en, tr2en_file)

In [14]:
# One randomly initialised translation matrix per experiment.
# x.dot(W) has to yield a vector comparable with the 300-d target y, so W must
# be a 300x300 matrix; with a length-300 vector x.dot(W) collapses to a scalar.
W1 = np.random.random((300, 300))
W2 = np.random.random((300, 300))
W3 = np.random.random((300, 300))

In [38]:
def _as_row_matrix(rows):
    """Stack a list, array, dict view or 0..n-1 keyed dict of vectors into a float array."""
    if isinstance(rows, dict):
        rows = [rows[k] for k in range(len(rows))]
    return np.asarray(list(rows), dtype=float)


def train_W(X, Y, W, iter=100, alpha=0.1):
    """Fit ``W`` in place by batch gradient descent so that ``x.dot(W)`` approximates ``y``.

    Each epoch steps against the mean gradient of ``0.5 * ||x.dot(W) - y||**2``
    over all pairs, i.e. ``X.T.dot(X.dot(W) - Y) / n``. Averaging keeps the
    step size independent of the number of pairs. (``np.gradient`` of the
    scalar norm returns an empty sequence, so it cannot drive the update.)

    Args:
        X: Input vectors - list, 2-D array, ``dict.values()`` view, or a dict
            keyed ``0 .. n - 1``.
        Y: Target vectors in the same order and number as ``X``.
        W: Float array of shape ``(input_dim, target_dim)``; updated in place.
        iter: Number of epochs.
        alpha: Learning rate.

    Returns:
        numpy.ndarray: The same ``W`` object, after training. With no training
        pairs ``W`` is returned unchanged.

    Raises:
        ValueError: If ``X`` and ``Y`` differ in length, are not collections
            of equal-length vectors, or do not match the shape of ``W``.
    """
    inputs = _as_row_matrix(X)
    targets = _as_row_matrix(Y)
    n_pairs = len(inputs)
    if n_pairs != len(targets):
        raise ValueError(
            "X and Y must hold the same number of vectors, got {} and {}".format(n_pairs, len(targets)))
    if n_pairs:
        if inputs.ndim != 2 or targets.ndim != 2:
            raise ValueError("X and Y must be collections of equal-length vectors")
        expected_shape = (inputs.shape[1], targets.shape[1])
        if np.shape(W) != expected_shape:
            raise ValueError(
                "W must have shape {}, got {}".format(expected_shape, np.shape(W)))

    for i in range(1, iter+1):  # 1-based so the last epoch is reported by i % 10 == 0
        error = 0
        if n_pairs:
            residual = inputs.dot(W) - targets
            # Reported error: summed Euclidean distance between prediction and target.
            error = np.linalg.norm(residual, axis=1).sum()
            W -= (alpha / n_pairs) * inputs.T.dot(residual)
        if i %10 == 0:
            print ("Epoch %d:"%i, error)
    return W

In [36]:
# Per-experiment training pairs, each an initially empty dict:
# Xn collects the input vectors, Yn the corresponding targets.
X1, Y1 = {}, {}
X2, Y2 = {}, {}
X3, Y3 = {}, {}

In [39]:
def vectorize(text, language):
    """Return the averaged word vector of ``text``, or ``None`` if it has no words.

    The text is cleaned with ``clean(text, language)`` and each word is looked
    up in ``en_vects`` (``language == "en"``) or ``tr_vects`` (anything else).
    Words missing from the vocabulary add nothing to the sum but still count
    in the denominator, so a text made only of unknown words yields a zero
    vector.

    Args:
        text: Raw text to embed.
        language: ``"en"`` selects ``en_vects``; any other value selects ``tr_vects``.

    Returns:
        numpy.ndarray | None: Vector of length ``vector_size`` of the selected
        model, or ``None`` when the cleaned text contains no words.
    """
    blob = clean(text, language)
    words = blob.words
    if len(words) < 1:
        return None

    # Use the models this notebook actually loads. Looking them up as
    # globals()["en_w2v"] raised KeyError, which the per-word handler below
    # swallowed, hiding the fact that no model was found at all.
    vectors = en_vects if language == "en" else tr_vects

    # There is no `self` in a free function; take the size from the model.
    vector = np.zeros(vectors.vector_size)
    for word in words:
        try:
            vector += vectors[word]
        except KeyError:
            # Out-of-vocabulary word.
            continue
    vector /= len(words)
    return vector

In [40]:
# for tr_doc in tr_docs:
#     en_translation = tr2en[tr_doc]
#     X1[len(X1)] = vectorize(tr_doc)
#     Y1[len(Y1)] = vectorize(en_translation)
# W1 = train_W(X1.values(), Y1.values(), W1)

# Build aligned (x, y) training pairs for W2: x is a Turkish review, y is the
# Turkish translation of an English review of the same movie with the same score.
# One pair is stored per (Turkish review, English review) combination so that
# X2 and Y2 always have the same length and entry i of one matches entry i of
# the other. Reviews that yield no vector, and English reviews without a
# cached translation, are skipped.
# The containers are emptied first so re-running this cell does not append duplicates.
X2.clear()
Y2.clear()
en_reviews_by_score = data_dict.get("en", {})
for score, tr_movies in data_dict["tr"].items():
    en_movies = en_reviews_by_score.get(score, {})
    for movie, tr_reviews in tr_movies.items():
        # Vectorize each translated English review once per movie, not once per pair.
        target_vectors = []
        for en_rev in en_movies.get(movie, []):
            if en_rev not in en2tr:
                continue
            target_vector = vectorize(en2tr[en_rev], "tr")
            if target_vector is not None:
                target_vectors.append(target_vector)
        if not target_vectors:
            continue
        for tr_rev in tr_reviews:
            source_vector = vectorize(tr_rev, "tr")
            if source_vector is None:
                continue
            for target_vector in target_vectors:
                X2[len(X2)] = source_vector
                Y2[len(Y2)] = target_vector
# dict views cannot be indexed, so hand train_W real lists.
W2 = train_W(list(X2.values()), list(Y2.values()), W2)

# for en_doc in en_docs:
#     tr_translation = en2tr[en_doc]
#     X3[len(X3)] = vectorize(tr_translation)
#     Y3[len(Y3)] = vectorize(en_doc)
# W3 = train_W(X3.values(), Y3.values(), W3)