In [1]:
import pandas as pd, numpy as np, pickle, ast, unicodedata, re
from googletrans import Translator
from TurkishStemmer import TurkishStemmer
import gensim

In [2]:
# Load the pre-tokenized review dataset and preview its first rows
# (DataFrame.head() shows five rows by default).
df = pd.read_csv("datasets/tokenized_reviews.csv")
df.head()

Unnamed: 0,Language,Movie_ID,Review,Score,tokenized_reviews
0,en,-800777728,i love science fiction and i hate superheroes ...,9,"['love', 'science', 'fiction', 'hate', 'superh..."
1,en,-800777728,the movie is absolutely incredible all the per...,10,"['the', 'movie', 'absolutely', 'incredible', '..."
2,en,-1018312192,in a cinematic era dominated by reboots and mi...,8,"['cinematic', 'era', 'dominated', 'reboots', '..."
3,en,-1018312192,movie review on rise of the planet of the apes...,4,"['movie', 'review', 'rise', 'the', 'planet', '..."
4,en,-1018312192,during experiments to find a cure for alzheime...,7,"['during', 'experiments', 'find', 'cure', 'for..."


In [3]:
# Number of rows (reviews) in the loaded frame.
df.shape[0]

1000

In [4]:
# Ordered (pattern, replacement) rules applied by clean(). Order matters:
# contraction rules need the apostrophe, so they must run BEFORE the rule
# that replaces "'" with a space.
_CLEAN_RULES = (
    # Whitelist filter: anything outside the class becomes a space.
    # NOTE: "+-=" inside the class is a character RANGE ('+' .. '='), so it
    # also lets ':', ';' and '<' through. Left unchanged on purpose because
    # the ':' rule below relies on colons surviving this step.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    # Mask every digit.
    (r'[0-9]', '#'),

    # Contraction expansion (text is already lowercase at this point).
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),

    # Punctuation: either dropped or padded with spaces so that it becomes
    # a separate token after str.split().
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r":", " : "),

    # Re-join tokens that the punctuation rules split apart.
    (r"e - mail", "email"),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
)


def clean(tweet):
    """Normalize a raw text string into lowercase ASCII ready for ``str.split()``.

    The input is NFKD-normalized and encoded to ASCII with ``errors='ignore'``,
    so accents are stripped and any character without an ASCII decomposition
    is dropped. The result is lowercased and then passed through the ordered
    regex substitutions in ``_CLEAN_RULES``: whitelist filtering, digit
    masking (``#``), contraction expansion, punctuation splitting/removal and
    a few abbreviation fix-ups.

    Contractions are expanded *before* apostrophes are removed; with the
    previous ordering the apostrophe was stripped first, so rules such as
    ``can't -> cannot`` could never match.

    Parameters
    ----------
    tweet : str
        Raw text to normalize.

    Returns
    -------
    str
        The cleaned text. It may contain runs of spaces; callers are expected
        to tokenize with ``split()``.
    """
    text = unicodedata.normalize('NFKD', tweet).encode('ascii', 'ignore').lower().decode("ascii")
    for pattern, replacement in _CLEAN_RULES:
        text = re.sub(pattern, replacement, text)
    return text

In [5]:
# English embeddings, read from a binary word2vec-format file.
en_vects = gensim.models.KeyedVectors.load_word2vec_format(
    r"GoogleNews-vectors-negative300.bin",
    binary=True,
)

In [6]:
# Turkish embeddings, read from a text (.vec) word2vec-format file.
tr_vects = gensim.models.KeyedVectors.load_word2vec_format(
    r"wiki.tr/wiki.tr.vec",
    binary=False,
)
# tr_vects = gensim.models.Word2Vec.load("wiki.tr/wiki.tr.bin")

In [12]:
# Load the word -> cluster lookup tables for both languages.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# The context managers close each file handle as soon as loading finishes
# (the bare open(...) calls used previously were never closed).
with open("datasets/en_word2cluster.pickle", "rb") as en_file:
    en_word2cluster = pickle.load(en_file)
with open("datasets/tr_word2cluster.pickle", "rb") as tr_file:
    tr_word2cluster = pickle.load(tr_file)

In [7]:
# Shared helper instances used by vectorize():
# a Turkish stemmer and a googletrans client.
stemmer = TurkishStemmer()
translator = Translator()

In [52]:
def vectorize(df, translate_freq=0.75):
    """Build averaged word-vector and cluster-vector features for each review in ``df``.

    For every row, the ``Review`` text is cleaned with ``clean`` and split into
    tokens. Each token contributes to two 300-dimensional running sums: one
    looked up in the word-embedding tables (``en_vects`` / ``tr_vects``) and one
    looked up in the cluster tables (``en_word2cluster`` / ``tr_word2cluster``).
    Both sums are divided by the number of tokens in the review.

    A token is machine-translated into the other language when a uniform random
    draw is ``>= translate_freq``; otherwise it is looked up in its source
    language. Note that this means a token is translated with probability
    ``1 - translate_freq`` (0.25 with the default). The draw uses NumPy's global
    random state, so seed ``np.random`` beforehand for reproducible output.

    Tokens whose lookup or translation fails are skipped, but they still count
    in the denominator of the average.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Language`` (``"en"`` or ``"tr"``),
        ``Review`` (str) and ``Score``.
    translate_freq : float, default 0.75
        Threshold for the random draw described above.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index) with the columns ``Score``,
        ``bow_word2vec`` and ``bow_clust2vec``; the last two hold NumPy arrays.

    Raises
    ------
    KeyError
        If a row's ``Language`` is neither ``"en"`` nor ``"tr"``.
    """
    inv_langs = {"en": "tr", "tr": "en"}
    new_df = dict()
    for idx, row in df.iterrows():
        src = row["Language"]
        score = row["Score"]
        trgt = inv_langs[src]
        tokens = clean(row["Review"]).split()

        vector = np.zeros(300)
        clust_vector = np.zeros(300)

        for word in tokens:
            try:
                if np.random.random() >= translate_freq:
                    translated_word = clean(translator.translate(stemmer.stem(word), src=src, dest=trgt).text)
                    if trgt == 'tr':  ## English > Turkish
                        vector += tr_vects[translated_word]
                        clust_vector += tr_word2cluster[translated_word]
                    else:  ## Turkish > English
                        vector += en_vects[translated_word]
                        clust_vector += en_word2cluster[translated_word]
                elif src == 'tr':
                    stem = stemmer.stem(word)
                    vector += tr_vects[stem]
                    clust_vector += tr_word2cluster[stem]
                else:
                    vector += en_vects[word]
                    # NOTE(review): the cluster lookup stems the English token
                    # with the Turkish stemmer while the embedding lookup uses
                    # the raw token - confirm this asymmetry is intended.
                    clust_vector += en_word2cluster[stemmer.stem(word)]
            except Exception:
                # Best effort: out-of-vocabulary tokens (KeyError) and
                # translation failures (e.g. JSONDecodeError) are skipped.
                # `except Exception` (not a bare except) lets Ctrl-C through.
                continue

        # Store the result for THIS row. Previously this statement sat outside
        # the loop, so only the last review ever reached the output.
        # max(..., 1) avoids a 0/0 -> NaN vector for reviews with no tokens.
        n_tokens = max(len(tokens), 1)
        new_df[idx] = [score, vector / n_tokens, clust_vector / n_tokens]
    return pd.DataFrame.from_dict(new_df, 'index').rename({0:"Score", 1:"bow_word2vec", 2:"bow_clust2vec"}, axis=1)

In [None]:
# Featurize every review, leaving translate_freq at its default.
bow_df = vectorize(df=df)

In [None]:
# Persist the feature table. to_csv is a DataFrame method - pandas has no
# module-level pd.to_csv, so the previous call raised AttributeError.
bow_df.to_csv("labeled_bow.csv", index_label="ID")

In [None]:
# def vectorize_clust(review, src, trgt):
#     global en_word2cluster, tr_word2cluster
#     tokens = clean(review).split()
#     vectors = np.zeros(300)
#     for word in tokens:
#         try:
#             if np.random.random() >= 0.5:
#                 if trgt == 'tr':  ## English > Turkish
#                     vectors += tr_word2cluster[clean(translator.translate(stemmer.stem(word), src=src, dest=trgt).text)]
#                 else:  ## Turkish > English
#                     vectors += en_word2cluster[clean(translator.translate(word, src=src, dest=trgt).text)]
#             elif src == 'tr':
#                 vectors += tr_word2cluster[stemmer.stem(word)]
#             else:
#                 vectors += en_word2cluster[word]
#         except:  # Keyerror or JSONDecodeError
#             continue
#     return vectors/len(tokens)