In [1]:
import warnings, numpy as np, re, json, pandas as pd, pickle, unicodedata, textblob
# try:
#     import gnumpy as gpu
# except ModuleNotFoundError:
#     pass
from TurkishStemmer import TurkishStemmer
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim, math
from gensim.models import doc2vec
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
# from KaggleWord2VecUtility import KaggleWord2VecUtility

In [2]:
df = pd.read_csv("datasets/movie_data.csv")
df.head(5)

Unnamed: 0,Language,Movie_ID,Review,Score
0,en,-800777728,i love science fiction and i hate superheroes ...,9
1,en,-800777728,the movie is absolutely incredible all the per...,10
2,en,-1018312192,in a cinematic era dominated by reboots and mi...,8
3,en,-1018312192,movie review on rise of the planet of the apes...,4
4,en,-1018312192,during experiments to find a cure for alzheime...,7


In [3]:
d = {"tr":"tr2en", "en":"en2tr"}
en2tr = dict()
tr2en = dict()

In [4]:
en_vects = gensim.models.KeyedVectors.load_word2vec_format(r"../NLP_data/GoogleNews-vectors-negative300.bin", binary=True)

In [5]:
tr_vects = gensim.models.KeyedVectors.load_word2vec_format(r"../NLP_data/wiki.tr/wiki.tr.vec", binary=False)

In [6]:
def learn_translation_matrix(X,Y, iterations=5000, alpha=0.0001, alpha_change_rate=0.8):
    W = np.random.random((300, 300))
    for i in range(iterations+1):
        gradient = np.zeros(300)
        for score in range(len(X)):
            error = X[score].dot(W) - Y[score]
            gradient += alpha * np.gradient(error)
        W += gradient
        if i == 2000:
            alpha /= 100

        if i%1000 == 0:
            alpha *= alpha_change_rate
            print("Mikolov distance: {}".format(mikolov(X, Y, W)))
    return W

In [7]:
turkish_stemmer = TurkishStemmer()
def clean(text, language="en", stem=True):
    global turkish_stemmer
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').lower().decode("ascii")
    
    if language == "tr":
        if stem:
            text= ' '.join([turkish_stemmer.stem(w) for w in text.split()])
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r'[0-9]', '#', text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " ", text)
    text = re.sub(r"e(\s)?-(\s)?mail", "email", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    return TextBlob(text)


In [8]:
# def record_translations():
#     for lang in data_dict:
#         for score in data_dict[lang]:
#             for movie in data_dict[lang][score]:
#                 for review in data_dict[lang][score][movie]:
#                     try:
#                         blob = clean(review)
#                         if review in globals()[d[lang]]:
#                             ent = globals()[d[lang]][review]
#                         else:
#                             ent = str(blob.translate(to=d[lang][-2:]))
#                             globals()[d[lang]][review] = ent
#                     except:
#                         continue

In [9]:
# record_translations()
# pickle.dump(en2tr, open("datasets/en2tr","wb"))
# pickle.dump(tr2en, open("datasets/tr2en","wb"))

In [56]:
en2tr = pickle.load(open("datasets/en2tr","rb"))
tr2en = pickle.load(open("datasets/tr2en","rb"))

In [78]:
VECTOR_SIZE = 300
def vectorize(text, language, translate=False):
    global VECTOR_SIZE, en2tr, tr2en
    if translate:
        if language=="tr":
            try:
                text = en2tr[text]
            except KeyError:
                try:
                    translated = str(TextBlob(text).translate(to=language))
                except:
                    warnings.warn("Can't translate invalid English Review."+text[:10]+"...")
                    return None
                en2tr[text] = translated
                tr2en[translated] = text
                text = translated
        else:
            try:
                text = tr2en[text]
            except KeyError:
                try:
                    translated = str(TextBlob(text).translate(to=language))
                except:
                    warnings.warn("Can't translate invalid Turkish Review."+text[:10]+"...")
                    return None
                tr2en[text] = translated
                en2tr[translated] = text
                text = translated
                
    blob = clean(text, language)
    vector = np.zeros(VECTOR_SIZE)
    if len(blob.words) < 1:
        return None

    for word in blob.words:
        try:
            if language == "en":
                vector += globals()["en_vects"][word]
            else:
                vector += globals()["tr_vects"][word]
        except KeyError:
            continue
    vector /= len(blob.words)
    return vector

In [79]:
def get_X_size(data_dict, lang):
    res = 0
    for score in data_dict[lang]:
        for movie in data_dict[lang][score]:
                for tr_rev in data_dict[lang][score][movie]:
                    res +=1
    return res

def get_X2_size(data_dict):
    res = 0
    for score in data_dict["tr"]:
        for movie in data_dict["tr"][score]:
            try:
                for en_rev in data_dict["en"][score][movie]:
                    res += 1
            except KeyError:  ## there are no english review for that movie with the same score
                continue
    return res

In [157]:
def fit(df, train_index):
    data_dict = dict() #{language:{score: {movie_id: [rev1, rev2, ..., revn]}}}
    for i in train_index:
        row = df.loc[i]
        lang = row[0]
        movie_id = row[1]
        review = row[2]
        score = row[3]

        data_dict.setdefault(lang, {})
        data_dict[lang].setdefault(score, {})
        data_dict[lang][score].setdefault(movie_id, [])
        data_dict[lang][score][movie_id].append(review)
        
#     X1_size = get_X_size(data_dict, "tr")
#     X1 = list(np.zeros((X1_size, 300)))
#     Y1 = list(np.zeros((X1_size, 300)))

    X1 = list()
    Y1 = list()

    
#     X2_size = get_X2_size(data_dict)
#     X2 = list(np.zeros((X2_size, 300)))
#     Y2 = list(np.zeros((X2_size, 300)))
    
    X2 = list()
    Y2 = list()
    
    
#     X3_size = get_X_size(data_dict, "en")
#     X3= list(np.zeros(((X3_size, 300))))
#     Y3 = list(np.zeros(((X3_size, 300))))
    X3 = list()
    Y3 = list()
    
    y3 = list()

    
#     print(np.array(X1).shape, np.array(Y1).shape)
#     print(np.array(X2).shape, np.array(Y2).shape)
#     print(np.array(X3).shape, np.array(Y3).shape, np.array(y3).shape)
    
    
    i = 0
    j = 0
    k = 0
    for lang in data_dict:
        for score in data_dict[lang]:
            for movie in data_dict[lang][score]:
                if lang == "tr":
                    ## First Layer
                    for tr_rev in data_dict[lang][score][movie]:
                            y_ = vectorize(tr_rev, language="en", translate=True)
                            if y_ is None:
                                continue
                            X1.append(np.array(vectorize(tr_rev, language="tr")))
                            Y1.append(np.array(y_))
#                             X1[i] = np.array(vectorize(tr_rev, language="tr"))
#                             Y1[i] = np.array(y_)
                            i +=1 
                        
                    ### Second Layer
                    try:
                        for en_rev in data_dict["en"][score][movie]:
                            y_ = vectorize(en_rev, language="tr", translate=True)
                            if y_ is None:
                                continue
                            X2.append(np.array(vectorize(en_rev, language="en")))
                            Y2.append(np.array(y_))
#                             X2[j] = np.array(vectorize(en_rev, language="en"))
#                             Y2[j] = np.array(y_)
                            j += 1
                    except KeyError:  ## there are no english review for that movie with the same score
                        continue
                else:
                    ## Third Layer
                    for en_rev in data_dict[lang][score][movie]:
                        x_ = vectorize(en_rev, language="tr", translate=True)
                        if x_ is None:
                            continue
                        X3.append(np.array(x_))    
                        Y3.append(np.array(vectorize(en_rev, language="en")))
                        y3.append(np.array([score]))
#                         X3[k] = np.array(x_)
#                         Y3[k] = np.array(vectorize(en_rev, language="en"))
#                         y3[k] = np.array([score])
                        k+=1
    
    X1 = np.array(X1)
    Y1 = np.array(Y1)
    
    X2 = np.array(X2)
    Y2 = np.array(Y2)
    
    X3 = np.array(X2)
    Y3 = np.array(Y2)
    y3 = np.array(y3)
    
    print(X1.shape, Y1.shape)
    print(X2.shape, Y2.shape)
    print(X3.shape, Y3.shape, y3.shape)
    print("-"*50)
    W1 = MLPRegressor(random_state=42)
    W1.fit(X1, Y1)
    
    W2 = MLPRegressor(random_state=42)
    W2.fit(X2, Y2)
    
    W3 = MLPRegressor(random_state=42)
    W3.fit(X3, Y3)
    
    W4 = MLPClassifier(random_state=42)
    W4.fit(Y3, y3)
    return W1, W2, W3, W4

In [158]:
def predict(df, test_index, W1, W2, W3, W4):
    X = list(np.zeros(len(test_index), 300))
    y = list(np.zeros(len(test_index)))
    for i in range(len(test_index)):
        row = df.loc[test_index[i]]
        review = row[2]
        score = row[3]
        lang = row[0]
        X[i] = vectorize(review, lang)
        y[i] = score
        
    X = np.array(X)
    y = np.array(y)
    
    l1 =W1.predict(X)
    l2 = W2.predict(l1)
    l3 = W3.predict(l2)
    pred_scores = W4.predict(l3)
    return pred_scores, y
        

In [159]:
def distance_accuracy(y_true, y_predict):
    res = 0
    for i in range(len(y_true)):
        res += abs(y_true[i]-y_predict[i])
    return 1-res/(len(y_true)*len(set(y_true)))

In [160]:
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10)
X = df["Review"]
y = df["Language"]

In [161]:
for train_index, test_index in skf.split(X, y):
    W1, W2, W3, W4 = fit(df, train_index)
    preds, true = predict(df, test_index, W1, W2, W3, W4)
    for metric in score_dict:
        score_dict[metric] += globals()[metric](l_test, pred_scores)
for metric in score_dict:
    score_dict[metric] /= 10.0



(444, 300) (444, 300)
(134, 300) (134, 300)
(134, 300) (134, 300) (450, 1)
--------------------------------------------------


ValueError: Found input variables with inconsistent numbers of samples: [134, 450]

In [None]:
score_dict