In [1]:
import pickle, pandas as pd, re, numpy as np, ast, warnings

from collections import defaultdict, OrderedDict
from itertools import chain, starmap
from itertools import product
import unicodedata
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

from TurkishStemmer import TurkishStemmer
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from textblob import TextBlob

In [2]:
df = pd.read_csv("datasets/movie_data.csv")
df.head()

Unnamed: 0,Language,Movie_ID,Review,Score
0,en,-800777728,i love science fiction and i hate superheroes ...,9
1,en,-800777728,the movie is absolutely incredible all the per...,10
2,en,-1018312192,in a cinematic era dominated by reboots and mi...,8
3,en,-1018312192,movie review on rise of the planet of the apes...,4
4,en,-1018312192,during experiments to find a cure for alzheime...,7


In [3]:
en_vects = gensim.models.KeyedVectors.load_word2vec_format(r"../NLP_data/GoogleNews-vectors-negative300.bin", binary=True)

In [4]:
tr_vects = gensim.models.KeyedVectors.load_word2vec_format(r"../NLP_data/wiki.tr/wiki.tr.vec", binary=False)

In [5]:
turkish_stemmer = TurkishStemmer()
def clean(text, language="en", stem=True):
    global turkish_stemmer
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').lower().decode("ascii")
    
    if language == "tr":
        if stem:
            text= ' '.join([turkish_stemmer.stem(w) for w in text.split()])
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r'[0-9]', '#', text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " ", text)
    text = re.sub(r"e(\s)?-(\s)?mail", "email", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    return TextBlob(text)


In [11]:
VECTOR_SIZE = 300
def vectorize(text, language):
    global VECTOR_SIZE            
    blob = clean(text, language)
    vector = np.zeros(VECTOR_SIZE)
    if len(blob.words) < 1:
        return None

    for word in blob.words:
        try:
            if language == "en":
                vector += globals()["en_vects"][word]
            else:
                vector += globals()["tr_vects"][word]
        except KeyError:
            continue
    vector /= len(blob.words)
    return vector

In [14]:
def getvec(x):
    lang, rev = x.split(":::::")
    return vectorize(rev, lang)

In [15]:
def merging_function(frame):
    return np.mean(frame["rev_vec"])

In [16]:
# LMSR
def preprocess_data(df, language_column="Language", review_column="Review"):
    LMSR_df = df.copy()
    LMSR_df["lang_rev"] = LMSR_df[[language_column, review_column]].apply(lambda x: x[0]+":::::"+x[1], axis=1)
    LMSR_df["rev_vec"] = LMSR_df["lang_rev"].apply(lambda x:getvec(x))
    LMSR_df.drop(["lang_rev", "Review"], axis=1, inplace=True)
    return LMSR_df

In [1]:
def get_score_vects(LMSR_df, movie_id_column="Movie_ID", language_column="Language", score_column="Score"):
    LMS_r = LMSR_df.groupby([movie_id_column,language_column,score_column], as_index=False).apply(merging_function)
    LMS_r_df = LMS_r.reset_index().rename({0:"rev_vec"}, axis=1)
    
    merged_by_lang_and_movies = LMS_r_df.groupby(
        [language_column,
         score_column],
        as_index=False).apply(merging_function).to_frame()
    merged_by_lang_and_movies.reset_index(inplace=True)
    
    en_revs = dict()
    tr_revs = dict()
    for movie in LMS_r_df.set_index(movie_id_column).iterrows():
        vec = movie[1]["rev_vec"]
        lang = movie[1][language_column]
        score = movie[1][score_column]
        if lang == "en":
            en_revs[score] = vec
        else:
            tr_revs[score] = vec
    scores = sorted([i for i in tr_revs.keys() if i in en_revs.keys()])
    
    En_score_vecs = np.array([en_revs[sv] for sv in scores])  # English score vectors
    Tr_score_vecs = np.array([tr_revs[sv] for sv in scores])  # Turkish score vectors
    
    # Minimizing the distance between Score vectors in different languages
    W = MLPRegressor()
    W.fit(En_score_vecs, Tr_score_vecs)
    
    # Merging score vectors across languages
    scores_vects = dict()
    for score in range(len(scores)random_satate):
        scores_vects[scores[score]] = np.mean(\
            W.predict(np.atleast_2d(En_score_vecs[score])\
                    ), axis=0)
    return scores_vects

In [139]:
def distance_accuracy(y_true, y_predict):
    res = 0
    for i in range(len(y_true)):
        res += abs(y_true[i]-y_predict[i])
    return 1-res/(len(y_true)*len(set(y_true)))

In [145]:
def get_XYy(LMSR):
    X = np.zeros((len(LMSR), 300))
    Y = np.zeros((len(LMSR), 300))
    y = np.zeros((len(LMSR)))
    i = 0
    for rev in LMSR.iterrows():
        score = rev[1][2]
        rev_vec = rev[1][3]
        score_vec = rev[1][4]

        X[i] = rev_vec
        Y[i] = score_vec
        y[i] = score

        i += 1
    return X, Y, y

In [146]:
def fit(df, train_index, regressor=MLPRegressor(), classifier=MLPClassifier(), dim_reduct=PCA(), random_state=42):
    LMSR = preprocess_data(df.loc[train_index])
    score_vect_dicts = get_score_vects(LMSR)
    LMSR["score_vec"] = LMSR["Score"].apply(lambda x: score_vect_dicts[x] if x in score_vect_dicts else np.NaN)
    LMSR.dropna(inplace=True)
    
    X, Y, y = get_XYy(LMSR)
    
    regressor. random_state = random_state
    classifier.random_state = random_state
    dim_reduct.random_state = random_state
    
    if dim_reduct is not None:
        dim_reduct.fit(X)
        dim_reduct.transform(X)
    regressor.fit(X, Y)
    classifier.fit(Y, y)
    return regressor, classifier

In [150]:
def predict(df, test_index, regressor, classifier):
    LMSR = preprocess_data(df.loc[test_index])
    score_vect_dicts = get_score_vects(LMSR)
    LMSR["score_vec"] = LMSR["Score"].apply(lambda x: score_vect_dicts[x] if x in score_vect_dicts else np.NaN)
    LMSR.dropna(inplace=True)
    
    X, Y, y = get_XYy(LMSR)
    
    preds_score_vecs = regressor.predict(X)
    pred_scores = classifier.predict(preds_score_vecs)
    
    return pred_scores, y

In [151]:
skf = StratifiedKFold(n_splits=10)

In [160]:
NUM_TRIALS = 10
scores = list(np.zeros(NUM_TRIALS))
for i in range(NUM_TRIALS):
    print("Trial:\t{}".format(i+1))
    score_dict = {"distance_accuracy":0}
    k = 0
    for train_index, test_index in skf.split(df["Review"], df["Language"]):
        regressor, classifier = fit(df, train_index, random_state=i)
        preds, true = predict(df, test_index, regressor, classifier)
        s = distance_accuracy(true, preds)
        score_dict["distance_accuracy"] += s
        k += 1
        print("K:\t{}\nScore:\t{}".format(k, s))
    score_dict["distance_accuracy"] /= 10.0
    scores[i] = score_dict["distance_accuracy"]
    print("*"*10)
    print("Trial{} avg score:\t {}".format(i, score_dict["distance_accuracy"]))
    print("-"*30)

Trial:	1




K:	1
Score:	0.7774436090225564
K:	2
Score:	0.7447916666666667
K:	3
Score:	0.7601010101010102
K:	4
Score:	0.7868480725623583
K:	5
Score:	0.7563775510204082
K:	6
Score:	0.6390168970814132
K:	7
Score:	0.7430555555555556
K:	8
Score:	0.781786941580756
K:	9
Score:	0.7453416149068324
K:	10
Score:	0.7195121951219512
**********
Trial0 avg score:	 0.7454275113619508
------------------------------
Trial:	2
K:	1
Score:	0.7443609022556391
K:	2
Score:	0.72265625
K:	3
Score:	0.7550505050505051
K:	4
Score:	0.7868480725623583
K:	5
Score:	0.5051020408163265
K:	6
Score:	0.7572964669738863
K:	7
Score:	0.7135416666666667
K:	8
Score:	0.7474226804123711
K:	9
Score:	0.7888198757763976
K:	10
Score:	0.7286585365853658
**********
Trial1 avg score:	 0.7249756997099517
------------------------------
Trial:	3
K:	1
Score:	0.7669172932330828
K:	2
Score:	0.7135416666666667
K:	3
Score:	0.7815656565656566
K:	4
Score:	0.8106575963718821
K:	5
Score:	0.7206632653061225
K:	6
Score:	0.7511520737327189
K:	7
Score:	0.673611111

In [162]:
stats = pd.DataFrame(list(scores))
stats.describe()

Unnamed: 0,0
count,10.0
mean,0.741597
std,0.012171
min,0.723515
25%,0.730468
50%,0.744698
75%,0.750284
max,0.755527
