In [1]:
import pandas as pd, numpy as np, ast, re

In [2]:
df = pd.read_csv("datasets/LMS_r_merged_reviews_per_movie_language_score.csv")
df.head(5)

Unnamed: 0,Movie_ID,Language,Score,merged_reviews_vector
0,-2144779484,en,4,[ 0.07401991 0.04054779 0.03553367 0.070901...
1,-2144779484,en,9,[ 6.25433506e-02 3.19293781e-02 3.0851420...
2,-2144779484,tr,7,[-0.04627872 -0.12108444 0.12996513 0.056607...
3,-2144779484,tr,9,[-0.02786085 -0.07367174 0.07948475 0.034825...
4,-2104441082,en,10,[ 2.95819733e-02 3.01975626e-02 1.0135583...


In [3]:
merged_by_lang_and_movies = df.groupby(["Language","Score"]).count()
merged_by_lang_and_movies

Unnamed: 0_level_0,Unnamed: 1_level_0,Movie_ID,merged_reviews_vector
Language,Score,Unnamed: 2_level_1,Unnamed: 3_level_1
en,1,17,17
en,2,8,8
en,3,14,14
en,4,18,18
en,5,25,25
en,6,26,26
en,7,43,43
en,8,52,52
en,9,75,75
en,10,111,111


In [4]:
"There are {} movies".format(len(df.groupby("Movie_ID")))

'There are 224 movies'

## Merging all movies with the same score for each language

In [5]:
# len(df.groupby(["Language", "Score"])["merged_reviews_vector"])

In [6]:
def mikolov(X, Y, W):
    # min_W  for each i    ||W.x(i) - y(i)||^2
    result = 0
    for score in range(len(X)):
        result += np.linalg.norm(W.dot(X[score]) - Y[score])**2
    return result

In [7]:
en_revs = dict()
tr_revs = dict()
for movie in df.set_index("Movie_ID").iterrows():
    vec = movie[1]["merged_reviews_vector"]
    lang = movie[1]["Language"]
    score = movie[1]["Score"]
    if lang == "en":
        en_revs[score] = vec
    else:
        tr_revs[score] = vec

In [8]:
import ast
def parse_np_array(array_string, as_nparray=True):
    pattern = r'''# Match (mandatory) whitespace between...
              (?<=\]) # ] and
              \s+
              (?= \[) # [, or
              |
              (?<=[^\[\]\s]) 
              \s+
              (?= [^\[\]\s]) # two non-bracket non-whitespace characters
           '''
    fixed_string = re.sub(pattern, ',', array_string, flags=re.VERBOSE)
    if as_nparray:
        return np.array(ast.literal_eval(fixed_string))
    return ast.literal_eval(fixed_string)

In [9]:
def learn_translation_matrix(X,Y, iterations=20000, alpha=0.0001, alpha_change_rate=0.8):
    np.random.seed(42)
    W = np.random.random((300, 300))
    for i in range(iterations):
        gradient = np.zeros(300)
        for score in range(len(X)):
            error = X[score].dot(W) - Y[score]
            gradient += alpha * np.gradient(error)
        W += gradient
        if i == 2000:
            alpha /= 100

        if i%1000 == 0:
            alpha *= alpha_change_rate
            print("Mikolov distance: {}".format(mikolov(X, Y, W)))
    return W

In [10]:
X = np.array([parse_np_array(en_revs[sv]) for sv in sorted(tr_revs.keys())])
Y = np.array([parse_np_array(tr_revs[sv]) for sv in sorted(tr_revs.keys())])

In [11]:
W = learn_translation_matrix(X, Y)

Mikolov distance: 449.9491346976322
Mikolov distance: 311.4378905734043
Mikolov distance: 279.6540619318195
Mikolov distance: 279.74430060538106
Mikolov distance: 279.8207080471185
Mikolov distance: 279.8845357392851
Mikolov distance: 279.9373286290398
Mikolov distance: 279.9806714415886
Mikolov distance: 280.01605555633625
Mikolov distance: 280.0448173785205
Mikolov distance: 280.06811784670964
Mikolov distance: 280.08694452473344
Mikolov distance: 280.10212513043473
Mikolov distance: 280.1143459583688
Mikolov distance: 280.1241714880865
Mikolov distance: 280.1320631908741
Mikolov distance: 280.13839657366697
Mikolov distance: 280.1434760940823
Mikolov distance: 280.1475479120136
Mikolov distance: 280.150810615649
