In [1]:
import pandas as pd, numpy as np, ast, re, pickle, ast
np.random.seed(42)

In [2]:
def parse_np_array(array_string, as_nparray=True):
    """Parse the printed text of a (possibly nested) numeric array.

    Whitespace that separates two elements (``"[1 2 3]"``) or two sub-arrays
    (``"[[1 2]\n [3 4]]"`` style) is turned into a comma so the text becomes a
    Python literal, which is then evaluated with ``ast.literal_eval``.
    Text that is already comma-separated (``"[1, 2, 3]"``) is left untouched
    instead of being given a second comma.

    Parameters
    ----------
    array_string : str
        Bracketed text to parse.
    as_nparray : bool, default True
        When true the parsed value is wrapped in ``np.array``; otherwise the
        plain (nested) Python list is returned.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is still not a
        valid literal after the separators are inserted.
    """
    # Commas are excluded from both character classes so an existing ", "
    # separator is never rewritten into ",,".
    pattern = r'''# Match (mandatory) whitespace between...
              (?<=\]) # ] and
              \s+
              (?= \[) # [, or
              |
              (?<=[^\[\]\s,])
              \s+
              (?= [^\[\]\s,]) # two non-bracket, non-comma, non-whitespace characters
           '''
    fixed_string = re.sub(pattern, ',', array_string, flags=re.VERBOSE)
    parsed = ast.literal_eval(fixed_string)
    return np.array(parsed) if as_nparray else parsed

In [3]:
# Read the per-movie review table from disk.
df = pd.read_csv("datasets/LMS_r_merged_reviews_per_movie_language_score.csv")

# Parse only the cells that are strings containing a "["; every other cell
# is replaced by None.
df["merged_reviews_vector"] = df["merged_reviews_vector"].apply(
    lambda raw: parse_np_array(raw) if type(raw) is str and "[" in raw else None
)
df.head(5)

Unnamed: 0,Movie_ID,Language,Score,merged_reviews_vector
0,-2144779484,en,4,"[0.0766620412, 0.0301413387, 0.022697458, 0.08..."
1,-2144779484,en,9,"[0.05562439, 0.0306321, -0.00087419, 0.0592047..."
2,-2144779484,tr,7,"[-0.108544111, 0.235523254, 0.111588225, -0.09..."
3,-2144779484,tr,9,"[0.0311885718, 0.141891941, 0.0640149638, 0.15..."
4,-2104441082,en,10,"[0.02883458, 0.03170469, 0.00097861, 0.0681932..."


In [4]:
# Non-null cell counts per column for every (Language, Score) combination.
df.groupby(by=["Language", "Score"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Movie_ID,merged_reviews_vector
Language,Score,Unnamed: 2_level_1,Unnamed: 3_level_1
en,1,17,17
en,2,8,8
en,3,14,14
en,4,18,18
en,5,25,25
en,6,26,26
en,7,43,43
en,8,52,52
en,9,75,75
en,10,111,111


## Merging all movies with the same score for each language

In [5]:
def merging_function(frame):
    """Reduce the ``merged_reviews_vector`` column of ``frame`` with ``np.mean``.

    Intended as the per-group callback for ``groupby(...).apply``: it receives
    one group's rows and returns whatever ``np.mean`` yields for that column.
    """
    review_vectors = frame["merged_reviews_vector"]
    return np.mean(review_vectors)

In [6]:
# Apply merging_function to the rows of every (Language, Score) group and
# keep the result as a one-column frame.
merged_by_lang_and_movies = (
    df.groupby(["Language", "Score"], as_index=False)
    .apply(merging_function)
    .to_frame()
)
merged_by_lang_and_movies

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Language,Score,Unnamed: 2_level_1
en,1,"[0.0564090564882, 0.0310473006471, 0.001661979..."
en,2,"[0.056524475, 0.03135834625, 0.00357995875, 0...."
en,3,"[0.0568037516143, 0.0378051802429, 0.002676630..."
en,4,"[0.0593783255667, 0.0355334800222, 0.003506247..."
en,5,"[0.0576580418, 0.034522828712, 0.0008603620527..."
en,6,"[0.0546367849654, 0.0346027090462, 0.002850217..."
en,7,"[0.0532821751884, 0.0347177720233, -2.28056106..."
en,8,"[0.0559765238558, 0.0362633347712, 0.000232104..."
en,9,"[0.0541100169933, 0.0346257522563, 0.001568747..."
en,10,"[0.0540575701784, 0.0336265408847, 0.002325572..."


In [7]:
# Move the group keys out of the index and back into ordinary columns.
merged_by_lang_and_movies = merged_by_lang_and_movies.reset_index()

In [8]:
# Number of distinct Movie_ID values in the table.
"There are {} movies".format(df["Movie_ID"].nunique())

'There are 224 movies'

# Minimizing the distance between Score vectors in different languages

In [9]:
def mikolov(X, Y, W):
    """Return sum_i ||W . X[i] - Y[i]||^2 over the first ``len(X)`` pairs.

    This is the objective a translation matrix ``W`` should minimise: each
    ``X[i]`` is mapped through ``W`` and compared with ``Y[i]`` using the
    squared Euclidean norm.
    """
    return sum(
        np.linalg.norm(W.dot(X[idx]) - Y[idx]) ** 2
        for idx in range(len(X))
    )

In [10]:
# Map each score to a review vector, separately for English and for the rest.
# NOTE(review): rows that share a language and a score overwrite one another,
# so only the last such row's vector survives, and the per-score means built
# in merged_by_lang_and_movies are not used here - confirm this is intended.
en_revs, tr_revs = {}, {}
for movie_id, row in df.set_index("Movie_ID").iterrows():
    # Every language other than "en" is stored in the Turkish dictionary.
    target = en_revs if row["Language"] == "en" else tr_revs
    target[row["Score"]] = row["merged_reviews_vector"]

In [11]:
def learn_translation_matrix(X,Y, iterations=5000, alpha=0.0001, alpha_change_rate=0.8):
    """Learn a linear map ``W`` such that ``W.dot(X[i])`` approximates ``Y[i]``.

    Runs batch gradient descent on the least-squares objective
    ``sum_i ||W . X[i] - Y[i]||^2`` and prints that objective on every
    1000th iteration.

    Parameters
    ----------
    X : array-like, shape (n, d_in)
        Source vectors, one per row.
    Y : array-like, shape (n, d_out)
        Target vectors; row ``i`` is the target for ``X[i]``.
    iterations : int, default 5000
        The loop runs ``iterations + 1`` gradient steps.
    alpha : float, default 0.0001
        Initial step size.
    alpha_change_rate : float, default 0.8
        Factor applied to ``alpha`` on every 1000th iteration (including 0).
        Independently of it, ``alpha`` is divided by 100 at iteration 2000.

    Returns
    -------
    numpy.ndarray, shape (d_out, d_in)
        The learned matrix; apply it to a vector ``x`` as ``W.dot(x)``.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` are not 2-D or do not have the same number of rows.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    if X.ndim != 2 or Y.ndim != 2 or X.shape[0] != Y.shape[0]:
        raise ValueError("X and Y must be 2-D arrays with the same number of rows")

    # Shape follows the data instead of a hard-coded 300 x 300.
    W = np.random.random((Y.shape[1], X.shape[1]))
    for i in range(iterations+1):
        # Row k of `residuals` is W.dot(X[k]) - Y[k].
        residuals = X.dot(W.T) - Y
        # d/dW sum_k ||W x_k - y_k||^2 = 2 * residuals^T X; step downhill.
        W -= alpha * 2.0 * residuals.T.dot(X)
        if i == 2000:
            alpha /= 100

        if i%1000 == 0:
            alpha *= alpha_change_rate
            # Objective after this step, evaluated with the updated W.
            distance = np.sum((X.dot(W.T) - Y) ** 2)
            print("Mikolov distance: {}".format(distance))
    return W

In [12]:
# Scores that have a vector in both languages, in ascending order.
scores = sorted(tr_revs.keys() & en_revs.keys())

In [13]:
# Build one array per language whose k-th entry is the vector for scores[k]:
# first the English score vectors, then the Turkish ones.
En_score_vecs, Tr_score_vecs = (
    np.array([revs[score_key] for score_key in scores])
    for revs in (en_revs, tr_revs)
)

In [15]:
from sklearn.neural_network import MLPRegressor

In [16]:
# Fit a default MLPRegressor that maps English score vectors to Turkish ones;
# the trailing expression displays the fitted estimator.
W = MLPRegressor().fit(En_score_vecs, Tr_score_vecs)
W

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

# Merging score vectors across languages

In [20]:
def merge_cross_lingual_score_vectors(En_score_vecs, Tr_score_vecs, scores, W):
    """Merge the English and Turkish vector of every score into one vector.

    Each English vector is first translated with ``W.predict``; the merged
    vector for a score is the element-wise mean of that translated vector and
    the Turkish vector for the same score.

    Parameters
    ----------
    En_score_vecs, Tr_score_vecs : array-like
        One vector per row; row ``k`` of each belongs to ``scores[k]``.
    scores : sequence
        Labels used as keys of the returned dictionary.
    W : object with a ``predict`` method
        Fitted model that maps a batch of English vectors to Turkish space.

    Returns
    -------
    dict
        ``{score: merged vector}`` with one entry per element of ``scores``.

    Raises
    ------
    ValueError
        If the two vector collections and ``scores`` differ in length.
    """
    if not (len(En_score_vecs) == len(Tr_score_vecs) == len(scores)):
        raise ValueError("En_score_vecs, Tr_score_vecs and scores must have equal length")

    # Translate all English vectors in one call; row k corresponds to scores[k].
    translated = W.predict(np.atleast_2d(En_score_vecs))

    labeled_vecs = dict()
    for label, en_as_tr, tr_vec in zip(scores, translated, Tr_score_vecs):
        # Average the translated English vector with its Turkish counterpart.
        labeled_vecs[label] = np.mean(np.array([en_as_tr, tr_vec]), axis=0)
    return labeled_vecs

In [21]:
# Build the {score: vector} dictionary from the fitted model and both languages.
labeled_vecs = merge_cross_lingual_score_vectors(
    En_score_vecs=En_score_vecs,
    Tr_score_vecs=Tr_score_vecs,
    scores=scores,
    W=W,
)

In [None]:
# Persist the score-vector dictionary; the context manager closes the file
# (and flushes it to disk) even if pickling raises.
with open("score_vectors_dict", "wb") as score_file:
    pickle.dump(labeled_vecs, score_file)