In [1]:
import warnings, numpy as np, re, json, pandas as pd, pickle, unicodedata
# try:
#     import gnumpy as gpu
# except ModuleNotFoundError:
#     pass
from TurkishStemmer import TurkishStemmer
from joblib import Parallel, delayed

warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim, math, time
import tensorflow as tf
from gensim.models import doc2vec
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
# from KaggleWord2VecUtility import KaggleWord2VecUtility

  from ._conv import register_converters as _register_converters


In [2]:
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
    
reset_graph()

In [3]:
df = pd.read_csv("datasets/movie_data.csv")
df.head(5)

Unnamed: 0,Language,Movie_ID,Review,Score
0,en,-800777728,i love science fiction and i hate superheroes ...,9
1,en,-800777728,the movie is absolutely incredible all the per...,10
2,en,-1018312192,in a cinematic era dominated by reboots and mi...,8
3,en,-1018312192,movie review on rise of the planet of the apes...,4
4,en,-1018312192,during experiments to find a cure for alzheime...,7


In [4]:
d = {"tr":"tr2en", "en":"en2tr"}
en2tr = dict()
tr2en = dict()

In [5]:
en_vects = gensim.models.KeyedVectors.load_word2vec_format(r"../NLP_data/GoogleNews-vectors-negative300.bin", binary=True)

In [6]:
tr_vects = gensim.models.KeyedVectors.load_word2vec_format(r"../NLP_data/wiki.tr/wiki.tr.vec", binary=False)

In [7]:
def learn_translation_matrix(X,Y, iterations=5000, alpha=0.0001, alpha_change_rate=0.8):
    W = np.random.random((300, 300))
    for i in range(iterations+1):
        gradient = np.zeros(300)
        for score in range(len(X)):
            error = X[score].dot(W) - Y[score]
            gradient += alpha * np.gradient(error)
        W += gradient
        if i == 2000:
            alpha /= 100

        if i%1000 == 0:
            alpha *= alpha_change_rate
            print("Mikolov distance: {}".format(mikolov(X, Y, W)))
    return W

In [8]:
turkish_stemmer = TurkishStemmer()
def clean(text, language="en", stem=True):
    global turkish_stemmer
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').lower().decode("ascii")
    
    if language == "tr":
        if stem:
            text= ' '.join([turkish_stemmer.stem(w) for w in text.split()])
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r'[0-9]', '#', text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " ", text)
    text = re.sub(r"e(\s)?-(\s)?mail", "email", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    return TextBlob(text)


In [9]:
# def record_translations():
#     for lang in data_dict:
#         for score in data_dict[lang]:
#             for movie in data_dict[lang][score]:
#                 for review in data_dict[lang][score][movie]:
#                     try:
#                         blob = clean(review)
#                         if review in globals()[d[lang]]:
#                             ent = globals()[d[lang]][review]
#                         else:
#                             ent = str(blob.translate(to=d[lang][-2:]))
#                             globals()[d[lang]][review] = ent
#                     except:
#                         continue

In [10]:
# record_translations()
# pickle.dump(en2tr, open("datasets/en2tr","wb"))
# pickle.dump(tr2en, open("datasets/tr2en","wb"))

In [11]:
en2tr = pickle.load(open("datasets/en2tr","rb"))
tr2en = pickle.load(open("datasets/tr2en","rb"))

In [12]:
VECTOR_SIZE = 300
def vectorize(text, language, translate=False):
    global VECTOR_SIZE, en2tr, tr2en
    if translate:
        if language=="tr":
            try:
                text = en2tr[text]
            except KeyError:
                try:
                    translated = str(TextBlob(text).translate(to=language))
                except:
                    warnings.warn("Can't translate invalid English Review."+text[:10]+"...")
                    return None
                en2tr[text] = translated
                tr2en[translated] = text
                text = translated
        else:
            try:
                text = tr2en[text]
            except KeyError:
                try:
                    translated = str(TextBlob(text).translate(to=language))
                except:
                    warnings.warn("Can't translate invalid Turkish Review."+text[:10]+"...")
                    return None
                tr2en[text] = translated
                en2tr[translated] = text
                text = translated
                
    blob = clean(text, language)
    vector = np.zeros(VECTOR_SIZE)
    if len(blob.words) < 1:
        return None

    for word in blob.words:
        try:
            if language == "en":
                vector += globals()["en_vects"][word]
            else:
                vector += globals()["tr_vects"][word]
        except KeyError:
            continue
    vector /= len(blob.words)
    return vector

In [13]:
def get_X_size(data_dict, lang):
    res = 0
    for score in data_dict[lang]:
        for movie in data_dict[lang][score]:
                for tr_rev in data_dict[lang][score][movie]:
                    res +=1
    return res

def get_X2_size(data_dict):
    res = 0
    for score in data_dict["tr"]:
        for movie in data_dict["tr"][score]:
            try:
                for en_rev in data_dict["en"][score][movie]:
                    res += 1
            except KeyError:  ## there are no english review for that movie with the same score
                continue
    return res

In [14]:
def get_data_dict(df_train):
    data_dict = dict()  # {(score, movie_id): {language: [reviews]}}
    for _, row in df_train.iterrows():
        lang = row[0]
        movie_id = row[1]
        review = row[2]
        score = row[3]

        data_dict.setdefault((score, movie_id), dict())
        data_dict[(score, movie_id)].setdefault(lang, list())
        data_dict[(score, movie_id)][lang].append(review)
    return data_dict

In [15]:
def one_hot(n, num_classes=10):
        s = np.zeros(num_classes)
        s[n-1] = 1
        return s

In [16]:
def dict2array(d):
    return np.array([d[i] for i in sorted(d)])

In [17]:
SOURCE = "en"
TARGET = "tr"
def get_training_batch(data_dict):
    """
    data_dict: # {(score, movie_id): {language: [reviews]}}
    """
    X1_dict = dict()   # Turkish reviews
    X2_dict = dict()   # English translation of the Turkish reviews
    X3_dict = dict()   # Turkish translation of the English reviews
    X4_dict = dict()   # English reviews
    Y_dict = dict()    # score

    for i, k in enumerate(data_dict):
        # k (score, movie_id)
        # data_dict[k] = {language: [reviews]}
        try:
            source_reviews = data_dict[k][SOURCE]
            target_reviews = data_dict[k][TARGET]
        except KeyError:
            # There is no similar scores for that movie in both languages, alignment not possible
            continue
        for rev in range(max(len(source_reviews), len(target_reviews))):
            X2_ = vectorize(target_reviews[rev%len(target_reviews)], language=SOURCE, translate=True)
            X3_ = vectorize(source_reviews[rev%len(source_reviews)], language=TARGET, translate=True)
            translation_error = False
            for v in [X2_, X3_]:
                if v is None:
                    translation_error = True
                    break
            if translation_error: continue
            X1_dict[i+rev] = vectorize(target_reviews[rev%len(target_reviews)], language=TARGET)
            X2_dict[i+rev] = X2_
            X3_dict[i+rev] = X3_
            X4_dict[i+rev] = vectorize(source_reviews[rev%len(source_reviews)], language=SOURCE)
            Y_dict[i+rev] = one_hot(k[0])
    
    X1, X2, X3, X4, Y = Parallel(n_jobs=-1)(delayed(dict2array)
                                         (v)
                                         for v in [X1_dict, X2_dict, X3_dict, X4_dict, Y_dict])
    
    return X1, X2, X3, X4, Y

In [18]:
def fit(df_train, learning_rate=0.1, epochs=100):
    data_dict = get_data_dict(df_train) # {(score, movie_id): {language: [reviews]}}
    training_curve = dict()
    X1 = tf.placeholder(tf.float32, [None, 300])
    X2 = tf.placeholder(tf.float32, [None, 300])
    X3 = tf.placeholder(tf.float32, [None, 300])
    X4 = tf.placeholder(tf.float32, [None, 300])
    Y = tf.placeholder(tf.float32, [None, 10])
    
    W1 = tf.Variable(tf.zeros([300, 300]))
    b1 = tf.Variable(tf.zeros([300]))
    
    W2 = tf.Variable(tf.zeros([300, 300]))
    b2 = tf.Variable(tf.zeros([300]))
    
    W3 = tf.Variable(tf.zeros([300, 300]))
    b3 = tf.Variable(tf.zeros([300]))
    
    W4 = tf.Variable(tf.zeros([300, 10]))
    b4 = tf.Variable(tf.zeros([10]))
    
    
    pred = tf.nn.softmax(tf.matmul(X4, W4) + b4) # Softmax
    
    
    cost1 = tf.reduce_sum(tf.square(tf.nn.sigmoid(tf.matmul(X1, W1) + b1 - X2)))
    cost2 = tf.reduce_sum(tf.square(tf.nn.sigmoid(tf.matmul(X2, W2) + b2 - X3)))
    cost3 = tf.reduce_sum(tf.square(tf.sigmoid(tf.matmul(X3, W3) + b3 - X4)))
    cost4 = tf.reduce_mean(-tf.reduce_sum(Y*tf.log(pred), reduction_indices=1))
    cost = cost1 + cost2 + cost3 + cost4
    
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    
    X1_, X2_, X3_, X4_, Y_ = get_training_batch(data_dict)
    assert len(X1_) == len(X2_) == len(X3_) == len(X4_) == len(Y_)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for e in range(epochs):
            avg_cost = 0.
            for i in range(len(X1_)):
                _, c, w1, w2, w3, w4 = sess.run([optimizer, cost, W1, W2, W3, W4],
                                                feed_dict={X1: np.atleast_2d(X1_[i]),
                                                           X2: np.atleast_2d(X2_[i]),
                                                           X3: np.atleast_2d(X3_[i]),
                                                           X4: np.atleast_2d(X4_[i]),
                                                           Y: np.atleast_2d(Y_[i])})
                avg_cost += c
            avg_cost /= len(df_train)
            training_curve[e] = avg_cost
            if e%10==0:
                print("Epoch {}: {}".format(e, avg_cost/len(df_train)))

    return w1, w2, w3, w4, training_curve

In [19]:
def predict(LSMR, W1, W2, W3, W4):
    reset_graph()
    
    x = tf.placeholder(tf.float32, [None, 300])

    w1 = tf.placeholder(tf.float32, [300, 300])
    w2 = tf.placeholder(tf.float32, [300, 300])
    w3 = tf.placeholder(tf.float32, [300, 300])
    w4 = tf.placeholder(tf.float32, [300, 10])

    b1 = tf.Variable(tf.zeros([300]))
    b2 = tf.Variable(tf.zeros([300]))
    b3 = tf.Variable(tf.zeros([300]))
    b4 = tf.Variable(tf.zeros([10]))

    l2 = tf.matmul(x, w1) + b1
    l3 = tf.matmul(l2, w2) + b2
    l4 = tf.matmul(l3, w3) + b3
    pred = tf.nn.softmax(tf.matmul(l4, w4) + b4)
    
    prediction = tf.argmax(pred, 1)
    preds = np.zeros(len(LSMR))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        j = 0
        for _, row in LSMR.iterrows():
            v = vectorize(row["Review"], row["Language"])
            preds[j] = prediction.eval({x: np.atleast_2d(v),
                                        w1:W1, w2:W2, w3:W3, w4:W4})
            j+=1
    
    
    return preds, np.array(list(LSMR.Score))

In [20]:
def distance_accuracy(y_true, y_predict):
    res = 0
    for i in range(len(y_true)):
        res += abs(y_true[i]-y_predict[i])
    return 1-res/(len(y_true)*len(set(y_true)))

In [21]:
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10)
X = df["Review"]
y = df["Language"]

In [25]:
NUM_TRIALS = 1
scores = dict()
learning_curves = dict()
for i in range(NUM_TRIALS):
    print("Trial:\t{}".format(i+1))
    scores[i] = dict()
    learning_curves[i] = dict()
    k = 0
    skf = StratifiedKFold(n_splits=10, random_state=i)
    for train_index, test_index in skf.split(df["Review"], df["Language"]):
        start = time.time()
        W1, W2, W3, W4, learning_curve = fit(df.loc[train_index], epochs=2000)
        preds, true = predict(df.loc[test_index], W1, W2, W3, W4)
        
        s = distance_accuracy(true, preds)
        scores[i][k] = s
        learning_curves[i][k] = learning_curve
        k += 1
        print("K:\t{}\nScore:\t{}".format(k, s))
        print("took:", time.time()-start)
        scores[i][k] = s
        learning_curves[i][k] = learning_curve
    print("*"*10)
    try:
        print("Trial {} avg score:\t {}".format(i+1, np.mean(list(scores[i].values()))))
    except:
        continue
    print("-"*30)

Trial:	1




Epoch 0: 0.007159850810486593
K:	1
Score:	0.5177777777777778
took: 4.208765029907227


Process ForkPoolWorker-35:
  File "/usr/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
Process ForkPoolWorker-38:
Process ForkPoolWorker-37:
Process ForkPoolWorker-40:
Process ForkPoolWorker-34:
Process ForkPoolWorker-39:
Process ForkPoolWorker-33:
Process ForkPoolWorker-36:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multipr

KeyboardInterrupt: 

  File "/usr/local/lib/python3.5/dist-packages/joblib/pool.py", line 360, in get
    racquire()
KeyboardInterrupt


In [None]:
pickle.dump([scores, learning_curves], open("incremental_tf.results", "wb"))

In [None]:
stats = pd.DataFrame(scores)
stats

In [None]:
stats.describe()