In [1]:
import tensorflow as tf
import pickle, pandas as pd, re, numpy as np, ast, warnings

from joblib import Parallel, delayed

import time

from collections import defaultdict, OrderedDict
from itertools import chain, starmap
from itertools import product
import unicodedata
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

from TurkishStemmer import TurkishStemmer
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from textblob import TextBlob

  from ._conv import register_converters as _register_converters


In [2]:
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
    
reset_graph()

In [3]:
df_full = pd.read_csv("../datasets/movie_data.csv")
df_full.head()

Unnamed: 0,Language,Movie_ID,Review,Score
0,en,-800777728,i love science fiction and i hate superheroes ...,9
1,en,-800777728,the movie is absolutely incredible all the per...,10
2,en,-1018312192,in a cinematic era dominated by reboots and mi...,8
3,en,-1018312192,movie review on rise of the planet of the apes...,4
4,en,-1018312192,during experiments to find a cure for alzheime...,7


In [4]:
df_full.groupby("Score").count()

Unnamed: 0_level_0,Language,Movie_ID,Review
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,29,29,29
2,21,21,21
3,14,14,14
4,23,23,23
5,83,83,83
6,43,43,43
7,71,71,71
8,207,207,207
9,175,175,175
10,334,334,334


In [5]:
en_vects = gensim.models.KeyedVectors.load_word2vec_format(r"../../NLP_data/GoogleNews-vectors-negative300.bin", binary=True)

In [6]:
tr_vects = gensim.models.KeyedVectors.load_word2vec_format(r"../../NLP_data/wiki.tr/wiki.tr.vec", binary=False)

In [7]:
turkish_stemmer = TurkishStemmer()
def clean(text, language="en", stem=True):
    global turkish_stemmer
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').lower().decode("ascii")
    
    if language == "tr":
        if stem:
            text= ' '.join([turkish_stemmer.stem(w) for w in text.split()])
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r'[0-9]', '#', text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " ", text)
    text = re.sub(r"e(\s)?-(\s)?mail", "email", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    return TextBlob(text)


In [8]:
VECTOR_SIZE = 300
def vectorize(text, language):
    global VECTOR_SIZE            
    blob = clean(text, language)
    vector = np.zeros(VECTOR_SIZE)
    if len(blob.words) < 1:
        return None

    for word in blob.words:
        try:
            if language == "en":
                vector += globals()["en_vects"][word]
            else:
                vector += globals()["tr_vects"][word]
        except KeyError as e:
#             warnings.warn(str(e))
            continue
    vector /= max(len(blob.words),1)
    return vector

In [9]:
def getvec(x):
    lang, rev = x.split(":::::")
    return vectorize(rev, lang)

In [10]:
# LMSR
def preprocess_data(df, language_column="Language", review_column="Review"):
    LMSR_df = df.copy()
    LMSR_df["lang_rev"] = LMSR_df[[language_column, review_column]].apply(lambda x: x[0]+":::::"+x[1], axis=1)
    LMSR_df["rev_vec"] = LMSR_df["lang_rev"].apply(lambda x:getvec(x))
    LMSR_df.drop(["lang_rev", "Review"], axis=1, inplace=True)
    return LMSR_df

In [11]:
def distance_accuracy(y_true, y_predict):
    res = 0
    for i in range(len(y_true)):
        res += abs(y_true[i]-y_predict[i])
    return 1-res/(len(y_true)*len(set(y_true)))

In [12]:
def get_XYy(LMSR):
    X = np.zeros((len(LMSR), VECTOR_SIZE))
    Y = np.zeros((len(LMSR), VECTOR_SIZE))
    y = np.zeros((len(LMSR)))
    i = 0
    for rev in LMSR.iterrows():
        score = rev[1][2]
        rev_vec = rev[1][3]
        score_vec = rev[1][4]

        X[i] = rev_vec
        Y[i] = score_vec
        y[i] = score

        i += 1
    return X, Y, y

In [13]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

In [14]:
def sigmoid(x, derive=False):
    if derive:
        return x*(1-x)
    return 1/(1+np.exp(-x))

In [15]:
def get_data_dict(df, get_L2and3=False):
    data_dict = dict() #{language:{score: {movie_id: [rev1, rev2, ..., revn]}}}
    L1 = dict()  # {(languge, score, movie_id): list of reviews with the same score with the same language}
    L2 = dict()  # {(language, score): None}
    L3 = dict()  # {score: None}
    for _, row in df.iterrows():
        lang = row["Language"]
        movie_id = row["Movie_ID"]
        score = row["Score"]
        review = row["rev_vec"]

        data_dict.setdefault(lang, {})
        data_dict[lang].setdefault(score, {})
        data_dict[lang][score].setdefault(movie_id, [])
        data_dict[lang][score][movie_id].append(review)
        
        L1.setdefault((lang, score, movie_id), list())
        L1[(lang, score, movie_id)].append(review)
        if get_L2and3:    
            L2[(lang, score)] = None
            L3[score] = None
    if get_L2and3:
        return data_dict, L1, L2, L3
    return data_dict, L1

In [16]:
def get_L2(LSM_R, data_dict):
    L2 = dict()  # {(language, score): list of movies vectors}
    for language in data_dict:
        for score in data_dict[language]:
            for movie_id in data_dict[language][score]:
                L2.setdefault((language, score), list())
                L2[(language, score)].append(LSM_R[(language, score, movie_id)])
    return L2

In [17]:
def get_L3(LS_MR, data_dict):
    L3 = dict()  # {score: vector of merged languages for that score}
    for language in data_dict:
        for score in data_dict[language]:
            L3.setdefault(score, list())
            L3[score].append(LS_MR[(language, score)])
    return L3

In [18]:
def merge(L, W):
    merged = dict()  # {item: vector of merged subitems}
    for i, item in enumerate(sorted(L)):
        for subitem in L[item]:
            merged.setdefault(item, [np.zeros(VECTOR_SIZE),0])
            merged[item][0] += sigmoid(subitem.dot(W[i]))
            merged[item][1] += 1
    for item in merged:
        merged[item] = merged[item][0]/ merged[item][1]
    return merged

In [19]:
def update_weights(L, delta, W, alpha=0.1):
    for i, k in enumerate(sorted(L)):
        for l in L[k]:
            W[i] += l.T.dot(delta[i]) *alpha
    return W

In [20]:
def get_layer_error(delta, W):
    error = 0
    for i in range(len(delta)):
        error += delta[i].dot(W[i].T)
    return error/len(delta)

In [21]:
def get_layer_delta(error, layer, size):
    delta = np.zeros((size, VECTOR_SIZE))
    j = 0
    for i,k in enumerate(sorted(layer)):
        for l in layer[k]:
            delta[j] = error[i]*sigmoid(l, True)
            j += 1
    return delta

## Training

## Full-batch

In [24]:
def get_score_vects(df, iterations=100, alpha=0.1, random_state=42, W1=None, W2=None, W3=None, W4=None):
    LSMR = preprocess_data(df)
    data_dict, L1 = get_data_dict(LSMR)
    y = softmax(list(LSMR.Score))
#     np.random.seed(random_state)
    learning_curve = dict()
    for i in range(iterations+1):
        # forward propagation
        if W1 is None:
            W1 = 2*np.random.random((len(L1), 300, 300))-1

        LSM_R = merge(L1, W1)
        L2 = get_L2(LSM_R, data_dict)
        if W2 is None:
            W2 = 2*np.random.random((len(L2), 300, 300))-1

        LS_MR = merge(L2, W2)
        L3 = get_L3(LS_MR, data_dict)
        if W3 is None:
            W3 = 2*np.random.random((len(L3), 300, 300))-1

        score_vectors_dict = merge(L3, W3)
        l4 = sigmoid(np.array([v for k, v in sorted(score_vectors_dict.items())]))
        if W4 is None:
            W4 = 2*np.random.random((300, len(LSMR)))-1
        
        l5 = softmax(l4.dot(W4))  # predicted scores
        
        # Calculate the error
        l5_error = np.mean(np.dot(np.log(l5), y))
        
        # Back propagation
        l5_delta = l5_error * sigmoid(l5, True)
        W4 += l4.T.dot(l5_delta)*alpha
        
        l4_error = l5_delta.dot(W4.T)
        l4_delta = l4_error * sigmoid(l4, True)
        
        W3 = update_weights(L3, l4_delta, W3, alpha)
        
        l3_error = get_layer_error(l4_delta, W3)
        l3_delta = get_layer_delta(l3_error, L3, len(L2))
        
        W2 = update_weights(L2, l3_delta, W2, alpha)
        
        l2_error = get_layer_error(l3_delta, W2)
        l2_delta = get_layer_delta(l2_error, L2, len(LSMR))
        
        W1 = update_weights(L1, l2_delta, W1, alpha)
        learning_curve[i] = l5_error
        if i%10 == 0:
            print("epoch {}:\t{}".format(i, np.abs(l5_error)))
        if i%100 == 0:
            alpha *= 0.9
    return LSMR, score_vectors_dict, learning_curve

In [25]:
def fit(LSMR, score_vect_dicts,random_state=42, regressor=MLPRegressor(), classifier=MLPClassifier()):
    LSMR["score_vec"] = LSMR["Score"].apply(lambda x: score_vect_dicts[x] if x in score_vect_dicts else np.NaN)
    LSMR.dropna(inplace=True)
    
    X, Y, y = get_XYy(LSMR)
    
    regressor.random_state = random_state
    classifier.random_state = random_state
        
    regressor.fit(X, Y)
    classifier.fit(Y, y)
    return regressor, classifier

In [26]:
def predict(LSMR, score_vect_dicts, regressor, classifier):
    LSMR["score_vec"] = LSMR["Score"].apply(lambda x: score_vect_dicts[x] if x in score_vect_dicts else np.NaN)
    LSMR.dropna(inplace=True)
    
    X, Y, y = get_XYy(LSMR)
    
    preds_score_vecs = regressor.predict(X)
    pred_scores = classifier.predict(preds_score_vecs)
    
    return pred_scores, y

In [27]:
tronly_test_raw = df_full[-100:]
tronly_test = preprocess_data(tronly_test_raw)
df = df_full[:-100]
tronly_test[tronly_test.Language=="en"].count()

Language    0
Movie_ID    0
Score       0
rev_vec     0
dtype: int64

In [28]:
def eval_models(model, train, test, tronly, ytrain, ytest, ytronly):
    _ = time.time()
    model.fit(train, ytrain)
    predtra = time.time()-_
    
    _ = time.time()
    s_train = distance_accuracy(ytrain, model.predict(train))
    trat = time.time()-_
    _ = time.time()
    s_test = distance_accuracy(ytest, model.predict(test))
    tet = time.time()-_
    _ = time.time()
    s_tr = distance_accuracy(ytronly, model.predict(tronly))
    trt = time.time()-_
    evals = OrderedDict()
    evals["Train"] = s_train
    evals["Test"] = s_test
    evals["Tr. Only"] = s_tr
    evals["Training Time"] = trat
    evals["Pred.Tra. Time"] = predtra
    evals["Testing Time"] = tet
    evals["Tr.Test Time"] = trt
    return evals

In [29]:
def get_total_average(scores_tables):
#     scores_tables: {i_th trial:
#                     {k_th fold:
#                         {'Model': {'Test': 0.8090301003344482,
#                                    'Train': 0.783361064891847,
#                                    'Turkish only': 0.7414285714285714}}}
    avgs = dict()
    for trial in scores_tables:
        for table in scores_tables[trial]:
            for model in scores_tables[trial][table]:
                avgs.setdefault(model, dict())
                for metric, score in scores_tables[trial][table][model].items():
                    avgs[model].setdefault(metric, list())
                    avgs[model][metric].append(score)
    for model in avgs:
        for metric in avgs[model]:
            avgs[model][metric] = np.mean(avgs[model][metric])
    return pd.DataFrame(avgs)

In [30]:
def get_trial_score(trial_scores_tables):
#  trial_scores_tables: {k_th fold:
#                             {'Model': {'Test': 0.8090301003344482,
#                                        'Train': 0.783361064891847,
#                                        'Turkish only': 0.7414285714285714}}}
    avgs = dict()
    for table in trial_scores_tables:
        for model in trial_scores_tables[table]:
            avgs.setdefault(model, dict())
            for metric, score in trial_scores_tables[table][model].items():
                avgs[model].setdefault(metric, list())
                avgs[model][metric].append(score)
    for model in avgs:
        for metric in avgs[model]:
            avgs[model][metric] = np.mean(avgs[model][metric])
    return pd.DataFrame(avgs)

In [33]:
NUM_TRIALS = 1
learning_curves = dict()
scores_tables = OrderedDict()
for i in range(NUM_TRIALS):
    print("Trial:\t{}".format(i+1))
    scores_tables[i] = OrderedDict()
    learning_curves[i] = OrderedDict()
    k = 0
    skf = StratifiedKFold(n_splits=10, random_state=i)
    for train_index, test_index in skf.split(df["Review"], df["Language"]):
        print("K:\t{}".format(k+1))
        scores_tables[i][k] = OrderedDict()
        start = time.time()
        LSMR, score_vect_dicts, training_curve = get_score_vects(
            df.loc[train_index], random_state=i, alpha=1e-5, iterations=50)
        regressor, classifier = fit(LSMR, score_vect_dicts, random_state=i)
        trat = time.time()- start
        
        test_data = preprocess_data(df.loc[test_index])
        _ = time.time()
        preds, true = predict(test_data, score_vect_dicts, regressor, classifier)
        tet = time.time()-_
        
        _ = time.time()
        preds_train, true_train = predict(preprocess_data(df.loc[train_index]),
                                          score_vect_dicts,
                                          regressor, classifier)
        predtra = time.time()-_
        
        _ = time.time()
        preds_tr, true_tr = predict(tronly_test, score_vect_dicts, regressor, classifier)
        trt = time.time()-_
        
        elapsed = time.time()-start
        
        s = distance_accuracy(true, preds)
        s_train = distance_accuracy(true_train, preds_train)
        s_tr = distance_accuracy(true_tr, preds_tr)
        
        
        lr = LogisticRegression(random_state=i)
        mlp = MLPClassifier(random_state=i)
        rf = RandomForestClassifier(random_state=i,n_jobs=-1)
        train_mat = np.array(list(LSMR["rev_vec"]))
        test_mat = np.array(list(test_data["rev_vec"]))
        tronly_mat = np.array(list(tronly_test["rev_vec"]))
        
        evals = OrderedDict()
        evals["Train"] = s_train
        evals["Test"] = s
        evals["Tr. Only"] = s_tr
        evals["Training Time"] = trat
        evals["Pred.Tra. Time"] = predtra
        evals["Testing Time"] = tet
        evals["Tr.Test Time"] = trt
        scores_tables[i][k]["DeepSelect"] = evals
        scores_tables[i][k]["MLP"] = eval_models(
            mlp, train_mat, test_mat, tronly_mat, true_train, true, true_tr)
        scores_tables[i][k]["Logistic Regression"] = eval_models(
            lr, train_mat, test_mat, tronly_mat, true_train, true, true_tr)
        scores_tables[i][k]["RandomForest"] = eval_models(
            rf, train_mat, test_mat, tronly_mat, true_train, true, true_tr)
        
        print()
        print("K:\t{}".format(k+1))
        print(pd.DataFrame(scores_tables[i][k]))
        print("\nThis fold took:", elapsed, "seconds\n")
        learning_curves[i][k] = training_curve
        k += 1
        print("*"*10+"\n")
    print("Average scores for trial {}".format(i))
    print(get_trial_score(scores_tables[i]))
    print("-"*30)
print("%%"*20)
print("Average of {} trials".format(NUM_TRIALS))
print(get_total_average(scores_tables))

Trial:	1
K:	1
epoch 0:	24.45730175287146
epoch 10:	23.96708852815955
epoch 20:	23.713081453867304
epoch 30:	23.527294032263164
epoch 40:	23.380831185653854
epoch 50:	23.235479365133063

K:	1
                DeepSelect       MLP  Logistic Regression  RandomForest
Pred.Tra. Time    1.627098  0.294413             0.150325      0.107511
Test              0.807407  0.811111             0.802469      0.798765
Testing Time      0.008493  0.000366             0.000142      0.105365
Tr. Only          0.784286  0.798571             0.781429      0.714286
Tr.Test Time      0.013459  0.000374             0.000141      0.105668
Train             0.809753  0.823333             0.823086      0.999259
Training Time    12.665944  0.002030             0.001024      0.104978

This fold took: 14.508926153182983 seconds

**********

K:	2
epoch 0:	22.457844475004883
epoch 10:	22.26013491539242
epoch 20:	22.148090851114496
epoch 30:	21.963549969361317
epoch 40:	21.784430638633225
epoch 50:	21.611525932121037

In [35]:
pickle.dump(scores_tables, open("batch_no_tf_tables.results", "wb"))

## Using the full network for prediction
### P.S. this variation supports online (incremental) training

In [36]:
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [37]:
def get_test(LSMR):
    X = dict()
    y = dict()
    for _, row in LSMR.iterrows():
        score = row["Score"]
        y_ = np.zeros(10)
        y_[score-1] = 1
        y[len(y)] = y_
        X[len(X)] = row["rev_vec"]
    return np.array(list(X.values())), np.array(list(y.values()))

In [38]:
def train_selective(df_train,epochs=100, learning_rate = 0.1, random_state=42, p_every=10):
    LSMR_train = preprocess_data(df_train)
    np.random.seed(random_state)
    data_dict, L1, L2, L3 = get_data_dict(LSMR_train, get_L2and3=True)
    init_weights = lambda layer, i, o: {k:2*np.random.random((i, o))-1 for k in layer}
    W1 = init_weights(L1, 300, 300)  # (languge, score, movie_id)
    W2 = init_weights(L2, 300, 300)  # (languge, score):
    W3 = init_weights(L3, 300, 10)  # score:
    
    
    reset_graph()
    x = tf.placeholder(tf.float32, [None, 300])
    y = tf.placeholder(tf.float32, [None, 10]) # 1-10 => 10 classes

    w1 = tf.placeholder(tf.float32, [300, 300])
    w2 = tf.placeholder(tf.float32, [300, 300])
    w3 = tf.placeholder(tf.float32, [300, 10])

    b1 = tf.Variable(tf.zeros([300]))
    b2 = tf.Variable(tf.zeros([300]))
    b3 = tf.Variable(tf.zeros([10]))

    l2 = tf.nn.sigmoid(tf.matmul(x, w1) + b1)
    l3 = tf.nn.sigmoid(tf.matmul(l2, w2) + b2)
    pred = tf.nn.softmax(tf.matmul(l3, w3) + b3)


    cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    training_curve = dict()
    with tf.device('/job:localhost/replica:0/task:0/device:GPU:0'):
        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
            sess.run(tf.global_variables_initializer())
            for e in range(epochs+1):
                avg_cost = 0.
                for _, row in LSMR_train.iterrows():
                    lang = row["Language"]
                    movie_id = row["Movie_ID"]
                    score = row["Score"]
                    y_ = np.zeros(10)
                    y_[score-1] = 1
                    y_ = np.atleast_2d(y_)
                    x_ = np.atleast_2d(row["rev_vec"])
                    w1_,w2_,w3_,_, c = sess.run([w1, w2, w3, optimizer, cost],
                                             feed_dict={x: x_,
                                                        y: y_,
                                                        w1:W1[(lang, score, movie_id)],
                                                        w2:W2[(lang, score)],
                                                        w3:W3[score]})
                    W1[(lang, score, movie_id)] = w1_
                    W2[(lang, score)] = w2_
                    W3[score] = w3_

                    avg_cost += c
                training_curve[e] = avg_cost
                if e%p_every==0:
                    print("Epoch {}: {}".format(e, avg_cost/len(LSMR_train)))

            return W1, W2, W3, training_curve

In [39]:
def get_max_index(array):
    indx = None
    max_ = float("-inf")
    for i, e in enumerate(array):
        if e > max_:
            max_ = e
            indx = i
    return indx, max_

In [40]:
def predict_selective(df, W1, W2, W3):
    LSMR = preprocess_data(df)
    reset_graph()
    x = tf.placeholder(tf.float32, [None, 300])

    w1 = tf.placeholder(tf.float32, [300, 300])
    w2 = tf.placeholder(tf.float32, [300, 300])
    w3 = tf.placeholder(tf.float32, [300, 10])

    b1 = tf.Variable(tf.zeros([300]))
    b2 = tf.Variable(tf.zeros([300]))
    b3 = tf.Variable(tf.zeros([10]))

    l2 = tf.nn.sigmoid(tf.matmul(x, w1) + b1)
    l3 = tf.nn.sigmoid(tf.matmul(l2, w2) + b2)
    pred = tf.nn.softmax(tf.matmul(l3, w3) + b3)

    
    
    
    prediction = tf.argmax(pred, 1)
    preds = np.zeros(len(LSMR))
    with tf.device('/job:localhost/replica:0/task:0/device:GPU:0'):
        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
            sess.run(tf.global_variables_initializer())
            j = 0
            for _, row in LSMR.iterrows():
                v = row["rev_vec"]
                predicted_scores = np.zeros(len(W1))
                for i, info in enumerate(W1):
                    language, score, movie_id = info
                    w_1 = W1[(language, score, movie_id)]
                    w_2 = W2[(language, score)]
                    w_3 = W3[score]

                    predicted_scores[i] = prediction.eval({x: np.atleast_2d(v),
                                                           w1:w_1,w2:w_2,w3:w_3})

                max_index, probability = get_max_index(softmax(predicted_scores))
                predicted_score = predicted_scores[max_index]

                preds[j] = predicted_score
                j+=1


    return preds, np.array(list(LSMR.Score))

In [41]:
NUM_TRIALS = 1
learning_curves = OrderedDict()
scores_tables_nn = OrderedDict()
for i in range(NUM_TRIALS):
    print("Trial:\t{}".format(i+1))
    learning_curves[i] = OrderedDict()
    k = 0
    skf = StratifiedKFold(n_splits=10, random_state=i)
    scores_tables_nn[i] = dict()
    for train_index, test_index in skf.split(df["Review"], df["Language"]):
        print("K: \t{}".format(k+1))
        scores_tables_nn[i][k] = OrderedDict()
        start = time.time()
        # approx 3 epochs per second
        LSMR = preprocess_data(df.loc[train_index])
        W1, W2, W3, training_curve = train_selective(df.loc[train_index], epochs=150, p_every=25)
        _ = time.time()
        trat = _-start
        print("Took: {} for training".format(trat))
        
        _ = time.time()
        preds_train, true_train = predict_selective(df.loc[train_index], W1, W2, W3)
        predtra = time.time()-_
        print("Took: {} for predicting {} training instances".format(predtra, len(train_index)))
        
        test_data = preprocess_data(df.loc[test_index])
        _ = time.time()
        preds, true = predict_selective(df.loc[test_index], W1, W2, W3)
        tet = time.time()-_
        print("Took: {} for predicting {} test instances".format(tet, len(test_index)))
        
        _ = time.time()
        preds_tr, true_tr = predict_selective(tronly_test_raw, W1, W2, W3)
        trt = time.time()-_
        print("Took: {} for predicting {} Turkish test instances".format(trt, len(tronly_test)))
        
        elapsed = time.time()-start

        s = distance_accuracy(true, preds)
        s_train = distance_accuracy(true_train, preds_train)
        s_tr = distance_accuracy(true_tr, preds_tr)
        mlp = MLPClassifier(random_state=i)
        lr = LogisticRegression(random_state=i)
        rf = RandomForestClassifier(random_state=i,n_jobs=-1)
        train_mat = np.array(list(LSMR["rev_vec"]))
        test_mat = np.array(list(test_data["rev_vec"]))
        tronly_mat = np.array(list(tronly_test["rev_vec"]))
        
        evals = OrderedDict()
        evals["Train"] = s_train
        evals["Test"] = s
        evals["Tr. Only"] = s_tr
        evals["Training Time"] = trat
        evals["Pred.Tra. Time"] = predtra
        evals["Testing Time"] = tet
        evals["Tr.Test Time"] = trt
        scores_tables_nn[i][k]["DeepSelect"] = evals
        
        scores_tables_nn[i][k]["LogisticRegression"] = eval_models(lr, train_mat, test_mat, tronly_mat, true_train, true, true_tr)
        scores_tables_nn[i][k]["MLP"] = eval_models(mlp, train_mat, test_mat, tronly_mat, true_train, true, true_tr)
        scores_tables_nn[i][k]["RandomForest"] = eval_models(rf, train_mat, test_mat, tronly_mat, true_train, true, true_tr)
        
        print()
        print(pd.DataFrame(scores_tables_nn[i][k]))
        print("took:", elapsed, "seconds\n")
        learning_curves[i][k] = training_curve
        k += 1
        print("*"*10+"\n")
    print("Average scores for trial {}".format(i))
    print(get_trial_score(scores_tables_nn[i]))
    print("-"*30)
print("%%"*20)
print("Average of {} trials".format(NUM_TRIALS))
print(get_total_average(scores_tables_nn))

Trial:	1
K: 	1
Epoch 0: 2.4471188724549733
Epoch 25: 0.006596818068616914
Epoch 50: 0.0031276120019916014
Epoch 75: 0.0021126713735146138
Epoch 100: 0.0016174126017723211
Epoch 125: 0.0013207504388533715
Epoch 150: 0.0011218209131420556
Took: 110.28929853439331 for training
Took: 284.2523398399353 for predicting 599 training instances
Took: 124.47869563102722 for predicting 301 test instances
Took: 44.85774111747742 for predicting 100 Turkish test instances





K:	1
                DeepSelect  LogisticRegression       MLP  RandomForest
Pred.Tra. Time  284.252340            0.108570  1.225991      0.115163
Test              0.816279            0.792691  0.791362      0.787708
Testing Time    124.478696            0.000349  0.000822      0.105977
Tr. Only          0.787143            0.788571  0.767143      0.762857
Tr.Test Time     44.857741            0.000164  0.000372      0.105610
Train             0.829716            0.831052  0.881970      0.999499
Training Time   110.289299            0.000960  0.001627      0.107306
took: 564.4611206054688 seconds

**********

K: 	2
Epoch 0: 1.9702802376131876
Epoch 25: 0.003982249622973389
Epoch 50: 0.0020096824128224473
Epoch 75: 0.0013711657275957616
Epoch 100: 0.0010502235254951605
Epoch 125: 0.0008555901006197549
Epoch 150: 0.0007243417852619416
Took: 112.15304160118103 for training
Took: 302.92478942871094 for predicting 600 training instances
Took: 150.4370892047882 for predicting 300 test inst




K:	2
                DeepSelect  LogisticRegression       MLP  RandomForest
Pred.Tra. Time  302.924789            0.112297  0.930947      0.110616
Test              0.818667            0.806333  0.804333      0.797333
Testing Time    150.437089            0.000332  0.000830      0.106021
Tr. Only          0.787143            0.774286  0.768571      0.755714
Tr.Test Time     51.687696            0.000129  0.000377      0.105570
Train             0.828500            0.823667  0.889833      0.997167
Training Time   112.153042            0.000661  0.001640      0.103784
took: 617.757826089859 seconds

**********

K: 	3
Epoch 0: 1.7132777045612988
Epoch 25: 0.003901430707642604
Epoch 50: 0.001965758791003719
Epoch 75: 0.001346215491070591
Epoch 100: 0.0010346692861813319
Epoch 125: 0.0008454406521583941
Epoch 150: 0.0007176313742450183
Took: 119.78052258491516 for training
Took: 306.9044768810272 for predicting 601 training instances
Took: 157.57751441001892 for predicting 299 test instanc

In [43]:
pickle.dump(scores_tables_nn, open("incremental_tf_tables.results", "wb"))

In [None]:
# HOW TO CHEAT LIKE A PRO
# """
# def test_selective(df_test, W1, W2, W3):
#     reset_graph()
#     x = tf.placeholder(tf.float32, [None, 300])
#     y = tf.placeholder(tf.float32, [None, 10]) # 1-10 => 10 classes

#     w1 = tf.placeholder(tf.float32, [300, 300])
#     w2 = tf.placeholder(tf.float32, [300, 300])
#     w3 = tf.placeholder(tf.float32, [300, 10])

#     b1 = tf.Variable(tf.zeros([300]))
#     b2 = tf.Variable(tf.zeros([300]))
#     b3 = tf.Variable(tf.zeros([10]))

#     l2 = tf.nn.sigmoid(tf.matmul(x, w1) + b1)
#     l3 = tf.nn.sigmoid(tf.matmul(l2, w2) + b2)
#     pred = tf.nn.softmax(tf.matmul(l3, w3) + b3)
    
#     correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
#     instance_accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
#     with tf.Session() as sess:
#         sess.run(tf.global_variables_initializer())
    
#         # Testing the model
#         LSMR_test = preprocess_data(df_test)
#         X_test, y_test = get_test(LSMR_test)
#         accuracy = 0.
#         for i in range(len(X_test)):
#             best_instance_accuracy = float("-inf")
#             for language, score, movie_id in W1:
#                 w_1 = W1[(language, score, movie_id)]
#                 w_2 = W2[(language, score)]
#                 w_3 = W3[score]
#                 a = instance_accuracy.eval({x: np.atleast_2d(X_test[i]), y: np.atleast_2d(y_test[i]),
#                                    w1:w_1,
#                                    w2:w_2,
#                                    w3:w_3})
#                 if a > best_instance_accuracy:
#                     best_instance_accuracy = a
#             accuracy += best_instance_accuracy

#     return accuracy/len(X_test)
# """

# 3-layer NN > needs at least 3 days for training

In [None]:
# gpu is a must
def train_deep(df_train, epochs=100, learning_rate=0.1, random_state=42):
    LSMR_train = preprocess_data(df_train)
    np.random.seed(random_state)
    data_dict, L1, L2, L3 = get_data_dict(LSMR_train, get_L2and3=True)
    init_weights = lambda layer, i, o: {k:2*np.random.random((i, o))-1 for k in layer}
    W1 = init_weights(L1, 300, 300)  # (languge, score, movie_id)
    W2 = init_weights(L2, 300, 300)  # (languge, score):
    W3 = init_weights(L3, 300, 10)  # score:
    
    
    reset_graph()
    x = tf.placeholder(tf.float32, [None, 300])
    y = tf.placeholder(tf.float32, [None, 10]) # 1-10 => 10 classes

    w1 = tf.Variable(tf.zeros([300, 300]))
    w2 = tf.Variable(tf.zeros([300, 300]))
    w3 = tf.Variable(tf.zeros([300, 10]))

    b1 = tf.Variable(tf.zeros([300]))
    b2 = tf.Variable(tf.zeros([300]))
    b3 = tf.Variable(tf.zeros([10]))

    l2 = tf.nn.sigmoid(tf.matmul(x, w1) + b1)
    l3 = tf.nn.sigmoid(tf.matmul(l2, w2) + b2)
    pred = tf.nn.softmax(tf.matmul(l3, w3) + b3)


    cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    training_curve = dict()
    with tf.device('/job:localhost/replica:0/task:0/device:GPU:0'):
        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
            sess.run(tf.global_variables_initializer())
            for e in range(epochs+1):
                start = time.time()
                avg_cost = 0.
                for _, row in LSMR_train.iterrows():
                    score = row["Score"]
                    y_ = np.zeros(10)
                    y_[score-1] = 1
                    y_ = np.atleast_2d(y_)
                    x_ = np.atleast_2d(row["rev_vec"])
                    w_1, w_2, w_3 , _, c = sess.run([w1, w2, w3, optimizer, cost], feed_dict={x: x_,y: y_})               
                    avg_cost += c
                avg_cost /= len(LSMR_train)
                training_curve[e] = (avg_cost, time.time()-start)
                if e%10==0:
                    print("Epoch {}: {}".format(e, avg_cost))

    return w_1, w_2, w_3, training_curve

In [None]:
def test_deep(df_test, w_1, w_2, w_3):
    reset_graph()
    x = tf.placeholder(tf.float32, [None, 300])
    y = tf.placeholder(tf.float32, [None, 10]) # 1-10 => 10 classes

    w1 = tf.placeholder(tf.float32, [300, 300])
    w2 = tf.placeholder(tf.float32, [300, 300])
    w3 = tf.placeholder(tf.float32, [300, 10])

    b1 = tf.Variable(tf.zeros([300]))
    b2 = tf.Variable(tf.zeros([300]))
    b3 = tf.Variable(tf.zeros([10]))

    l2 = tf.nn.sigmoid(tf.matmul(x, w1) + b1)
    l3 = tf.nn.sigmoid(tf.matmul(l2, w2) + b2)
    pred = tf.nn.softmax(tf.matmul(l3, w3) + b3)
    
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    with tf.device('/job:localhost/replica:0/task:0/device:GPU:0'):
        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
            sess.run(tf.global_variables_initializer())

            # Testing the model
            LSMR_test = preprocess_data(df_test)
            X_test, y_test = get_test(LSMR_test)
            return accuracy.eval({x: X_test,
                                  y: y_test,
                                  w1:w_1,w2:w_2,
                                  w3:w_3})

In [None]:
NUM_TRIALS = 1
scores_incremental = dict()
learning_curves = dict()
for i in range(NUM_TRIALS):
    scores_incremental[i] = dict()
    learning_curves[i] = dict()
    print("Trial:\t{}".format(i+1))
    k = 0
    skf = StratifiedKFold(n_splits=10, random_state=i)
    for train_index, test_index in skf.split(df["Review"], df["Language"]):
        start = time.time()
        w1, w2, w3, learning_curve = train_deep(df.loc[train_index], random_state=i, epochs=10000)
        s = test_deep(df.loc[test_index], w1, w2, w3)
        k += 1
        print("K:\t{}\nScore:\t{}".format(k, s))
        print("took:", time.time()-start)
        scores_incremental[i][k] = s
        learning_curves[i][k] = learning_curve
    print("*"*10)
    try:
        print("Trial {} avg score:\t {}".format(i+1, np.mean(list(scores_incremental[i].values()))))
    except:
        continue
    print("-"*30)