In [1]:
import pickle, pandas as pd, re, numpy as np, ast

from sklearn.model_selection import train_test_split

In [2]:
score_vect_dicts = pickle.load(open("score_vectors_dict", "rb"))
df = pd.read_csv("datasets/LMSR_rev2vec.csv")
df.head()

Unnamed: 0,Language,Movie_ID,Score,rev_vec
0,en,-800777728,9,[ 5.10346368e-02 3.13695297e-02 3.0434969...
1,en,-800777728,10,[ 5.85338362e-02 3.14969495e-02 9.2768417...
2,en,-1018312192,8,[ 5.58815002e-02 3.47756743e-02 -4.6352325...
3,en,-1018312192,4,[ 0.05872388 0.03473265 -0.01488676 0.101489...
4,en,-1018312192,7,[ 3.05793583e-02 3.45815420e-02 -6.1754882...


In [13]:
df["score_vec"] = df["Score"].apply(lambda x: score_vect_dicts[x] if x in score_vect_dicts else np.NaN)
df.dropna(inplace=True)
df.head()

Unnamed: 0,Language,Movie_ID,Score,rev_vec,score_vec
0,en,-800777728,9,[ 5.10346368e-02 3.13695297e-02 3.0434969...,"[-0.279659865294, -0.0989524373099, 0.03580531..."
1,en,-800777728,10,[ 5.85338362e-02 3.14969495e-02 9.2768417...,"[-0.337921482265, -0.0845029055412, 0.01985886..."
2,en,-1018312192,8,[ 5.58815002e-02 3.47756743e-02 -4.6352325...,"[-0.392837140579, -0.13993286079, 0.0117605265..."
3,en,-1018312192,4,[ 0.05872388 0.03473265 -0.01488676 0.101489...,"[-0.619363193141, -0.294599547156, 0.014450649..."
4,en,-1018312192,7,[ 3.05793583e-02 3.45815420e-02 -6.1754882...,"[-0.418672884105, -0.170094622466, 0.089099188..."


In [12]:
def parse_np_array(array_string, as_nparray=True):
    pattern = r'''# Match (mandatory) whitespace between...
              (?<=\]) # ] and
              \s+
              (?= \[) # [, or
              |
              (?<=[^\[\]\s]) 
              \s+
              (?= [^\[\]\s]) # two non-bracket non-whitespace characters
           '''
    fixed_string = re.sub(pattern, ',', array_string, flags=re.VERBOSE)
    if as_nparray:
        return np.array(ast.literal_eval(fixed_string))
    return ast.literal_eval(fixed_string)

In [233]:
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, KFold
from sklearn.pipeline import Pipeline

from collections import defaultdict, OrderedDict
from itertools import chain, starmap
from itertools import product

In [191]:
X = np.zeros((len(df), 300))
Y = np.zeros((len(df), 300))
y = np.zeros((len(df)))
i = 0
for rev in df.iterrows():
    score = rev[1][2]
    rev_vec = parse_np_array(rev[1][3])
    score_vec = rev[1][4]
    
    X[i] = rev_vec
    Y[i] = score_vec
    y[i] = score
    
    i += 1

In [192]:
def build_pipeline(p):
    pipeline, pipeline_params = [], OrderedDict()
    
    for model, model_params in p:
        try:
            name = model.__name__
            pipeline.append((name, model()))
        except:
            name = model.estimator.__name__
            pipeline.append((name, model))
       
        pipeline_params.update({'{}__{}'.format(name, param_name) : values 
                                for param_name, values in model_params.items()})
    
    return Pipeline(pipeline), pipeline_params

In [197]:
regressors = [

    (MLPRegressor, {
        "hidden_layer_sizes":range(50, 300+1,50),
        "max_iter":range(100, 200+1, 25)
    }),
    (RandomForestRegressor, {
        "n_estimators": range(3, 10),
        "n_jobs":[-1],
    }),
    (LinearRegression, {
        "n_jobs":[-1]
    })
]
feature_selection = [
    (PCA, {
    'n_components': range(50, 150+1, 25)
  })
]

In [253]:
mlp_clf = MLPClassifier(random_state=42)
mlp_clf.fit(Y, y)
kf = KFold(n_splits=10)

In [254]:
# NUM_TRIALS = 10
# metrics = ['f1', 'recall', 'precision', 'accuracy']
# trials = []

# # for i in range(NUM_TRIALS):
# cv_pipelines = []
# inner_cv = KFold(n_splits=10, shuffle=True, random_state=42)
# outer_cv = KFold(n_splits=10, shuffle=True, random_state=42)

# for pipeline, params in map(build_pipeline, product(feature_selection, regressors)):
#     cv_pipeline = GridSearchCV(pipeline, params, cv=inner_cv, n_jobs=-1, verbose=1).fit(X, Y)
#     cv_pipelines.append(cv_pipeline)

# best_pipeline = cv_pipelines[np.argmax([i.best_score_ for i in cv_pipelines])]
# cv = cross_validate(best_pipeline.best_estimator_, 
#                     X=X, y=Y, cv=outer_cv, 
#                     scoring=metrics, 
#                     return_train_score=False)

# trials.append((best_pipeline, cv))
# print("{} trial done".format(i+1))
# print("-"*10)

In [255]:
def distance_accuracy(y_true, y_predict):
    res = 0
    for i in range(len(y_true)):
        res += abs(y_true[i]-y_predict[i])
    return 1-res/(len(y_true)*len(set(y_true)))

In [258]:
score_dict = {"distance_accuracy":0}
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    l_train, l_test = y[train_index], y[test_index]
    pca = PCA()
    pca.fit(X_train)
    pca.transform(X_train)
    mlp = MLPRegressor(random_state=42)
    mlp.fit(X_train, y_train)
    preds_score_vecs = lr.predict(X_test)
    pred_scores = knn.predict(preds_score_vecs)
    for metric in score_dict:
        score_dict[metric] += globals()[metric](l_test, pred_scores)
for metric in score_dict:
    score_dict[metric] /= 10.0

In [259]:
score_dict

{'distance_accuracy': 0.88336335496199436}

In [252]:
print(knn.predict(mlp.predict(np.atleast_2d(vp))))
print(knn.predict(mlp.predict(np.atleast_2d(vn))))

[ 8.]
[ 8.]


In [231]:
vn = np.array([ 0.024575  ,  0.0780872 ,  0.0218456 , -0.416978  ,  0.0418342 ,
       -0.26509828, -0.15126799, -0.1380056 ,  0.1373278 ,  0.213032  ,
       -0.021196  , -0.286086  , -0.062111  ,  0.158969  ,  0.1046596 ,
        0.0441594 , -0.1745858 ,  0.06308679, -0.0271644 ,  0.225724  ,
        0.07405599,  0.0317332 , -0.2219522 , -0.149866  ,  0.142036  ,
        0.00603   ,  0.2340944 ,  0.1475164 , -0.1298866 ,  0.10311399,
        0.27726799, -0.2231768 ,  0.1363482 , -0.0377694 ,  0.0211782 ,
       -0.0374892 ,  0.1913826 ,  0.55832402, -0.0291594 , -0.1916008 ,
       -0.137254  ,  0.2495355 , -0.159432  ,  0.0383394 ,  0.06307359,
       -0.06488302, -0.010813  ,  0.06447   , -0.43357179,  0.2884226 ,
       -0.006242  ,  0.31172919,  0.006465  , -0.202956  ,  0.13317999,
        0.234002  , -0.1704622 , -0.142286  ,  0.1297116 , -0.30379   ,
       -0.18723999, -0.0586302 ,  0.13263419,  0.15410801, -0.0545603 ,
        0.1699034 , -0.18423   , -0.082516  , -0.079438  ,  0.34552881,
       -0.2079218 ,  0.443968  ,  0.14478679, -0.1228022 ,  0.0741026 ,
        0.2452676 ,  0.2687516 , -0.4057148 , -0.0954918 ,  0.171634  ,
        0.2247296 , -0.116044  ,  0.59757199, -0.0844328 , -0.0348282 ,
        0.2154974 , -0.015546  , -0.2104446 ,  0.10083159,  0.0759356 ,
        0.29458   , -0.133753  ,  0.1162    ,  0.1378152 , -0.1707856 ,
       -0.085724  , -0.0289014 ,  0.338166  , -0.37039001, -0.0547254 ,
        0.0465862 , -0.0389332 , -0.1511948 , -0.2348892 ,  0.10468   ,
        0.1546    ,  0.0071447 , -0.36236481, -0.1022528 ,  0.22919001,
       -0.17852647,  0.1351988 , -0.27181799,  0.0673532 , -0.2173312 ,
       -0.3075472 ,  0.06701642,  0.267372  , -0.26892019,  0.101166  ,
        0.050804  ,  0.2443642 , -0.03797   ,  0.11411714,  0.0200308 ,
       -0.673707  ,  0.240564  , -0.032244  , -0.084962  ,  0.1343066 ,
        0.147628  , -0.1281278 , -0.03501   , -0.13466614, -0.091464  ,
       -0.2160354 , -0.34959999, -0.18217901, -0.02749   ,  0.147264  ,
       -0.1084934 , -0.00213201,  0.11428801,  0.1227576 , -0.02473298,
        0.113816  , -0.0390326 ,  0.54454201,  0.0396268 , -0.04223801,
        0.274098  ,  0.1014148 , -0.0485458 ,  0.1502744 , -0.048396  ,
        0.0828894 , -0.0649078 , -0.00495232, -0.0129226 ,  0.1339572 ,
       -0.1326338 , -0.0339584 ,  0.0275506 ,  0.085596  ,  0.394556  ,
       -0.00980564, -0.60479599, -0.075632  ,  0.1147774 , -0.3782834 ,
        0.028796  , -0.407134  , -0.045582  ,  0.226008  , -0.0245888 ,
        0.4154308 , -0.124188  ,  0.0456686 ,  0.272304  ,  0.06235399,
       -0.23105499,  0.2285066 , -0.1360454 , -0.193236  , -0.42464201,
       -0.1383896 , -0.20321819,  0.504094  ,  0.0874442 , -0.0654232 ,
        0.084188  ,  0.2607762 ,  0.06747678, -0.11143244,  0.13610542,
        0.0172552 , -0.0733664 , -0.040266  ,  0.082984  , -0.307465  ,
        0.08035204, -0.13782518,  0.263822  , -0.130928  , -0.265558  ,
        0.041311  , -0.249024  , -0.1387842 , -0.2848    , -0.2122302 ,
        0.0458996 , -0.22237   , -0.0132874 , -0.1249018 ,  0.39228599,
       -0.21706681, -0.21440177,  0.3210316 , -0.47030999, -0.33631383,
       -0.2332706 , -0.0390812 ,  0.08309986, -0.107676  ,  0.0093338 ,
       -0.0226178 ,  0.165766  ,  0.057472  , -0.0851216 ,  0.0241008 ,
        0.09513   ,  0.126258  , -0.139444  ,  0.211626  , -0.391036  ,
        0.1594172 , -0.0602718 , -0.24749739,  0.0504366 , -0.2312028 ,
        0.53387199, -0.19293864, -0.1390358 ,  0.2534996 , -0.2511718 ,
       -0.0130448 , -0.1585984 ,  0.12275732, -0.30499041,  0.03531948,
        0.11372   ,  0.09131801, -0.358166  ,  0.156076  , -0.1561332 ,
        0.0183022 ,  0.16250619,  0.045286  , -0.48574999,  0.1268312 ,
        0.064976  ,  0.1773104 ,  0.1984662 ,  0.1406944 , -0.2192226 ,
        0.103485  , -0.06491428, -0.0990716 ,  0.005734  ,  0.414386  ,
        0.1544144 , -0.191814  , -0.2675194 ,  0.0653194 ,  0.1829872 ,
        0.290616  , -0.008922  ,  0.2243556 ,  0.0697846 ,  0.07361981,
       -0.1629504 ,  0.17001   , -0.52244201, -0.0185994 ,  0.2284724 ,
       -0.0518902 ,  0.1932424 , -0.3130024 ,  0.0506188 ,  0.050459  ,
        0.29641   , -0.2854852 ,  0.000702  ,  0.0064622 , -0.36655999,
       -0.1739664 ,  0.01311908, -0.25799618,  0.129018  , -0.53156001])

In [223]:
vp = np.array([ -3.02941972e-02,  -8.59100111e-03,   4.35969993e-02,
        -2.22849201e-01,   1.22254001e-01,  -1.06640801e-01,
         1.34760812e-03,  -8.14858001e-02,   1.72767997e-02,
         1.15293998e-01,   1.73920065e-02,  -1.63480705e-01,
        -1.13961402e-01,   1.15037603e-01,   1.21960401e-01,
         4.15838003e-02,  -7.77953982e-02,  -6.54366042e-02,
         3.76957987e-02,   1.21770003e-01,   1.58715390e-02,
         4.89195943e-02,   1.99452013e-02,  -3.36503997e-01,
         1.16934001e-02,   1.76313998e-01,   1.55097196e-01,
         1.58909991e-02,   9.59161982e-02,  -5.03308028e-02,
         1.49623793e-01,  -6.66974042e-02,   1.59631262e-01,
        -9.77100046e-02,   9.48232024e-02,  -1.90816000e-02,
         4.18719370e-04,   2.33268005e-01,  -4.23953988e-02,
        -1.95430758e-01,  -2.04257934e-01,   1.84860102e-01,
         7.73687966e-02,  -9.36580002e-02,  -6.54597998e-02,
        -2.12840001e-01,   7.58451991e-02,   4.73899983e-02,
        -3.64982972e-01,   3.92242002e-01,  -1.26253341e-01,
         1.44718400e-01,  -9.11842000e-02,  -6.15880042e-02,
         3.53219971e-02,   5.27192026e-02,  -1.51871882e-01,
        -7.40377992e-02,   1.33347379e-01,  -1.81688800e-01,
        -1.49678596e-01,  -9.12774041e-02,  -9.50460583e-03,
         1.70257202e-01,  -1.44753997e-01,   6.11200184e-03,
        -1.82210001e-01,  -1.01643399e-01,   7.34320007e-02,
         2.55164408e-01,  -1.25014382e-01,   8.76006022e-02,
         1.27669797e-01,   2.66392007e-02,   1.05578603e-01,
         8.91500056e-02,   1.82932400e-01,  -3.71673995e-01,
        -4.30997986e-02,   1.18699200e-01,   5.81125978e-02,
         6.59593992e-02,   3.25065194e-01,   5.47260195e-03,
         1.22292000e-01,   7.01103985e-02,   7.75454196e-02,
        -2.03751989e-02,   1.58489595e-01,   5.15808001e-02,
        -2.43540019e-02,  -6.97748013e-02,   1.03765604e-01,
        -1.25674799e-01,  -1.28850801e-01,  -1.58620209e-01,
         1.05441963e-01,   2.85204399e-01,  -3.20115405e-01,
        -5.80465978e-02,   3.01855996e-02,   7.44797960e-02,
        -1.39411971e-02,  -2.11659598e-01,  -2.55691964e-02,
         1.01462799e-01,  -4.62768468e-02,  -1.12397605e-01,
         1.36114001e-02,  -3.62025987e-02,  -2.38461399e-01,
         2.83709988e-02,  -9.81211968e-02,   7.13587988e-02,
        -8.15757971e-02,  -2.86987804e-01,   1.97773999e-01,
         9.33279991e-02,  -2.29008194e-01,   8.62035990e-02,
         6.84594207e-02,   4.00867991e-02,   5.03083967e-02,
         1.81129947e-02,   8.12834021e-02,  -4.10608606e-01,
         2.68722002e-01,   2.48331999e-02,  -2.65882002e-01,
         1.68868003e-01,   2.62938000e-01,  -1.06025399e-01,
        -6.48609944e-02,  -1.61037799e-01,   6.46590026e-02,
        -1.34562597e-01,  -2.21677999e-01,   1.00724497e-01,
         7.02159740e-03,   9.17319804e-03,  -2.07966000e-02,
        -3.59799616e-02,   2.35068206e-01,  -3.89145970e-02,
         1.07817200e-01,   1.11474000e-01,  -1.32101204e-01,
         2.03666609e-01,  -1.09156597e-01,  -1.37965605e-01,
         1.67948003e-01,  -1.12996027e-02,  -1.15617201e-01,
         1.21682998e-01,   8.07360008e-02,   1.20547399e-01,
        -5.82441993e-02,  -1.04260057e-02,   1.32436007e-02,
         1.96243999e-01,  -1.50959599e-01,   2.09732004e-02,
         8.09341988e-02,   5.01667976e-02,   9.24072243e-02,
         5.53332814e-02,  -4.11905191e-01,  -1.33904599e-01,
         1.27767199e-01,  -2.31411402e-01,   1.49089199e-01,
        -1.67326797e-01,   5.85971985e-02,   1.79354400e-01,
        -1.90839995e-02,   2.89904399e-01,   8.04782014e-02,
         3.73840891e-04,   4.26323988e-02,   3.71357985e-02,
        -1.68309002e-01,   1.42635800e-01,  -7.14156006e-02,
        -1.43311399e-01,  -2.24270808e-01,  -2.32027989e-02,
        -1.69024394e-01,   3.70714599e-01,  -6.60784006e-02,
        -1.17824599e-01,   2.42806603e-01,   1.63728006e-02,
         7.11437999e-03,  -2.40348026e-02,  -1.19598024e-02,
         3.91033992e-02,  -8.98745969e-02,  -1.01110402e-01,
        -4.57934779e-02,  -2.45543065e-01,   1.43499441e-01,
         1.87253090e-02,   1.33474001e-01,  -2.54519612e-02,
        -3.14360075e-03,   7.02473976e-02,  -1.26675203e-01,
        -1.02920400e-01,  -1.69724999e-01,  -1.46552026e-02,
        -8.26387942e-02,  -4.73771989e-02,  -9.38656025e-02,
        -8.79024003e-02,   2.53689995e-01,  -6.36480026e-02,
        -2.64442006e-01,   3.38723802e-01,  -1.93343993e-01,
        -2.48374401e-01,   8.98940042e-03,   9.22016054e-02,
         6.99038606e-02,  -9.31808025e-02,  -4.87245996e-02,
        -8.00629981e-02,   1.11128004e-01,  -2.38869973e-02,
         3.58299864e-03,   1.48677940e-01,   4.77210049e-02,
         1.24727000e-01,  -1.38971798e-01,   1.13520003e-01,
        -3.22983998e-01,   8.33641980e-02,  -1.03348602e-01,
        -1.58010796e-01,  -9.92539994e-02,  -2.71680202e-01,
         1.16012198e-01,  -2.00044398e-01,  -9.03685991e-02,
         1.12554805e-01,   6.23582035e-02,   4.44129992e-02,
        -1.70414004e-01,   1.58297000e-01,  -1.89169202e-01,
         4.01613386e-02,  -2.52391998e-02,  -3.58281970e-02,
        -3.01191998e-01,   2.49625406e-01,  -3.36730003e-01,
         1.44854401e-01,   2.25159992e-01,   1.51213999e-01,
        -4.21675998e-01,   1.14360196e-01,   2.76694000e-02,
         1.71860757e-01,   7.67917994e-02,   1.59523421e-01,
        -2.57122804e-01,   2.66726004e-02,   1.58339021e-02,
        -7.17266008e-02,  -5.74267991e-02,   1.63990400e-01,
         1.01509798e-01,  -1.53264996e-01,  -1.47058201e-01,
         1.55562998e-01,   2.11387996e-01,   4.08735994e-01,
         6.40900105e-02,   1.42989001e-01,   2.06137206e-01,
         2.38131724e-01,  -2.16013598e-01,   1.74998404e-01,
        -2.16620004e-01,  -7.58000314e-03,   1.96230003e-01,
        -3.66898011e-02,   1.10754202e-01,  -1.73872402e-01,
        -1.70517981e-02,  -8.94763991e-02,   3.04676002e-01,
        -1.01115800e-01,  -6.57439902e-03,   4.66485992e-02,
        -3.00552399e-01,  -1.47147403e-01,   3.56325399e-02,
        -1.14748199e-01,   3.33572075e-02,  -3.25230007e-01])

In [38]:
preds_score_vecs = lr.predict(X_test)
pred_scors = knn.predict(preds_score_vecs)

In [20]:
X_ = list()
Y_ = list()
y = list()
for rev in df.iterrows():
    try:
        score = rev[1][2]
        vec = parse_np_array(rev[1][-1])
        Y_.append(score_vect_dicts[score])
        y.append(score)
    except:
        continue
    X_.append(vec)
X = np.array(X_)
Y = np.array(Y_)

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train = pad_sequences(X_train, maxlen=300, value=0.)
X_test = pad_sequences(X_test, maxlen=300, value=0.)

In [21]:
X_train2, X_test2, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train2 = pad_sequences(X_train2, maxlen=300, value=0.)
X_test2 = pad_sequences(X_test2, maxlen=300, value=0.)

In [22]:
X_train2.shape, X_test2.shape

((688, 300), (296, 300))

In [24]:
with tf.device("/gpu:0"):
    net = tflearn.input_data([None, 300])
    net = tflearn.embedding(net, input_dim=1000, output_dim=300)
    net = tflearn.lstm(net, 300, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001)
    
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(X_train, Y_train, validation_set(X_test, Y_test), show_metric=True, patch_size=64)

InvalidArgumentError: Node 'init_19/NoOp': Unknown input node '^is_training/Assign'

In [None]:
net = tflearn.input_data([None, 300])
net = tflearn.embedding(net, input_dim=1000, output_dim=300)
net = tflearn.lstm(net, 300, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001)

In [None]:
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(X_train2, y_train, validation_set(X_test, y_test), show_metric=True, patch_size=64)