In [2]:
%matplotlib inline

import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np

import json
import os
import glob
import datetime

# Essay scoring

## Development set

In [3]:
# Gather all params.
N_FOLDS = 5  # fold indices 0..4 appear in the regression_f*/prediction_f* file names


def read_params(path):
    """Parse a file of ``key=value`` lines into a dict of strings.

    Each line is split on the first ``=`` only, so values may themselves
    contain ``=``. Blank lines are skipped. The file is closed on return.
    """
    with open(path) as fh:
        return dict(line.strip().split("=", 1) for line in fh if line.strip())


def summarize_run(run_dir, n_folds=N_FOLDS):
    """Collect the parameters and per-fold MSE values of one run directory.

    Parameters
    ----------
    run_dir : str
        Directory containing ``param.txt`` and, optionally,
        ``regression_f{i}.hdf5`` / ``prediction_f{i}.json`` files.
    n_folds : int
        Number of fold indices (starting at 0) to look for.

    Returns
    -------
    dict
        The parsed parameters plus ``Directory``, ``Time`` (mtime of the
        directory), ``Fold`` (folds with a regression file), ``Eval`` (folds
        with a prediction file), ``MSEs`` (array of per-fold MSE), ``MSE``
        (their mean) and ``MSEstd`` (their standard deviation). ``MSE`` and
        ``MSEstd`` are NaN when no fold has been evaluated.
    """
    prm = read_params(os.path.join(run_dir, "param.txt"))
    prm["Directory"] = os.path.basename(run_dir)
    prm["Time"] = str(datetime.datetime.fromtimestamp(os.stat(run_dir).st_mtime))[:19]
    prm["Fold"] = [
        i for i in range(n_folds)
        if os.path.exists(os.path.join(run_dir, "regression_f{}.hdf5".format(i)))
    ]

    # One pass over the prediction files yields both the evaluated fold list
    # and the MSE values.
    evaluated, fold_mse = [], []
    for i in range(n_folds):
        pjs = os.path.join(run_dir, "prediction_f{}.json".format(i))
        if os.path.exists(pjs):
            with open(pjs) as fh:
                fold_mse.append(json.load(fh)["MSE"])
            evaluated.append(i)
    prm["Eval"] = evaluated

    mse_pool = np.array(fold_mse)
    # mean()/std() of an empty array return NaN but emit RuntimeWarnings;
    # produce the NaN explicitly so unevaluated runs stay quiet.
    has_eval = mse_pool.size > 0
    prm["MSEstd"] = mse_pool.std() if has_eval else np.nan
    prm["MSE"] = mse_pool.mean() if has_eval else np.nan
    prm["MSEs"] = mse_pool
    return prm


results = [
    summarize_run(d)
    for d in glob.glob("../output/*")
    if os.path.exists(os.path.join(d, "param.txt"))
]

cols = "Time Fold Eval MSE MSEstd MSEs di_aware dropout emb_fix enc_fix pseq preenc Directory".split()

pd.set_option("display.max_colwidth", 200)
df = pd.DataFrame(results)
# Param files do not all carry the same keys; add any absent column as NaN so
# the filter and the column selection below cannot raise KeyError.
df = df.reindex(columns=list(df.columns) + [c for c in cols if c not in df.columns])
df = df[df["di_aware"] == "False"]  # param values are strings, hence "False"
df[cols].sort_values(by="Time", ascending=False)

  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Time,Fold,Eval,MSE,MSEstd,MSEs,di_aware,dropout,emb_fix,enc_fix,pseq,preenc,Directory
7,2019-01-08 18:24:11,"[0, 1, 2, 3, 4]","[0, 1, 2, 3, 4]",0.188923,0.023066,"[0.1732169700966405, 0.17464527016437387, 0.16342075147891053, 0.21352521893260346, 0.21980865044014983]",False,0.7,True,True,True,output_enc/750570aed2d16633ecbe4237d2d95b71,fa37bf66f2563eec16382c1eac16a108
3,2019-01-08 17:32:24,[],[],,,[],False,0.7,True,True,True,output_enc/750570aed2d16633ecbe4237d2d95b71,d66f0a27964dd57535957583bec9c684
6,2019-01-08 17:26:40,[0],[0],0.173757,0.0,[0.1737570696140451],False,0.7,False,False,True,output_enc/750570aed2d16633ecbe4237d2d95b71,6f27e738794f9eb73bd73c0c868d4cb2
5,2019-01-08 17:08:49,[],[],,,[],False,0.7,False,False,True,output_enc/750570aed2d16633ecbe4237d2d95b71,2674fd3a66a463d7fd62804995734663
1,2019-01-08 17:06:19,[],[],,,[],False,0.7,False,False,True,output_enc/a87b827fa7c5151192542ecb2c3af4d2,e0ab2e7b43b37719666bb6e1b0e58e10
4,2018-12-28 22:27:22,[1],[1],0.273942,0.0,[0.27394229360393907],False,0.5,False,False,True,,6c41a1e4fe316bada6d0b2b14433374b
0,2018-12-28 22:22:51,[1],[1],0.299845,0.0,[0.2998450864722913],False,0.5,False,False,True,,1f17c78951114d7214f77372b62f4d0a
2,2018-12-28 22:19:10,[1],[1],0.317584,0.0,[0.31758360822801335],False,0.5,True,False,False,,4b51b59d853fff7dca91b9f13b017f86


## Test set

In [6]:
# Gather all results.
def load_run_result(run_dir, fold=1):
    """Return the parameters of one run together with its MSE on ``fold``.

    Reads ``param.txt`` (``key=value`` per line, split on the first ``=``;
    blank lines skipped) and ``prediction_f{fold}.json`` from ``run_dir``.
    Both files are closed before returning. The result is the parameter dict
    extended with ``Directory`` (base name of ``run_dir``) and ``MSE``.
    """
    with open(os.path.join(run_dir, "param.txt")) as fh:
        prm = dict(line.strip().split("=", 1) for line in fh if line.strip())

    with open(os.path.join(run_dir, "prediction_f{}.json".format(fold))) as fh:
        prediction = json.load(fh)

    prm["Directory"] = os.path.basename(run_dir)
    prm["MSE"] = prediction["MSE"]
    return prm


# Only runs that have both their parameters and a fold-1 prediction are listed.
results = [
    load_run_result(d)
    for d in glob.glob("../output/*")
    if os.path.exists(os.path.join(d, "prediction_f1.json"))
    and os.path.exists(os.path.join(d, "param.txt"))
]

df = pd.DataFrame(results)
# reindex (rather than df[[...]]) shows NaN for a key that no run defines
# instead of raising KeyError.
df.reindex(columns=["Directory", "MSE", "preenc", "enc_fix", "emb_fix", "pretrained", "pseq"])

Unnamed: 0,Directory,MSE,preenc,enc_fix,emb_fix,pretrained,pseq
0,1f17c78951114d7214f77372b62f4d0a,0.299845,,False,False,False,True
1,4b51b59d853fff7dca91b9f13b017f86,0.317584,,False,True,True,False
2,6c41a1e4fe316bada6d0b2b14433374b,0.273942,,False,False,False,True
3,fa37bf66f2563eec16382c1eac16a108,0.174645,output_enc/750570aed2d16633ecbe4237d2d95b71,True,True,False,True


# Encoder pretraining

In [26]:
# Gather all params.
def summarize_encoder_run(run_dir):
    """Collect parameters and best accuracies of one encoder-pretraining run.

    Reads ``param.txt`` (``key=value`` per line, split on the first ``=``;
    blank lines skipped) and ``logs.pickle`` from ``run_dir``; both files are
    closed before returning. The result is the parameter dict extended with
    ``Directory``, ``Time`` (mtime of the directory), and ``R_acc`` /
    ``R_val_acc``: the maxima of the ``acc`` / ``val_acc`` columns of the
    ``logs_e`` entry in the pickle.
    """
    with open(os.path.join(run_dir, "param.txt")) as fh:
        prm = dict(line.strip().split("=", 1) for line in fh if line.strip())

    prm["Directory"] = os.path.basename(run_dir)
    prm["Time"] = str(datetime.datetime.fromtimestamp(os.stat(run_dir).st_mtime))[:19]

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at log files you produced yourself.
    with open(os.path.join(run_dir, "logs.pickle"), "rb") as fh:
        logs = pickle.load(fh)

    epoch_logs = pd.DataFrame(logs["logs_e"])
    prm["R_acc"] = epoch_logs["acc"].max()
    prm["R_val_acc"] = epoch_logs["val_acc"].max()
    return prm


results = [
    summarize_encoder_run(d)
    for d in glob.glob("../output_enc/*")
    if os.path.exists(os.path.join(d, "param.txt"))
]

cols = "Time R_acc R_val_acc dropout emb_fix enc_fix shuf Directory".split()

pd.set_option("display.max_colwidth", 50)
df = pd.DataFrame(results)
# reindex (rather than df[cols]) shows NaN for a key that no run defines
# instead of raising KeyError.
df.reindex(columns=cols).sort_values(by="Time", ascending=False)

Unnamed: 0,Time,R_acc,R_val_acc,dropout,emb_fix,enc_fix,shuf,Directory
3,2018-12-27 14:18:45,0.993037,0.903384,0.7,False,False,di,a87b827fa7c5151192542ecb2c3af4d2
5,2018-12-27 13:25:00,0.986483,0.825873,0.5,False,False,di,6abbf82fd461ebcd2ac61867427b2a1e
1,2018-12-27 13:02:47,0.996313,0.853712,0.3,False,False,di,da6836f961365a7b348281a00e48bc34
2,2018-12-27 11:37:36,0.9929,0.936681,,,,,clipnorm=5.0_dropout=0.7_emb_dim=50_emb_fix=Fa...
4,2018-12-27 11:32:34,0.902512,0.743996,,,,,clipnorm=5.0_dropout=0.7_emb_dim=50_emb_fix=Fa...
0,2018-12-27 11:32:27,0.925587,0.742904,,,,,clipnorm=5.0_dropout=0.5_emb_dim=50_emb_fix=Fa...


In [2]:
### Score_normalized
# NOTE: the command below is shell, not Python. Executing this cell in the
# Python kernel stops with "SyntaxError: invalid syntax" at the command line;
# run it from a terminal instead.
# NOTE(review): this command passes --aggregation-LSTMdim, whereas the
# commands under "Command repo" pass --aggregation-grudim. Confirm which flag
# src/train.py accepts.
# TN16
CUDA_VISIBLE_DEVICES=0 python src/train.py \
    --fold 1 \
    --model-type nea --dropout 0.5 \
    --embedding-dim 50 --aggregation-LSTMdim 300 \
    --gradientclipnorm 5 --meanovertime \
    --pre-trained --fix-embedding

fold_0: MSE: 0.3643280911204178, MAE: 0.4622013795375824
fold_1: MSE: 0.3197095480935262, MAE: 0.39644437405600474
fold_2: MSE: 0.4364101866079787, MAE: 0.438516518369836   
fold_3: MSE: 0.32989512169457946, MAE: 0.4378773703503965
fold_4: MSE: 0.3643280911204178, MAE: 0.4622013795375824
            
MSE: 0.362

SyntaxError: invalid syntax (<ipython-input-2-e702d4088bd6>, line 3)

In [None]:
fold0: MSE: 0.19625852776829697 MAE: 0.3492366951704025
fold1: 

In [5]:
# Gather all results.
def gather_fold_results(pattern="../output/*", fold=1):
    """Gather parameters and MSE for every run evaluated on ``fold``.

    Parameters
    ----------
    pattern : str
        Glob pattern matching the run directories.
    fold : int
        Fold index whose ``prediction_f{fold}.json`` is read.

    Returns
    -------
    list of dict
        One dict per run directory that contains both ``param.txt`` and the
        prediction file: the ``key=value`` parameters (split on the first
        ``=``; blank lines skipped) plus ``MSE`` from the prediction file.
        Directories missing either file are skipped.
    """
    prediction_name = "prediction_f{}.json".format(fold)
    gathered = []

    for run_dir in glob.glob(pattern):
        param_path = os.path.join(run_dir, "param.txt")
        prediction_path = os.path.join(run_dir, prediction_name)
        if not (os.path.exists(param_path) and os.path.exists(prediction_path)):
            continue

        # Context managers close each file as soon as it has been read.
        with open(param_path) as fh:
            prm = dict(line.strip().split("=", 1) for line in fh if line.strip())
        with open(prediction_path) as fh:
            prm["MSE"] = json.load(fh)["MSE"]

        gathered.append(prm)

    return gathered


results = gather_fold_results()

df = pd.DataFrame(results)
# reindex (rather than df[[...]]) shows NaN for a key that no run defines
# instead of raising KeyError.
df.reindex(columns=["MSE", "preenc", "enc_fix", "emb_fix", "pretrained", "pseq"])

Unnamed: 0,MSE,preenc,enc_fix,emb_fix,pretrained,pseq
0,0.335248,,False,True,True,False
1,0.31971,,False,True,True,False


# Command repo

In [1]:
# Command for training:
# NOTE: everything in this cell is shell, not Python. Executing the cell in the
# Python kernel stops with "SyntaxError: invalid syntax" at the first command;
# copy the commands into a terminal instead. The relative src/ and output_enc/
# paths suggest they are meant to be run from the repository root (confirm).

# TN16
CUDA_VISIBLE_DEVICES=0 python src/train.py \
    --fold 1 \
    --model-type nea --dropout 0.5 \
    --embedding-dim 50 --aggregation-grudim 300 \
    --gradientclipnorm 5 --meanovertime \
    --pre-trained --fix-embedding

# TN16+PN10
# Compared with TN16: --aggregation-grudim is 100 instead of 300 and the
# --persing-seq / --pseq-* options are added.
CUDA_VISIBLE_DEVICES=0 python src/train.py \
    --fold 1 \
    --model-type nea --dropout 0.5 \
    --embedding-dim 50 --aggregation-grudim 100 \
    --gradientclipnorm 5 --meanovertime \
    --pre-trained --fix-embedding \
    --persing-seq --pseq-embedding-dim 16 --pseq-encoder-dim 64

# TN16+PN10+pretrain(di. shuffle, fixed)
# Compared with TN16+PN10: --pre-trained is dropped, --fix-encoder is added and
# the encoder is loaded from the directory given to --pretrained-encoder.
CUDA_VISIBLE_DEVICES=0 python src/train.py \
    --fold 1 \
    --model-type nea --dropout 0.5 \
    --embedding-dim 50 --aggregation-grudim 100 \
    --gradientclipnorm 5 --meanovertime \
    --persing-seq --pseq-embedding-dim 16 --pseq-encoder-dim 64 \
    --fix-encoder --fix-embedding \
    --pretrained-encoder output_enc/a87b827fa7c5151192542ecb2c3af4d2

# TN16+PN10+pretrain(di. shuffle, not fixed)
# Same as the previous command without --fix-encoder --fix-embedding.
CUDA_VISIBLE_DEVICES=0 python src/train.py \
    --fold 1 \
    --model-type nea --dropout 0.5 \
    --embedding-dim 50 --aggregation-grudim 100 \
    --gradientclipnorm 5 --meanovertime \
    --persing-seq --pseq-embedding-dim 16 --pseq-encoder-dim 64 \
    --pretrained-encoder output_enc/a87b827fa7c5151192542ecb2c3af4d2

# TN16+PN10+pretrain(sent. shuffle, fixed)
# Uses CUDA device 1 and an encoder directory whose name ends in shuf=sentence.
CUDA_VISIBLE_DEVICES=1 python src/train.py \
    --fold 1 \
    --model-type nea --dropout 0.5 \
    --embedding-dim 50 --aggregation-grudim 100 \
    --gradientclipnorm 5 --meanovertime \
    --persing-seq --pseq-embedding-dim 16 --pseq-encoder-dim 64 \
    --fix-encoder --fix-embedding \
    --pretrained-encoder output_enc/clipnorm=5.0_dropout=0.7_emb_dim=50_emb_fix=False_enc_fix=False_model_type=nea_mot=True_pretrained=False_shuf=sentence

# TN16+PN10+pretrain(sent. shuffle, not fixed)
# Same as the previous command without --fix-encoder --fix-embedding.
CUDA_VISIBLE_DEVICES=1 python src/train.py \
    --fold 1 \
    --model-type nea --dropout 0.5 \
    --embedding-dim 50 --aggregation-grudim 100 \
    --gradientclipnorm 5 --meanovertime \
    --persing-seq --pseq-embedding-dim 16 --pseq-encoder-dim 64 \
    --pretrained-encoder output_enc/clipnorm=5.0_dropout=0.7_emb_dim=50_emb_fix=False_enc_fix=False_model_type=nea_mot=True_pretrained=False_shuf=sentence

SyntaxError: invalid syntax (<ipython-input-1-dba85e4206fa>, line 4)

In [None]:
# Command for sentence encoder pretraining:
# NOTE: these are shell commands, not Python; run them from a terminal rather
# than executing this cell. The two commands differ only in --shuffle-type
# (di vs. sentence).
CUDA_VISIBLE_DEVICES=1 python src/train_enc.py \
    --model-type nea --dropout 0.3 \
    --embedding-dim 50 --aggregation-grudim 100 \
    --gradientclipnorm 5 --meanovertime \
    --shuffle-type di

CUDA_VISIBLE_DEVICES=1 python src/train_enc.py \
    --model-type nea --dropout 0.3 \
    --embedding-dim 50 --aggregation-grudim 100 \
    --gradientclipnorm 5 --meanovertime \
    --shuffle-type sentence

In [None]:
# Command for evaluation
# NOTE: this is a shell command, not Python; run it from a terminal rather
# than executing this cell. --model-dir names one run directory under output/;
# presumably it should be replaced with the run to evaluate (confirm).
CUDA_VISIBLE_DEVICES=1 python src/eval.py \
    --fold 1 \
    --model-dir output/cbc428f99e04b33c2dcb221e7331e07d