Imports

In [2]:
import os

import numpy as np
import pandas as pd
from macbook.lib.utils import preprocess, preprocess_for_bert, build_glove_embeddings, save_embeddings, save_other_features, log_transform
from discovery.utils import load_features_stacked, load_feature_name_2_idx, get_sorted_mi
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

[nltk_data] Downloading package wordnet to /Users/pickle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pickle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to /Users/pickle/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [5]:
# `job` names the current run; `job_prev` names the earlier run whose training
# artifacts are read back when the model is fitted further down.
job = 'cucumber'
job_prev = 'pickle'

# Per-job output directories. os.makedirs(..., exist_ok=True) creates any missing
# parent directories and does nothing when the directory already exists, so this
# cell is safe to re-run and works on a fresh checkout.
path = './data/'+job
os.makedirs(path, exist_ok=True)
for subdirectory in ['macbook', 'discovery']:
    path = './'+subdirectory+'/data/'+job
    # The job directory itself is needed under both sub-projects: later cells
    # write into ./macbook/data/<job>/ and ./discovery/data/<job>/.
    os.makedirs(path, exist_ok=True)
    if subdirectory == 'discovery':
        # discovery keeps one folder per upstream source of features/scores.
        os.makedirs(path+'/macbook', exist_ok=True)
        os.makedirs(path+'/colab', exist_ok=True)

# Write a one-row, header-less placeholder TSV for any missing train/dev split
# (presumably so the MT-DNN preprocessing step finds every split - confirm).
# Existing files are never overwritten.
for split in ['train', 'dev']:
    path = './colab/new-mt-dnn/data_complex/lcp_'+split+'.tsv'
    if not os.path.isfile(path):
        dummy = pd.DataFrame([[0, 0.0, 'This is a pretty easy sentence.', 'easy']])
        dummy.to_csv(path, sep='\t', index=False, header=False, encoding='utf-8')

# Load new data

In [6]:
# `mode` is compared against 'multi' in later cells to switch on multi-word handling.
mode = 'single'

# One-row input frame: the sentence and the target token inside it.
data = pd.DataFrame({
    'sentence': ['This is a pretty easy sentence.'],
    'token': ['easy'],
})

In [4]:
%cd ./macbook/
preprocess(data, './data/'+job+'/'+mode+'_data_p.tsv')
preprocess_for_bert(data, job, mode+'_data', do_round=False)
config = {
    'glove_path': './lib/glove/glove.6B.300d.txt',
    'glove_lower': True,
    'disambiguate': True,
    'infersent_V': 1,
    'infersent_MODEL_PATH': './lib/encoder/infersent%s.pkl',
    'infersent_W2V_PATH': './lib/glove/glove.6B.300d.txt',
    'infersent_lower': True,
    'options_file': './lib/ELMo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json',
    'weight_file': './lib/ELMo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
}
glove_embeddings = build_glove_embeddings(config)
save_embeddings(data, glove_embeddings, config, './data/'+job+'/'+mode+'_data_d.tsv', multi=mode=='multi')
save_other_features(data, None, config, './data/'+job+'/'+mode+'_data_o.tsv', multi=mode=='multi')
log_transform(data, 'all', './data/'+job+'/'+mode+'_data_o.tsv')
%cd ../

995it [00:00, 9948.52it/s]/Users/pickle/cs99/BigGreen-at-LCP-2021/macbook
400000it [00:28, 14247.69it/s]
Found 7(/9) words with w2v vectors
Vocab size : 7
100%|██████████| 1/1 [00:00<00:00,  3.46it/s]
100%|██████████| 34/34 [00:00<00:00, 160.68it/s]/Users/pickle/cs99/BigGreen-at-LCP-2021



In [None]:
%cd discovery/
# TODO: placeholder - no discovery-side step is implemented here yet; the cell
# currently only enters ./discovery/ and immediately steps back out.
%cd ../

In [29]:
!cp "./macbook/data/{job}/{mode}_data_bert.tsv" "./colab/new-mt-dnn/data_complex/lcp_test.tsv"
!cp "./macbook/data/{job}/{mode}_data_o.tsv" "./discovery/data/{job}/macbook/{mode}_data_o.tsv"
%cd ./colab/new-mt-dnn/
!python prepro_std.py --model bert-base-cased --root_dir data_complex/ --task_def data_complex/lcp.yml
if mode=='multi':
    !python predict.py --task_def "data_complex/lcp.yml" --task lcp --task_id 0 --prep_input "data_complex/bert_base_cased/lcp_test.json" --score "../../discovery/data/{job}/colab/{mode}_data_bert_scores.json" --checkpoint "checkpoints/bert-cased_lcp-single_2021-01-19T0332/model_3.pt"
else:
    !python predict.py --task_def "data_complex/lcp.yml" --task lcp --task_id 0 --prep_input "data_complex/bert_base_cased/lcp_test.json" --score "../../discovery/data/{job}/colab/{mode}_data_bert_scores.json" --checkpoint "checkpoints/bert-cased_lcp-single_2021-01-19T0309/model_4.pt"
%cd ../../

/Users/pickle/cs99/BigGreen-at-LCP-2021/colab/new-mt-dnn
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
03/28/2021 05:53:43 Task lcp
03/28/2021 05:53:43 data_complex/bert_base_cased/lcp_train.json
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
03/28/2021 05:53:43 data_complex/bert_base_cased/lcp_dev.json
03/28/2021 05:53:43 data_complex/bert_base_cased/lcp_test.json
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Loaded 1 samples out of 1
/Users/pickle/cs99/BigGreen-at-LCP-2021


# Train model

In [46]:
%cd ./discovery/
single_train = pd.read_csv('./data/'+job_prev+'/single_train_j.tsv', sep='\t', index_col=0)
X = load_features_stacked(job_prev, 'single_train')
y = single_train['complexity'].to_numpy()
single_train_feature_name_2_idx = load_feature_name_2_idx(job_prev, 'single_train')
single_train_feature_idx_2_name = {idx: feature_name for feature_name, idx in single_train_feature_name_2_idx.items()}
feature_names, mi = get_sorted_mi('./data/'+job_prev+'/single_train_mi.txt')
picks = feature_names[:300]
picks = [pick for pick in picks if X[:, single_train_feature_name_2_idx[pick]].std() != 0]
col_idx = np.array([single_train_feature_name_2_idx[pick] for pick in picks])
selector = VarianceThreshold(threshold=0.01)  # 0.1 indicates 99% of observations approximately
_ = selector.fit(X[:, col_idx])  # fit finds the features with low variance
picks = [pick for i, pick in enumerate(picks) if selector.get_support()[i]]
col_idx = np.array([single_train_feature_name_2_idx[pick] for pick in picks])
X_picks = X[:, col_idx]
scaler = StandardScaler()
X_picks_scaled = scaler.fit_transform(X_picks)
model = XGBRegressor(colsample_bytree=0.7,learning_rate=0.03,max_depth=5,min_child_weight=4,n_estimators=225,nthread=4,objective='reg:linear',silent=1,subsample=0.7)
model = model.fit(X_picks_scaled, y)
%cd ../

/Users/pickle/cs99/BigGreen-at-LCP-2021/discovery
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


/Users/pickle/cs99/BigGreen-at-LCP-2021


# Make predictions