# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data, 
retrain Sherlock and generate predictions.

In [None]:
import pandas as pd
import numpy as np
import sys
import tensorflow as tf
import matplotlib.pyplot as plt
from ast import literal_eval
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
sys.path.append("..")

In [None]:
from src.features.build_features import build_features
from src.features.build_features import _get_data
from src.deploy.train_sherlock import train_sherlock
from src.deploy.predict_sherlock import predict_sherlock

## Download data: the original raw values and the preprocessed data

In [None]:
# Download the datasets used by this notebook (raw values and preprocessed data, per the section title).
_get_data()

### Get raw data values (to use the preprocessed data instead, skip these steps and scroll down)

In [3]:
# Raw column values for the training split.
train_vals = pd.read_parquet('../data/raw/train_values.parquet')

In [None]:
# Labels for the training split.
train_labs = pd.read_parquet('../data/raw/train_labels.parquet')

In [None]:
# Raw column values and labels for the validation split.
val_vals = pd.read_parquet('../data/raw/val_values.parquet')
val_labs = pd.read_parquet('../data/raw/val_labels.parquet')

In [21]:
# Raw column values and labels for the test split.
test_vals = pd.read_parquet('../data/raw/test_values.parquet')
test_labs = pd.read_parquet('../data/raw/test_labels.parquet')

In [10]:
# Preview the raw training values.
train_vals.head()

Unnamed: 0,values
20368,"['Central Missouri', 'unattached', 'unattached..."
664102,"[95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, ..."
366813,"['Katie Crews', 'Christian Hiraldo', 'Alex Est..."
530567,"['Christian', 'Non-Christian', 'Unreported', '..."
176253,"['AAF-McQuay Canada Inc.', 'AAF-McQuay Canada ..."


In [11]:
# Preview the training labels.
train_labs.head()

Unnamed: 0,type
20368,affiliation
664102,weight
366813,jockey
530567,religion
176253,company


## Extract features

In [4]:
# Parse each split's stringified value lists into Python objects. The tuple of
# source frames is built before any name is rebound, so every split is read
# from its original DataFrame.
train_vals, val_vals, test_vals = [
    split['values'].apply(literal_eval)
    for split in (train_vals, val_vals, test_vals)
]

In [5]:
# Preview the parsed training values (now a Series of Python lists).
train_vals.head()

55030                    [Global, United States, Australia]
167000    [Fiction, Adult - Non-Floating, Fiction, Adult...
638282    [, , University of Puerto Rico - Rio Piedras, ...
232298    [Laughology, MTV, With Intent to Kill, Comedy ...
316158    [Mare, Gelding, Gelding, Gelding, Gelding, Mar...
Name: values, dtype: object

In [None]:
# Extract the training feature matrix from the parsed values; prints progress while extracting.
X_train = build_features(train_vals)

Preparing feature extraction by downloading 3 files: 
 ../src/features/glove.6B.50d.txt, 
 ../src/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy and 
 ../data_fake/data_fake.zip.
GloVe word embedding vectors were downloaded.
Trained paragraph vector model was downloaded.
Downloading data directory.
Downloading 1-caOvAP5IB_QMqw4Jx5RO4xOsFliEC3Q into ../data_fake/data_fake.zip... 
0.0 B Done.
Unzipping...Done.
Data was downloaded.
Extracting features for data column  100
Extracting features for data column  200
Extracting features for data column  300
Extracting features for data column  400
Extracting features for data column  500
Extracting features for data column  600
Extracting features for data column  700
Extracting features for data column  800
Extracting features for data column  900
Extracting features for data column  1000
Extracting features for data column  1100
Extracting features for data column  1200
Extracting features for data column  1300
Extracting features

In [None]:
# Save the extracted training features to disk.
X_train.to_parquet('X_train_13.parquet')

In [None]:
# Flatten the label frame into a 1-D array of labels.
y_train = train_labs.values.flatten()

In [None]:
# Build validation/test features from the parsed value columns (not from the
# label frames), mirroring how X_train is built from train_vals.
X_val = build_features(val_vals)
X_test = build_features(test_vals)

# Flatten the label frames into 1-D label arrays.
y_val = val_labs.values.flatten()
y_test = test_labs.values.flatten()

## Impute NaN values with feature means

In [None]:
# Store the per-feature training means as a single-row CSV (one column per feature).
X_train.mean().to_frame().T.to_csv('train_column_means.csv')

In [150]:
# Reload the stored training means; the first CSV column holds the row index.
train_columns_means = pd.read_csv('train_column_means.csv', index_col=0)

In [154]:
# Impute missing values in every split with the training-set feature means,
# so validation and test data are filled with the same values as training data.
fill_values = train_columns_means.iloc[0]
for features in (X_train, X_val, X_test):
    features.fillna(fill_values, inplace=True)

### Train Sherlock
Use an `nn_id` other than `'sherlock'` when retraining, so the original Sherlock model is not overwritten.

In [None]:
# Retrain under a new nn_id so the original 'sherlock' model is not overwritten.
# X_val/y_val are presumably used for validation during training (TODO confirm).
# The trailing `;` suppresses the cell's output repr.
train_sherlock(X_train, y_train, X_val, y_val, nn_id='retrain_sherlock');
print('Trained and saved new model.')

### Generate predictions with the retrained model

In [None]:
# Predict test labels with the retrained model and print them next to the true labels.
predicted_labels = predict_sherlock(X_test, nn_id='retrain_sherlock')
print('Predicted labels: ', predicted_labels, 'true labels: ', y_test)

In [None]:
# Weighted-average F1 score of the retrained model on the test split.
f1_score(y_test, predicted_labels, average='weighted')

## Generate predictions with preprocessed data using Sherlock

In [20]:
# Load the preprocessed test features and labels; reset_index(drop=True)
# replaces the labels' stored index with a fresh 0..n-1 index.
X_test_preprocessed = pd.read_parquet('../data/processed/X_test.parquet')
y_test_preprocessed = pd.read_parquet('../data/processed/y_test.parquet').reset_index(drop=True)

In [21]:
# Preview the preprocessed test features.
X_test_preprocessed.head()

Unnamed: 0,col_entropy,frac_unique,frac_numcells,frac_textcells,avg_num_cells,std_num_cells,avg_text_cells,std_text_cells,avg_spec_cells,std_spec_cells,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,2.122181,0.005,0.0,1.0,0.0,0.0,12.29,5.077194,0.0,0.0,...,0.023563,-0.029472,0.002835,0.090851,-0.125505,-0.027747,0.028412,-0.078901,0.054292,-0.049115
1,3.817487,0.015,1.0,0.0,2.058,0.233743,0.0,0.0,0.0,0.0,...,0.244085,-0.055574,0.0176,0.079978,-0.014825,0.006086,0.121871,-0.078689,-0.069111,-0.11255
2,3.166061,0.009,0.12,1.0,0.12,0.324962,11.527,2.68873,0.0,0.0,...,0.018266,-0.088117,-0.048036,-0.011286,-0.109643,-0.070223,-0.009666,-0.081991,-0.041528,-0.094458
3,2.316887,0.005,0.0,1.0,0.0,0.0,9.053,1.960151,0.0,0.0,...,-0.063415,-0.000197,0.01202,-0.033859,0.063092,0.075499,-0.009511,-0.070606,0.061907,0.065065
4,6.955528,0.163,0.018,1.0,0.072,0.531804,20.268,9.593132,0.0,0.0,...,0.015399,-0.213604,0.0291,-0.009626,-0.154028,-0.09047,-0.01395,0.036592,-0.139673,-0.11543


In [22]:
# Preview the preprocessed test labels.
y_test_preprocessed.head()

Unnamed: 0_level_0,label
index,Unnamed: 1_level_1
511600,affiliation
146358,weight
665579,jockey
148486,religion
3546,company


Testing Sherlock with data other than the preprocessed data files might yield inconsistent results due to a changed feature extraction pipeline. <br> 
The model will be retrained with data consistent with the new feature extraction pipeline soon. <br>
For now the preprocessed train, validation and test data can be used to reproduce the results.

In [None]:
# Predict with the original model on the preprocessed test features.
# NOTE(review): 'sherlock' is passed positionally here; presumably it is the
# nn_id, as in the keyword call used for the retrained model — confirm.
predicted_labels = predict_sherlock(X_test_preprocessed, 'sherlock')
print('Predicted labels: ', predicted_labels, 'true labels: ', y_test_preprocessed)

In [None]:
# Weighted-average F1 score of the original model on the preprocessed test split.
f1_score(y_test_preprocessed, predicted_labels, average='weighted')