### The following notebook uses the Keras wrapper library ktrain to build a BERT classifier 

ktrain : 
@article{maiya2020ktrain,
         title={ktrain: A Low-Code Library for Augmented Machine Learning},
         author={Arun S. Maiya},
         journal={arXiv},
         year={2020},
         volume={arXiv:2004.10703 [cs.LG]}
}

BERT : https://arxiv.org/abs/1810.04805

Keras : 
@misc{chollet2015keras,
  title={Keras},
  author={Chollet, Fran\c{c}ois and others},
  year={2015},
  howpublished={\url{https://keras.io}},
}

Sklearn (used only for train-test split): 
@article{scikit-learn,
 title={Scikit-learn: Machine Learning in {P}ython},
 author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
 journal={Journal of Machine Learning Research},
 volume={12},
 pages={2825--2830},
 year={2011}
}

#### requirements :  
pip install ktrain

In [2]:
# ktrain must be installed before it can be imported: pip install ktrain
import ktrain
from ktrain import text
import pandas as pd
from proj1_helpers import create_csv_submission
from load_utils import load_tweets

In [4]:
# Load the full training set via the project helper. Its printed summary (below)
# reports a DataFrame with the columns 'text' and 'label'.
tweets=load_tweets(full=True)

loaded 2500000 tweets in dataframe with columns: Index(['text', 'label'], dtype='object')


In [18]:
# Start the modelling frame from the label column, swapping the numeric
# classes for string names in a single pass: 0 -> 'neg', 1 -> 'pos'.
df_tweets = tweets['label'].replace({0: 'neg', 1: 'pos'}).to_frame('label')

In [22]:
# Sanity check: count the rows whose label is one of the two expected class
# names; the result should equal the number of loaded tweets.
df_tweets['label'].isin(['pos', 'neg']).sum()

2500000

In [23]:
# Attach the raw tweet text next to the string labels.
df_tweets['text'] = tweets['text']

In [24]:
# Run ktrain's BERT preprocessing on the frame. The label column is passed so
# the encoder knows how many classes there are; sequences are capped at 30 tokens.
bert_data = text.texts_from_df(
    df_tweets, 'text', 'label',
    maxlen=30, preprocess_mode='bert',
)
# The call yields the training pair, the held-out pair and the fitted preprocessor.
(x_train, y_train), (x_test, y_test), preproc = bert_data

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


In [25]:
# Sanity check: the two label arrays together should cover every tweet.
sum(len(labels) for labels in (y_train, y_test))

2500000

In [29]:
# Instantiate the BERT classifier, reusing the preprocessing computed earlier.
model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)

# Bundle the model with its training and validation data in a ktrain Learner.
learner = ktrain.get_learner(
    model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=128,
)

# Train for 3 epochs with the one-cycle policy at a maximum learning rate of 2e-5.
# This rate gives reasonable results; it was tuned with learner.lr_find()
# and learner.lr_plot().
learner.fit_onecycle(2e-5, 3)

Is Multi-Label? False
maxlen is 30
done.


begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7efe8672d630>

In [32]:
# Save the trained model together with its Preprocessor instance under
# 'bert_predictor2'. A later cell reloads this directory, so this cell must run.
saved_predictor = ktrain.get_predictor(learner.model, preproc)
saved_predictor.save('bert_predictor2')

#### Results after 3 epochs look good, but the model is already starting to overfit the training data. Running one more epoch below confirms this trend.

In [33]:
# Train for one additional epoch at the same maximum learning rate (2e-5)
# to check whether the overfitting trend continues.
learner.fit_onecycle(2e-5, 1) 



begin training using onecycle policy with max lr of 2e-05...


<tensorflow.python.keras.callbacks.History at 0x7efcd1767f28>

In [35]:
# Reload the predictor that was saved under 'bert_predictor2', take its model,
# and wrap that model with the in-memory preprocessor for inference.
loaded_predictor = ktrain.load_predictor('bert_predictor2')
model1 = loaded_predictor.model
predictor1 = ktrain.get_predictor(model1, preproc)

#### Predict on test data

In [36]:
# Read the raw test file, one list entry per line.
# The context manager closes the file handle once the lines are read.
# NOTE: this rebinds `tweets`, which held the training DataFrame until now.
with open("test_data.txt") as test_file:
    tweets = test_file.readlines()

# Collect the raw test lines in a DataFrame with a single 'text' column.
tweets_df = pd.DataFrame()
tweets_df['text'] = tweets

In [81]:
def strip_tweet(tweet):
    """Return the part of ``tweet`` that follows its first comma.

    A string without any comma is returned unchanged. Presumably the leading
    field is a row id in the test file -- verify against the data format.
    """
    # Splitting once on the first comma leaves either [prefix, remainder] or,
    # when there is no comma, [tweet]; the last piece is the wanted text.
    return tweet.split(',', 1)[-1]
# Rebuild the test frame from the lines with everything up to the first comma removed.
tweets_df = pd.DataFrame({'text': list(map(strip_tweet, tweets))})

In [82]:
# Classify all test tweets in one batched call. Passing the whole list to
# predict() avoids invoking the model once per row, as Series.apply would.
tweets_df['y_pred'] = predictor1.predict(tweets_df['text'].tolist())

In [83]:
# Translate the predicted class names to numeric labels ('pos' -> 1, 'neg' -> -1).
Y_pred = tweets_df['y_pred'].replace({'pos': 1, 'neg': -1})
# Shift the index so that ids start at 1 instead of 0.
Y_pred.index = Y_pred.index + 1

In [90]:
# Hand the 1-based ids and the 1 / -1 predictions to the project helper,
# with "output.csv" as the target name.
create_csv_submission(Y_pred.index, Y_pred.values,"output.csv")

In [91]:
#plot confusion matrix on validation set
predictor1.analyze_valid((x_test,y_test))

              precision    recall  f1-score   support

         neg       0.91      0.89      0.90    125278
         pos       0.89      0.91      0.90    124722

    accuracy                           0.90    250000
   macro avg       0.90      0.90      0.90    250000
weighted avg       0.90      0.90      0.90    250000



array([[111649,  13629],
       [ 11660, 113062]])