## Read data and setup model

In [1]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from sklearn.metrics import classification_report

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pandas as pd
import tensorflow as tf

In [3]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


_____________________________

In [4]:
URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz", 
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

In [5]:
# The shutil module offers a number of high-level 
# operations on files and collections of files.
import os
import shutil
# Create main directory path ("/aclImdb")
main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
# Create sub directory path ("/aclImdb/train")
train_dir = os.path.join(main_dir, 'train')
# Remove unsup folder since this is a supervised learning task
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)
# View the final train folder
print(os.listdir(train_dir))

['labeledBow.feat', 'neg', 'pos', 'unsupBow.feat', 'urls_unsup.txt', 'urls_pos.txt', 'urls_neg.txt']


In [6]:
# We create a training dataset and a validation 
# dataset from our "aclImdb/train" directory with a 80/20 split.
train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='training', seed=123)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='validation', seed=123)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [7]:
for i in train.take(1):
  train_feat = i[0].numpy()
  train_lab = i[1].numpy()

train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")
train.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,Canadian director Vincenzo Natali took the art...,1
1,I gave this film 10 not because it is a superb...,1
2,I admit to being somewhat jaded about the movi...,1
3,"For a long time, 'The Menagerie' was my favori...",1
4,A truly frightening film. Feels as if it were ...,0


In [8]:
for j in test.take(1):
  test_feat = j[0].numpy()
  test_lab = j[1].numpy()

test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")
test.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,I can't believe that so much talent can be was...,0
1,This movie blows - let's get that straight rig...,0
2,"The saddest thing about this ""tribute"" is that...",0
3,I'm only rating this film as a 3 out of pity b...,0
4,Something surprised me about this movie - it w...,1


### Creating Input Sequences

In [9]:
InputExample(guid=None,
             text_a = "Hello, world",
             text_b = None,
             label = 1)

InputExample(guid=None, text_a='Hello, world', text_b=None, label=1)

In [10]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
  train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)

  validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  
  return train_InputExamples, validation_InputExamples

#   train_InputExamples, validation_InputExamples = convert_data_to_examples(train, 
#                                                                            test, 
#                                                                            'DATA_COLUMN', 
#                                                                            'LABEL_COLUMN')
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [11]:
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)



In [12]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f6bad124150>

In [13]:
pred_sentences = ['This was an awesome movie. I watch it twice my time watching this beautiful movie if I have known it was this good',
                  'One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie']

In [14]:
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = [1,0]
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
  print(pred_sentences[i], ": \n", labels[label[i]])

This was an awesome movie. I watch it twice my time watching this beautiful movie if I have known it was this good : 
 0
One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie : 
 1


In [20]:
model.save_pretrained("model_trained_on_imdb_only_colab")

Load our data and fine tune

In [17]:
def make_predictions(model, tokenizer, pred_sentences, batch_size = 1000):
  predictions = []

  for ii, sent_batch in enumerate(batch(pred_sentences, batch_size)):
    print('Batch', ii)
    tf_batch = tokenizer(sent_batch, max_length=128, padding=True, truncation=True, return_tensors='tf')
    tf_outputs = model(tf_batch)
    tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
    labels = [0, 1]
    label = tf.argmax(tf_predictions, axis=1)
    label = label.numpy()

    predictions.extend([labels[label[i]] for i in range(len(sent_batch))])
  # for i in range(len(pred_sentences)):
  #   print(pred_sentences[i], ": \n", labels[label[i]])
  print(len(predictions), 'predictions made')
  return predictions

In [16]:
from sklearn.model_selection import train_test_split
import re
import numpy as np


def batch(iterable, n = 1):
	current_batch = []
	for item in iterable:
	    current_batch.append(item)
	    if len(current_batch) == n:
	        yield current_batch
	        current_batch = []
	if current_batch:
	    yield current_batch

def deEmojify(text):
  regrex_pattern = re.compile(pattern = "["
      u"\U0001F600-\U0001F64F"  # emoticons
      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
      u"\U0001F680-\U0001F6FF"  # transport & map symbols
      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
      u"\U00002500-\U00002BEF"  # chinese char
      u"\U00002702-\U000027B0"
      u"\U00002702-\U000027B0"
      u"\U000024C2-\U0001F251"
      u"\U0001f926-\U0001f937"
      u"\U00010000-\U0010ffff"
      u"\u2640-\u2642" 
      u"\u2600-\u2B55"
      u"\u200d"
      u"\u23cf"
      u"\u23e9"
      u"\u231a"
      u"\ufe0f"  # dingbats
      u"\u3030"
                          "]+", flags = re.UNICODE)
  return regrex_pattern.sub(r'',text)
    

training_data = pd.read_csv('Training_Dataset.csv')
training_data['inverted_label'] = np.where(training_data.label.values==0,1,0)
training_data['text_no_emoji'] = [deEmojify(x) for x in training_data.text.values]
training_data['char_length'] = training_data.text_no_emoji.str.len()

# Split train and validation data
train_df, validation_df = train_test_split(training_data, test_size=0.25)


competition_test = pd.read_csv('Test_Dataset.csv')
competition_test['text_no_emoji'] = [deEmojify(x) for x in competition_test.text.values]
# competition_test['text'] = competition_test['text'].str.decode("utf-8")
training_data.head(42)

Unnamed: 0,ID,text,label,inverted_label,text_no_emoji,char_length
0,18822638,"I should have known better, its obviously not ...",0,1,"I should have known better, its obviously not ...",94
1,9f9f1c3f,"I am happy for you. No, seriously, I am.",0,1,"I am happy for you. No, seriously, I am.",40
2,563242ec,Everyone including an ambassador has freedom o...,0,1,Everyone including an ambassador has freedom o...,73
3,ceda51ca,Oh really?,0,1,Oh really?,10
4,bfb0c83d,"Thanks for reply, I'll enjoy reading through t...",0,1,"Thanks for reply, I'll enjoy reading through t...",94
5,3f49cfe8,Threatening to jail disadvantaged families is ...,0,1,Threatening to jail disadvantaged families is ...,77
6,ca196d3e,Is her fake chin totally to the right of where...,0,1,Is her fake chin totally to the right of where...,65
7,f766effa,My man!,0,1,My man!,7
8,2688d61b,Is that a little poot at :09?,0,1,Is that a little poot at :09?,29
9,a2f2bb80,I believe you copied and paste from somewhere ...,0,1,I believe you copied and paste from somewhere ...,150


In [16]:
# print("Total training data: ", training_data.shape)
# print("Train Split: ", train_df.shape)
# print("Validation Split: ", validation_df.shape)
# print("Competition Test Data: ", competition_test.shape)

Total training data:  (24309, 5)
Train Split:  (18231, 5)
Validation Split:  (6078, 5)
Competition Test Data:  (10419, 3)


In [None]:
# training_data['inverted_predicted_label'] = make_predictions(model, tokenizer, list(training_data['text_no_emoji'].values), batch_size = 500)
# training_data['inverted_label'] = np.where(training_data.label.values==0,1,0)

In [19]:
# print(classification_report(training_data['label'], training_data['predicted_label']))

              precision    recall  f1-score   support

           0       0.90      0.63      0.74     19190
           1       0.34      0.73      0.46      5119

    accuracy                           0.65     24309
   macro avg       0.62      0.68      0.60     24309
weighted avg       0.78      0.65      0.68     24309



In [None]:
# competition_test['predicted_label'] = make_predictions(model, tokenizer, list(competition_test['text_no_emoji'].values))
# competition_test[['ID', 'label']].to_csv('ber_based_submission_1_imdb_only_trained.csv')

_____________________________

Train additionally on our data

In [18]:
DATA_COLUMN = 'text'
LABEL_COLUMN = 'inverted_label'

In [19]:
train_InputExamples, validation_InputExamples = convert_data_to_examples(train_df, validation_df, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

model.fit(train_data, epochs=1, validation_data=validation_data)





<tensorflow.python.keras.callbacks.History at 0x7f6bad145a10>

In [20]:
model.save_pretrained("ep1_model_trained_on_imdb_plus_our_data_colab_corrected")

In [21]:
validation_df['inverted_predicted_label'] = make_predictions(model, tokenizer, list(validation_df['text_no_emoji'].values), batch_size = 800)
validation_df['predicted_label'] = np.where(validation_df.inverted_predicted_label.values==0,1,0)
print(classification_report(validation_df['label'], validation_df['predicted_label']))

Batch 0
Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Batch 6
Batch 7
6078 predictions made
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      4765
           1       0.71      0.60      0.65      1313

    accuracy                           0.86      6078
   macro avg       0.80      0.77      0.78      6078
weighted avg       0.86      0.86      0.86      6078



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [22]:
validation_df[validation_df.label!=validation_df.predicted_label].head(20)

Unnamed: 0,ID,text,label,inverted_label,text_no_emoji,char_length,inverted_predicted_label,predicted_label
19238,69f38458,He is warped by porn. He's got a problem,1,0,He is warped by porn. He's got a problem,40,1,0
4567,ae71c41f,I didn’t like that blue & yellow until you sai...,1,0,I didn’t like that blue & yellow until you sai...,61,1,0
15784,8f118d21,"Too bad he endorses seiko, this wouldve been a...",1,0,"Too bad he endorses seiko, this wouldve been a...",55,1,0
4158,de685e76,Exactly! How dare we give those people shade!,0,1,Exactly! How dare we give those people shade!,45,0,1
1282,01ddbbf4,Once they saw the purple hair dye go into the ...,0,1,Once they saw the purple hair dye go into the ...,81,0,1
2825,d8f5805b,Which is fair. But they're not respecting you.,1,0,Which is fair. But they're not respecting you.,46,1,0
10875,39bb5f7e,Terrifyingly accurate bot,0,1,Terrifyingly accurate bot,25,0,1
7479,dbc6dcf2,Google image him. Hes NEVER had a good haircut.,1,0,Google image him. Hes NEVER had a good haircut.,47,1,0
24150,a9fd557a,Believe me when i say not having a job doesn't...,1,0,Believe me when i say not having a job doesn't...,135,1,0
20208,37a0b2a1,"Whatever it is, not enough.",0,1,"Whatever it is, not enough.",27,0,1


In [23]:
competition_test['inverted_label'] = make_predictions(model, tokenizer, list(competition_test['text_no_emoji'].values))
competition_test['label'] = np.where(competition_test.inverted_label.values==0,1,0)

Batch 0
Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Batch 6
Batch 7
Batch 8
Batch 9
Batch 10
10419 predictions made


In [24]:
competition_test[['ID', 'label']].to_csv('bert_based_imdb_plus_our_data_colab_correctly_trained_1epoch.csv',index=False)

In [25]:
model.fit(train_data, epochs=1, validation_data=validation_data)
model.save_pretrained("ep2_model_trained_on_imdb_plus_our_data_colab_corrected")



In [26]:
validation_df['inverted_predicted_label'] = make_predictions(model, tokenizer, list(validation_df['text_no_emoji'].values), batch_size = 800)
validation_df['predicted_label'] = np.where(validation_df.inverted_predicted_label.values==0,1,0)
print(classification_report(validation_df['label'], validation_df['predicted_label']))

Batch 0
Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Batch 6
Batch 7
6078 predictions made
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      4765
           1       0.66      0.60      0.63      1313

    accuracy                           0.85      6078
   macro avg       0.78      0.76      0.77      6078
weighted avg       0.84      0.85      0.84      6078



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [28]:
competition_test['inverted_label'] = make_predictions(model, tokenizer, list(competition_test['text_no_emoji'].values))
competition_test['label'] = np.where(competition_test.inverted_label.values==0,1,0)

Batch 0
Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Batch 6
Batch 7
Batch 8
Batch 9
Batch 10
10419 predictions made


In [29]:
competition_test[['ID', 'label']].to_csv('3_bert_based_imdb_plus_our_data_colab_correctly_trained_2epoch.csv',index=False)

In [35]:
def is_camel_case(s):
  if s != s.lower() and s != s.upper() and "_" not in s and sum(i.isupper() for i in s[1:-1]) == 1:
      return True
  return False

[False, True]