In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 2.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.1 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 43.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from tensorflow import keras
from tensorflow.keras.layers import Dense,Dropout, Input
from tqdm import tqdm
import pickle
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
import itertools
from sklearn.utils import shuffle
from tensorflow.keras import regularizers
import transformers
from transformers import BertTokenizer, TFBertModel

!nvidia-smi -L

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-a7efc9cb-2a34-82e0-1f57-93f4f69e67b5)


### Load data

Dataset: https://www.kaggle.com/datasets/cosmos98/twitter-and-reddit-sentimental-analysis-dataset?select=Reddit_Data.csv

In [3]:
%%bash
# Install the Kaggle CLI, register the API token (kaggle.json must sit in the
# working directory) and fetch + unpack Reddit_Data.csv.
# Abort on the first failing command instead of silently continuing.
set -euo pipefail
pip install -q kaggle
# -p: do not fail when the cell is re-run and the directory already exists.
mkdir -p ~/.kaggle
cp kaggle.json ~/.kaggle/
chmod 600 ~/.kaggle/kaggle.json
kaggle datasets download cosmos98/twitter-and-reddit-sentimental-analysis-dataset -f Reddit_Data.csv
# -o: overwrite without prompting so the cell is safe to re-run non-interactively.
unzip -o Reddit_Data.csv.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading Reddit_Data.csv.zip to /content

Archive:  Reddit_Data.csv.zip
  inflating: Reddit_Data.csv         


  0%|          | 0.00/2.45M [00:00<?, ?B/s]100%|██████████| 2.45M/2.45M [00:00<00:00, 166MB/s]


The `category` column encodes the sentiment of each comment:

- `0`: neutral comment
- `1`: positive comment
- `-1`: negative comment

In [4]:
def unicode_to_ascii(s):
    """Remove accents from ``s``.

    The string is NFD-decomposed so accented letters become a base character
    followed by combining marks; every character whose Unicode category is
    'Mn' (non-spacing mark) is then dropped. Other characters are kept as-is.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

# Stop-word sets keyed by language, filled on first use so the corpus is not
# re-read for every sentence.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language='english'):
    """Return the NLTK stop words for ``language`` as a cached frozenset."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def clean_stopwords_shortwords(w):
    """Drop English stop words and words of length <= 2 from ``w``.

    Args:
        w: Whitespace-separated text.

    Returns:
        The surviving words joined by single spaces ('' if none survive).
    """
    stop_words = _stopword_set('english')
    # Set membership is O(1); the original scanned a list for every word.
    return " ".join(
        word for word in w.split()
        if len(word) > 2 and word not in stop_words
    )

def preprocess_sentence(w):
    """Normalise one raw comment into lowercase, letters-only, stop-word-free text.

    Steps: lowercase and strip, remove accents, drop ``@mentions``, replace
    every run of non-letter characters with a space, then remove stop words
    and words of length <= 2.

    Args:
        w: The raw comment. Any non-string value is converted with ``str``;
            ``None`` and float NaN (how pandas represents a missing cell) are
            treated as an empty comment instead of becoming the word "nan".

    Returns:
        The cleaned comment as a single space-separated string.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if w is None or (isinstance(w, float) and w != w):
        return ""
    w = unicode_to_ascii(str(w).lower().strip())
    # Mentions must be removed while the '@' is still present; the original
    # did this last, after '@' had already been stripped, so it never matched.
    w = re.sub(r'@\w+', ' ', w)
    # One pass covers punctuation, quotes, digits and repeated whitespace.
    w = re.sub(r"[^a-zA-Z]+", " ", w)
    return clean_stopwords_shortwords(w)

In [5]:
# Load the Reddit comments CSV from the working directory and preview the first rows.
ds = pd.read_csv('./Reddit_Data.csv')
ds.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [6]:
# Column dtypes and non-null counts; a non-null count below the row count means missing comments.
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


In [7]:
# Number of comments per sentiment label (class balance).
ds['category'].value_counts()

 1    15830
 0    13142
-1     8277
Name: category, dtype: int64

In [8]:
# Clean every comment with preprocess_sentence; this overwrites the loaded text column.
ds['clean_comment'] = ds['clean_comment'].map(preprocess_sentence)

In [9]:
# Spot-check the cleaned comments.
ds.head(25)

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal wrappi...,0
4,benefit may want read living buddha living chr...,1
5,sit together watch simpsons episode lisa becom...,-1
6,teens discovered zen meditation undiagnosed bp...,1
7,jesus zen meets jew,0
8,two varieties christians dogmatic dwell words ...,-1
9,dont worry trying explain meditate regularly t...,1


### Build model

In [10]:
bert_name = 'bert-base-cased'

# Special tokens, max length and padding are per-call encoding options (they are
# passed to `encode_plus` where the text is encoded); given to `from_pretrained`
# they have no effect on encoding, so only real constructor options are passed here.
# NOTE(review): preprocess_sentence lowercases all text while this is a *cased*
# checkpoint - consider whether an uncased checkpoint is a better match.
tokenizer = BertTokenizer.from_pretrained(bert_name, do_lower_case=False)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
# Small demo of the encoder output: ids, segment ids and attention mask,
# padded/truncated to 9 tokens. `padding='max_length'` replaces the deprecated
# `pad_to_max_length=True` and produces the same result.
tokenizer.encode_plus(" Don't be lured",
                      add_special_tokens=True,
                      max_length=9,
                      padding='max_length',
                      return_attention_mask=True,
                      return_token_type_ids=True,
                      truncation=True)



{'input_ids': [101, 1790, 112, 189, 1129, 19615, 1181, 102, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0]}

In [12]:
def bert_encoder(comment, max_length=150):
    """Encode one comment into fixed-length BERT inputs.

    Args:
        comment: The text to encode; converted with ``str`` first.
        max_length: Sequence length to pad/truncate to (default 150, the
            length the rest of the notebook uses).

    Returns:
        A tuple ``(input_ids, token_type_ids, attention_mask)``, each a list
        of ``max_length`` ints.
    """
    txt = str(comment)
    encoded = tokenizer.encode_plus(
                      txt,
                      add_special_tokens=True,
                      max_length=max_length,
                      # Replaces the deprecated `pad_to_max_length=True`.
                      padding='max_length',
                      return_attention_mask=True,
                      return_token_type_ids=True,
                      truncation=True)
    return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']

In [13]:
# Print the first ten cleaned comments in full (the DataFrame preview truncates them).
first_comments = ds['clean_comment'].iloc[:10]
for comment in first_comments:
    print(comment)

family mormon never tried explain still stare puzzled time time like kind strange creature nonetheless come admire patience calmness equanimity acceptance compassion developed things buddhism teaches
buddhism much lot compatible christianity especially considering sin suffering almost thing suffering caused wanting things want going getting things wrong way christian would mean wanting things coincide god wanting things coincide without aid jesus buddhism could also seen proof god mighty omnipotence certainly christians lucky one christ side everyone else well many christians believe god grace salvation buddhism god way showing grace upon others would also help study things jesus said see buddha made similar claims rich man getting heaven joke basically advocating rid material possessions fact distinctly remembered jesus making someone cry someone asked achieve salvation jesus replied live like buddhist roughly translated also point buddha rarely spoke anything god theory personally kn

In [14]:
# Encode every cleaned comment; each entry is (input_ids, token_type_ids, attention_mask),
# so the stacked array has one row of three sequences per comment.
bert_train = np.array([bert_encoder(comment) for comment in ds['clean_comment']])

# Map the raw labels to class indices explicitly. The original passed -1 straight
# to `to_categorical`, which only worked because NumPy treats -1 as "last column";
# this mapping yields exactly the same one-hot columns without relying on that.
LABEL_TO_INDEX = {0: 0, 1: 1, -1: 2}  # neutral, positive, negative
bert_lbl = ds['category'].map(LABEL_TO_INDEX)
assert bert_lbl.notna().all(), "unexpected value in 'category' column"
bert_lbl = tf.keras.utils.to_categorical(bert_lbl, num_classes=3)

### Create splits

In [15]:
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42

# First cut: hold out 20% of all examples as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    bert_train, bert_lbl,
    test_size=0.2, random_state=RANDOM_STATE,
)

# Second cut: 25% of the remaining 80% becomes the validation set
# (60/20/20 train/val/test overall).
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train,
    test_size=0.25, random_state=RANDOM_STATE,
)

print(x_train.shape, y_train.shape)

(22349, 3, 150) (22349, 3)


In [16]:
def _unpack_encodings(encoded):
    """Split an (n, 3, seq_len) array into (input_ids, token_type_ids, attention_mask).

    Axis 1 follows the order returned by ``bert_encoder``. Indexing that axis
    directly always yields (n, seq_len) arrays; the original argument-less
    ``squeeze()`` would also have dropped the batch axis for a single-example split.
    """
    return encoded[:, 0], encoded[:, 1], encoded[:, 2]


train_reviews, train_segments, train_masks = _unpack_encodings(x_train)
val_reviews, val_segments, val_masks = _unpack_encodings(x_val)
test_reviews, test_segments, test_masks = _unpack_encodings(x_test)

In [17]:
def example_to_features(input_ids, attention_masks, token_type_ids, y):
    """Pack one example as ``(features, label)``.

    ``features`` is a dict keyed by the input names "input_ids",
    "attention_mask" and "token_type_ids"; ``y`` is passed through unchanged.
    """
    features = dict(
        input_ids=input_ids,
        attention_mask=attention_masks,
        token_type_ids=token_type_ids,
    )
    return features, y

In [18]:
BATCH_SIZE = 32


def _make_dataset(input_ids, attention_masks, token_type_ids, labels, shuffle=False):
    """Build a batched tf.data pipeline of ``(feature dict, label)`` pairs.

    Args:
        input_ids, attention_masks, token_type_ids: Per-example encodings.
        labels: One-hot labels aligned with the encodings.
        shuffle: Reshuffle the examples on every pass over the dataset.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` examples.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        (input_ids, attention_masks, token_type_ids, labels)).map(example_to_features)
    if shuffle:
        # A buffer covering the whole split gives a uniform shuffle; the
        # original buffer of 100 only mixed neighbouring examples.
        dataset = dataset.shuffle(len(labels))
    return dataset.batch(BATCH_SIZE)


train_ds = _make_dataset(train_reviews, train_masks, train_segments, y_train, shuffle=True)

# Validation and test data are not shuffled: order does not affect the metrics,
# and a fixed order keeps predictions aligned with y_val / y_test.
val_ds = _make_dataset(val_reviews, val_masks, val_segments, y_val)
test_ds = _make_dataset(test_reviews, test_masks, test_segments, y_test)

### Create BERT classification model

In [19]:
# Load the pretrained BERT encoder named by `bert_name` and show its parameter count.
bert = TFBertModel.from_pretrained(bert_name)
bert.summary()

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [20]:
max_seq_len = 150

# One int64 Keras Input per BERT feature, keyed by the feature name.
inp_dict = {
    feature_name: keras.layers.Input(shape=(max_seq_len,), dtype=tf.int64, name=feature_name)
    for feature_name in ("input_ids", "attention_mask", "token_type_ids")
}
inp_ids = inp_dict["input_ids"]
att_mask = inp_dict["attention_mask"]
seg_ids = inp_dict["token_type_ids"]

# Symbolic forward pass through BERT.
outputs = bert(inp_dict)

In [21]:
# Inspect the symbolic BERT outputs returned for the Keras inputs.
outputs

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<KerasTensor: shape=(None, 150, 768) dtype=float32 (created by layer 'tf_bert_model')>, pooler_output=<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'tf_bert_model')>, past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)

In [22]:
# Classification head on top of element 1 of the BERT outputs:
# dropout -> 300-unit ReLU layer -> dropout -> 3-way softmax.
head_layers = (
    keras.layers.Dropout(0.2),
    keras.layers.Dense(300, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(3, activation='softmax'),
)

x = outputs[1]
for head_layer in head_layers:
    x = head_layer(x)

model = keras.Model(inputs=inp_dict, outputs=x)

In [23]:
optimizer = keras.optimizers.Adam(learning_rate=2e-5)
# The model's last layer already applies softmax, so the loss receives
# probabilities, not logits. `from_logits=True` would apply softmax a second
# time and distort the loss and its gradients.
loss = keras.losses.CategoricalCrossentropy(from_logits=False)
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

In [24]:
# Layer-by-layer overview of the full classifier.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, 150)]        0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 150)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 150)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['attention_mask[0][0]',         
                                thPoolingAndCrossAt               'input_ids[0][0]',          

In [25]:
# Train for 3 epochs on train_ds, evaluating on val_ds after each epoch.
history = model.fit(train_ds, epochs=3, validation_data=val_ds)

Epoch 1/3


  return dispatch_target(*args, **kwargs)


Epoch 2/3
Epoch 3/3


In [26]:
# Held-out test performance; returns [loss, accuracy] per the compiled metrics.
model.evaluate(test_ds)



[0.35825440287590027, 0.8844295144081116]

In [27]:
# Save the trained weights only (not the architecture) to an HDF5 file.
model.save_weights('BERT_WEIGHTS_88_ACC.h5')