In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
from nltk.corpus import stopwords
import re 

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
def clean_stopwords_shortwords(w):
    stopwords_list=stopwords.words('english')
    words = w.split() 
    clean_words = [word for word in words if (word not in stopwords_list) and len(word) > 2]
    return " ".join(clean_words) 

In [5]:

def preprocess_sentence(w):
    w = w.lower()
    w = re.sub(r"([?.!,¿])", r" ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w=clean_stopwords_shortwords(w)
    w=re.sub(r'@\w+', '',w)
    return w

In [6]:
df=pd.read_csv('Train.csv')
print(df.shape)
df.head()

(40000, 2)


Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [9]:
df['review']=df['review'].apply(preprocess_sentence)

In [10]:
df.head()

Unnamed: 0,review,label
0,mature intelligent highly charged melodrama un...,pos
1,http video google com videoplay docid distribu...,pos
2,title opera director dario argento cast cristi...,pos
3,think lot people wrote another one tom cruise ...,pos
4,story two dogs cat looking way back home old w...,pos


In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
le=LabelEncoder()
df['label']=le.fit_transform(df['label'])

In [13]:
df.head()

Unnamed: 0,review,label
0,mature intelligent highly charged melodrama un...,1
1,http video google com videoplay docid distribu...,1
2,title opera director dario argento cast cristi...,1
3,think lot people wrote another one tom cruise ...,1
4,story two dogs cat looking way back home old w...,1


In [19]:
num_classes=np.unique(df['label'],return_counts=True)
num_classes

(array([0, 1]), array([19989, 20011]))

In [15]:
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b0/9e/5b80becd952d5f7250eaf8fc64b957077b12ccfe73e9c03d37146ab29712/transformers-4.6.0-py3-none-any.whl (2.3MB)
[K     |▏                               | 10kB 24.1MB/s eta 0:00:01[K     |▎                               | 20kB 30.3MB/s eta 0:00:01[K     |▍                               | 30kB 34.3MB/s eta 0:00:01[K     |▋                               | 40kB 29.7MB/s eta 0:00:01[K     |▊                               | 51kB 30.6MB/s eta 0:00:01[K     |▉                               | 61kB 29.0MB/s eta 0:00:01[K     |█                               | 71kB 27.0MB/s eta 0:00:01[K     |█▏                              | 81kB 28.5MB/s eta 0:00:01[K     |█▎                              | 92kB 27.0MB/s eta 0:00:01[K     |█▍                              | 102kB 27.3MB/s eta 0:00:01[K     |█▌                              | 112kB 27.3MB/s eta 0:00:01[K     |█▊                              | 

In [17]:
from transformers import BertTokenizer,TFBertModel,BertConfig

In [21]:
from transformers import TFBertForSequenceClassification

In [22]:
token=BertTokenizer.from_pretrained('bert-base-uncased')
model=TFBertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
sentences=df['review']
labels=df['label']
len(sentences),len(labels)

(40000, 40000)

In [27]:
input_ids=[]
att_mask=[]

for sent in sentences:
  inps=token.encode_plus(sent,add_special_tokens=True,max_length =64,pad_to_max_length = True,return_attention_mask = True)
  input_ids.append(inps)
  att_mask.append(inps['attention_mask'])

input_ids=np.array(input_ids)
att_mask=np.array(att_mask)
label=np.array(df['label'])




In [29]:
len(input_ids),len(att_mask),len(labels)

(40000, 40000, 40000)

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
train_inp,val_inp,train_label,val_label,train_mask,val_mask=train_test_split(input_ids,labels,att_mask,test_size=0.2)

In [32]:
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=2),
    tf.keras.callbacks.ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5'),
    tf.keras.callbacks.TensorBoard(log_dir='./logs'),
]

In [33]:
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08)

In [34]:
model.compile(loss=loss,optimizer=optimizer,metrics=[metric])

In [35]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [43]:
history=model.fit([train_inp,train_mask],train_label,batch_size=32,epochs=4,validation_data=([val_inp,val_mask],val_label),callbacks=my_callbacks)

Epoch 1/4


TypeError: ignored

In [39]:
from sklearn.metrics import classification_report,f1_score

In [41]:
preds = model.predict([val_inp,val_mask],batch_size=32)
pred_labels = preds.argmax(axis=1)
f1=f1_score(val_label,pred_labels)
print('F1 score',f1)
print('Classification Report')
print(classification_report(val_label,pred_labels,target_names=target_names))



TypeError: ignored