In [None]:
import os
BASE_PATH = '/content/drive/MyDrive/twitter_dashboard' 

In [None]:
!pip install -q transformers
!pip install -q hazm
!pip install -q cleantext

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

import hazm
from cleantext import clean

import plotly.express as px
import plotly.graph_objects as go

from tqdm.notebook import tqdm

import os
import re
import json
import copy
import collections

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Importing Dataset

In [None]:
# low_memory=False lets pandas infer every column's dtype from the whole file
# instead of chunk by chunk, which is what triggers the mixed-types DtypeWarning.
data = pd.read_csv(os.path.join(BASE_PATH, 'tweet.csv'), encoding='utf-8', low_memory=False)


Columns (0,5,6,11,12,13,43) have mixed types.Specify dtype option on import or set low_memory=False.



In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1196368 entries, 0 to 1196367
Data columns (total 44 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Unnamed: 0        1196368 non-null  object 
 1   id                1196355 non-null  float64
 2   username          257913 non-null   object 
 3   title             257719 non-null   object 
 4   about             176930 non-null   object 
 5   members           242382 non-null   object 
 6   posts_count       242381 non-null   object 
 7   language          938461 non-null   object 
 8   avg_view          242381 non-null   float64
 9   last_dump_date    242381 non-null   object 
 10  last_cold_date    242381 non-null   object 
 11  duration_update   242373 non-null   object 
 12  following_count   242369 non-null   object 
 13  verified          242381 non-null   object 
 14  favourites_count  242381 non-null   float64
 15  location          134416 non-null   object 
 16  

In [None]:
data = data[['text','sentiment','language']]

In [None]:
print('missing values stats')
print(data.isnull().sum(), '\n')

missing values stats
text         257927
sentiment    573199
language     257907
dtype: int64 



In [None]:
data.dropna(subset=['language','sentiment','text'],inplace=True)

In [None]:
data.reset_index(inplace=True)

In [None]:
data.drop(columns=['index'],inplace=True)

In [None]:
data

Unnamed: 0,text,sentiment,language
0,آن شب اسفندماه که برای نخستین بار خبر درگذشت #...,positive,fa
1,@daydreaminblack Niiiice😂😂😂😂😂,neutral,en
2,جدا از رفتن شجریان، چهر‌ه‌های غمبار همایون و ع...,negative,fa
3,دیگه تویی که تا دیروز عکس سردار دلها رو استوری...,negative,fa
4,لازم نیست راجع‌به هر مسئله‌ای نظر بدی و موضع‌گ...,dipole,fa
...,...,...,...
623164,هیچ وقت دوست نداشتم آخرین باری که میبینمت، آخر...,positive,fa
623165,@Mohi۲۰۰۳۲۰۰۳ منفجر شد باز https: //t. co/GEBk...,neutral,fa
623166,@DSakuee دکتر در قالب فیلم بگی و با اون همه ان...,negative,fa
623167,@duzakhiasabi من غلط کنم با شکر بخورم,negative,fa


Here we merge the `dipole` sentiment into `neutral`, which leaves three classes (positive, negative, neutral) and decreases the computational complexity of the task.

In [None]:
# Raw label -> training label; 'dipole' is folded into 'neutral'.
mapping = dict(
    positive='positive',
    negative='negative',
    dipole='neutral',
    neutral='neutral',
)
# Plain indexing is used on purpose: a label missing from `mapping` raises KeyError.
data['sentiment'] = data['sentiment'].apply(lambda raw_label: mapping[raw_label])

In [None]:
data

Unnamed: 0,text,sentiment,language
0,آن شب اسفندماه که برای نخستین بار خبر درگذشت #...,positive,fa
1,@daydreaminblack Niiiice😂😂😂😂😂,neutral,en
2,جدا از رفتن شجریان، چهر‌ه‌های غمبار همایون و ع...,negative,fa
3,دیگه تویی که تا دیروز عکس سردار دلها رو استوری...,negative,fa
4,لازم نیست راجع‌به هر مسئله‌ای نظر بدی و موضع‌گ...,neutral,fa
...,...,...,...
623164,هیچ وقت دوست نداشتم آخرین باری که میبینمت، آخر...,positive,fa
623165,@Mohi۲۰۰۳۲۰۰۳ منفجر شد باز https: //t. co/GEBk...,neutral,fa
623166,@DSakuee دکتر در قالب فیلم بگی و با اون همه ان...,negative,fa
623167,@duzakhiasabi من غلط کنم با شکر بخورم,negative,fa


In [None]:
fig = go.Figure()

fa_data = data[data['language'] == 'fa']
en_data = data[data['language'] == 'en']

fa_groupby_sentiment = fa_data.groupby('sentiment')['sentiment'].count()
en_groupby_sentiment = en_data.groupby('sentiment')['sentiment'].count()

fig.add_trace(go.Bar(
    name='farsi',
    x=list(sorted(fa_groupby_sentiment.index)),
    y=fa_groupby_sentiment.tolist(),
    text=fa_groupby_sentiment.tolist(),
    textposition='auto'
))
fig.add_trace(go.Bar(
    name='english',
    x=list(sorted(en_groupby_sentiment.index)),
    y=en_groupby_sentiment.tolist(),
    text=en_groupby_sentiment.tolist(),
    textposition='auto',
))

fig.update_layout(
    title_text='Distribution of sentiment within comments',
    xaxis_title_text='Sentiment',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

In [None]:
del data
data = fa_data.copy()

# Normalization / Preprocessing

In [None]:
# calculate the length of comments based on their words
data['text_len_by_words'] = data['text'].apply(lambda t: len(hazm.word_tokenize(t)))

In [None]:
min_max_len = data["text_len_by_words"].min(), data["text_len_by_words"].max()
print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')

Min: 1 	Max: 185


In [None]:
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=data['text_len_by_words']
))

fig.update_layout(
    title_text='Distribution of word counts within tweets',
    xaxis_title_text='Word Count',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

In [None]:
def data_gl_than(data, less_than=100.0, greater_than=0.0, col='text_len_by_words'):
    """Print the percentage of rows whose `col` value lies in (greater_than, less_than].

    Args:
        data: frame-like object; `data[col].values` must be iterable and sized.
        less_than: inclusive upper bound.
        greater_than: exclusive lower bound.
        col: name of the column holding the lengths.

    Returns:
        None; the result is only printed.
    """
    lengths = data[col].values

    # Lower bound is strict, upper bound is inclusive.
    in_range = [value for value in lengths if greater_than < value <= less_than]
    share = (len(in_range) / len(lengths)) * 100

    print(f'Tweets with word length of greater than {greater_than} and less than {less_than} includes {share:.2f}% of the whole!')

In [None]:
data_gl_than(data, 90, 3)

Tweets with word length of greater than 3 and less than 90 includes 96.85% of the whole!


In [None]:
minlim, maxlim = 3, 90

In [None]:
# Keep only tweets whose word count lies in (minlim, maxlim], i.e. more than
# `minlim` and at most `maxlim` words. Out-of-range lengths are replaced by None
# so that dropna() below discards those rows.
data['text_len_by_words'] = data['text_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else None)
data = data.dropna(subset=['text_len_by_words'])
# Renumber the rows 0..n-1 after the removal.
data = data.reset_index(drop=True)

## Cleaning Text
Cleaning is the final step in this section. Our cleaning method includes these steps:

- fixing unicodes
- removing specials like a phone number, email, url, new lines, ...
- cleaning HTMLs
- normalizing
- removing emojis

In [None]:
fa_stemmer = hazm.Stemmer()
fa_lemmatizer = hazm.Lemmatizer()
# Stored as a frozenset: the stop words are probed twice for every token during
# cleaning, and membership tests on a list would be a linear scan each time.
fa_stopwords = frozenset(hazm.stopwords_list())
# Persian letters; stems that consist of a single one of them are discarded.
fa_alphabet = set('ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی')

def cleanhtml(raw_html):
    """Return `raw_html` with every `<...>` span removed.

    The match is non-greedy and `.` does not match a newline, so a span is only
    removed when both angle brackets sit on the same line.
    """
    return re.sub(r'<.*?>', '', raw_html)

def num_words(words):
  """Return how many items `words` holds, duplicates included."""
  total = len(words)
  return total

def num_distinct_words(words):
  """Return the number of unique items in `words`."""
  unique_words = set(words)
  return len(unique_words)

def all_words(data):
  """Return the set of distinct words found in `data.text`.

  Each entry of `data.text` may be a raw string, which is split on whitespace,
  or an already tokenized iterable of words. Strings have to be split
  explicitly: iterating a string directly yields its characters, not its words.

  Args:
    data: object whose `text` attribute is an iterable of strings or token lists.

  Returns:
    set of words.
  """
  words = set()
  for text in data.text:
    tokens = text.split() if isinstance(text, str) else text
    words.update(tokens)
  return words

def fa_normalize(text):
  """Reduce `text` to Persian letters separated by single spaces.

  The substitutions run in the listed order:
    1. unify Arabic/Persian variants of yeh, kaf, waw, teh marbuta and alef;
    2. replace every character outside the Persian alphabet with a space
       (digits, Latin letters, punctuation, emoji, newlines, ...);
    3. collapse any run of one repeated character to a single occurrence
       (this also shortens legitimately doubled letters);
    4. squeeze runs of whitespace other than newline/tab into one space.

  The result is not stripped, so it may start or end with a space.
  """
  substitutions = (
      (r"[ئيی]", 'ی'),
      ("[ك]", 'ک'),
      ("[ؤ]", 'و'),
      ("[ة]", 'ه'),
      (r"[إأآا]", "ا"),
      (r"[^ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی]", " "),
      (r'(.)\1+', r'\1'),
      (r"[^\S\n\t]+", ' '),
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)
  return text

def stemLemmaStopWord(stemmer, lemmatizer, stopwords, alphabet, tokens):
  """Lemmatize and stem every token, dropping the uninformative ones.

  Args:
    stemmer: object exposing `stem(str) -> str`.
    lemmatizer: object exposing `lemmatize(str) -> str`.
    stopwords: container of words to discard; both the raw token and its stem
      are checked against it.
    alphabet: container of single letters; a stem equal to one is discarded.
    tokens: iterable of word strings.

  Returns:
    list of the surviving stems, in input order. Empty stems are never returned.
  """
  final_tokens = []
  for token in tokens:
    stemmed_token = stemmer.stem(lemmatizer.lemmatize(token))
    # Only the part before '#' is kept (presumably the lemmatizer's
    # 'past#present' verb notation -- TODO confirm against hazm).
    if '#' in stemmed_token:
      stemmed_token = stemmed_token.split('#')[0]
    # A stem that starts with '#' is empty after the split; skip it together
    # with empty input tokens so no blank entries reach the joined text.
    if not token or not stemmed_token:
      continue
    if token in stopwords or stemmed_token in stopwords or stemmed_token in alphabet:
      continue
    final_tokens.append(stemmed_token)
  return final_tokens

# Character ranges removed by the last step of cleaning(): emoji, pictographs,
# dingbats and a few invisible joiner / directional marks. The zero-width
# non-joiner (U+200C) is intentionally not part of this list.
_WIERD_RANGES = (
    u"\U0001F600-\U0001F64F",  # emoticons
    u"\U0001F300-\U0001F5FF",  # symbols & pictographs
    u"\U0001F680-\U0001F6FF",  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
    u"\U00002702-\U000027B0",
    u"\U000024C2-\U0001F251",
    u"\U0001f926-\U0001f937",
    u'\U00010000-\U0010ffff',
    u"\u200d",
    u"\u2640-\u2642",
    u"\u2600-\u2B55",
    u"\u23cf",
    u"\u23e9",
    u"\u231a",
    u"\u3030",
    u"\ufe0f",
    u"\u2069",
    u"\u2066",
    u"\u2068",
    u"\u2067",
)
_WIERD_PATTERN = re.compile("[" + "".join(_WIERD_RANGES) + "]+", flags=re.UNICODE)


def cleaning(text):
    """Turn one raw tweet into a space-separated string of Persian stems.

    Steps, in order:
      1. strip surrounding whitespace and run fa_normalize();
      2. cleanhtml();
      3. tokenize with hazm, then lemmatize/stem and drop stop words and
         single letters via stemLemmaStopWord();
      4. remove emoji and similar symbols.

    NOTE(review): fa_normalize() already replaces every character outside the
    Persian alphabet with a space, so by step 2 no angle bracket is left for
    cleanhtml() to match. Step 4 likewise only matters if the stemmer or
    lemmatizer reintroduces such characters -- confirm whether both steps are
    still wanted.
    """
    normalized = fa_normalize(text.strip())
    without_tags = cleanhtml(normalized)

    stems = stemLemmaStopWord(
        fa_stemmer, fa_lemmatizer, fa_stopwords, fa_alphabet,
        hazm.word_tokenize(without_tags),
    )

    return _WIERD_PATTERN.sub(r'', ' '.join(stems))

In [None]:
# Clean every tweet (normalization, tokenization, stemming, stop-word removal).
data['cleaned_text'] = data['text'].apply(cleaning)


# Word count of the cleaned text, using the same hazm tokenizer as for the raw text.
data['cleaned_text_len_by_words'] = data['cleaned_text'].apply(lambda t: len(hazm.word_tokenize(t)))

# Keep only tweets whose cleaned text still has more than `minlim` and at most
# `maxlim` words: out-of-range lengths become None and are dropped below.
data['cleaned_text_len_by_words'] = data['cleaned_text_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else None)
data = data.dropna(subset=['cleaned_text_len_by_words'])
data = data.reset_index(drop=True)

In [None]:
print('text:',data['text'][11500])
print('cleaned_text',data['cleaned_text'][11500])

text: فیوچر واسه چهلوهشتم* نمیخواد یه مداحی بده؟ 🕌
*چهلوهشتم اصطلاحی برای اهالی خراسان است که مصادف میشود هشت روز بعد از اربعین که به عبارتی چهلم گویند بنابراین چهلوهشتم مصادف است با ۲۸صفر. https: //t. co/QBJi۱CSrrI
cleaned_text فیوچر واسه چهلوه نمیخواد یه مداح چهلوه اصطلاح اهال خراس مصادف میشود روز اربعین عبار چهل چهلوه مصادف صفر


In [None]:
data.drop(columns=['language'],inplace=True)

## Handling unbalanced data

In [None]:
fig = go.Figure()
groupby_sentiment = data.groupby('sentiment')['sentiment'].count()
fig.add_trace(go.Bar(
    x=list(sorted(groupby_sentiment.index)),
    y=groupby_sentiment.tolist(),
    text=groupby_sentiment.tolist(),
    textposition='auto'
))
fig.update_layout(
    title_text='Distribution of sentiments within persian tweets',
    xaxis_title_text='Value',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

In [None]:
sentiments = data['sentiment'].unique().tolist()
print(f'We have #{len(sentiments)} values: {sentiments}')

We have #3 values: ['positive', 'negative', 'neutral']


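The classes are imbalanced, so we use upsampling to balance them: each smaller class is topped up, by sampling its own rows with replacement, until it is as large as the largest class.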

In [None]:
# One frame per class.
negative_data = data[data['sentiment'] == 'negative']
positive_data = data[data['sentiment'] == 'positive']
neutral_data = data[data['sentiment'] == 'neutral']

# Every class is grown to the size of the largest one.
cutting_point = max(len(negative_data), len(positive_data), len(neutral_data))

class_portions = [negative_data, positive_data, neutral_data]
samples_collection = list(class_portions)

# NOTE(review): the duplicates are created before the train/valid/test split
# performed later in the notebook, so copies of the same tweet can land in more
# than one split and inflate the validation/test scores. Consider splitting
# first and upsampling the training part only.
for data_portion in class_portions:
  if len(data_portion) < cutting_point:
    remaining_count = cutting_point - len(data_portion)
    # Seeded (same seed as the later train_test_split) so a re-run draws the same rows.
    sample = data_portion.sample(n=remaining_count, replace=True, random_state=1).reset_index(drop=True)
    samples_collection.append(sample)

new_data = pd.concat(samples_collection)
# Shuffle so the classes are interleaved; seeded for reproducibility.
new_data = new_data.sample(frac=1, random_state=1).reset_index(drop=True)

In [None]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332577 entries, 0 to 332576
Data columns (total 5 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   text                       332577 non-null  object 
 1   sentiment                  332577 non-null  object 
 2   text_len_by_words          332577 non-null  float64
 3   cleaned_text               332577 non-null  object 
 4   cleaned_text_len_by_words  332577 non-null  float64
dtypes: float64(2), object(3)
memory usage: 12.7+ MB


In [None]:
fig = go.Figure()
groupby_sentiment = new_data.groupby('sentiment')['sentiment'].count()
fig.add_trace(go.Bar(
    x=list(sorted(groupby_sentiment.index)),
    y=groupby_sentiment.tolist(),
    text=groupby_sentiment.tolist(),
    textposition='auto'
))
fig.update_layout(
    title_text='Distribution of sentiments within "upsampled" persian tweets',
    xaxis_title_text='Value',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

# Train,Validation and Test sets
To obtain a model that generalizes well, and given the size of the data, we split the cleaned dataset into train, validation and test sets. In this tutorial, I use a rate of 0.1 for both the validation and the test split (the validation split is taken from what remains after the test split). For splitting, I use `train_test_split` from the scikit-learn package, stratified on the label to preserve the class balance.

In [None]:
new_data

Unnamed: 0,text,sentiment,text_len_by_words,cleaned_text,cleaned_text_len_by_words
0,@rouzbeh_karimi نه بابا الانم دوسش دارم. شخصیت...,positive,26.0,بابا الان دوس شخص پوارو دوس اون پوارو ضعیف,9.0
1,یکی به گرداننده《 هیات》دولت بگه، به مردم ناتوان...,positive,56.0,گردانده دول بگه مرد ناتو ته دس روز ضعیف ماسک ...,33.0
2,@SoaaleSadeh بله طراحی ایشون برای آرامگاه فردو...,neutral,17.0,بله طراح ایشون ارامگاه فردوس طرح اندره گدار مید,9.0
3,@ManotoNews شهید کیر خر شدن سر دول قاسم سلیمان...,positive,27.0,شهید کیر خر سر دول قاس سل خانواده محتر تلویزیو...,14.0
4,@hatakarimi ان‌شاالله از شما در آثار بعد رونما...,neutral,12.0,شاله اثار رونما میکن,4.0
...,...,...,...,...,...
332572,خدا رو شکر این کاره افتاد همین امروز شرش کنده ...,neutral,15.0,خدا شکر کاره افتاد امروز شر کنده شه فردا نگاد,10.0
332573,@pegah_۹۷p این محبوبت بو داره، برو محبوبی درست...,neutral,14.0,محبوب بو داره برو محبوب درس انتخاب کن,8.0
332574,دلار ۳۱هزار تومان. \n\nبرای قبیله ج. ا. و کسان...,negative,69.0,دلار هزار قبیله سفره ان کباب بره خال عد معنا ف...,25.0
332575,امام جمعه اصفهان می‌خواد اسیدپاشی راه بندازه! ...,negative,54.0,جمعه اصفه خواد اسیدپا بندازه وضع رژ انفجار خوا...,24.0


In [None]:
labels = list(sorted(data['sentiment'].unique()))
new_data['sentiment_id'] = new_data['sentiment'].apply(lambda t: labels.index(t))

In [None]:
new_data.drop(columns=['text_len_by_words','cleaned_text_len_by_words','text'],inplace=True)

In [None]:
# Two stratified splits with a fixed seed: 10% of all rows go to `test`, then 10%
# of the remaining 90% go to `valid`, i.e. roughly 81% train / 9% valid / 10% test.
train, test = train_test_split(new_data, test_size=0.1, random_state=1, stratify=new_data['sentiment'])
train, valid = train_test_split(train, test_size=0.1, random_state=1, stratify=train['sentiment'])

# Renumber each split from 0.
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

# Model inputs are the cleaned texts, targets the integer sentiment ids.
x_train, y_train = train['cleaned_text'].values, train['sentiment_id'].values
x_valid, y_valid = valid['cleaned_text'].values, valid['sentiment_id'].values
x_test, y_test = test['cleaned_text'].values, test['sentiment_id'].values

# Finetuning `PARSBERT` model on data using tensorflow

![BERT INPUTS](https://res.cloudinary.com/m3hrdadfi/image/upload/v1595158991/kaggle/bert_inputs_w8rith.png)

As you may know, the BERT model input is a combination of 3 embeddings.
- Token embeddings: WordPiece token vocabulary (WordPiece is another word segmentation algorithm, similar to BPE)
- Segment embeddings: for sentence pairs [A-B], each token is marked with $E_A$ or $E_B$ to indicate whether it belongs to the first sentence or the second one.
- Position embeddings: specify the position of words in a sentence

In [None]:
from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features

import tensorflow as tf

In [None]:
# general config
MAX_LEN = 64
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
TEST_BATCH_SIZE = 64


MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased-sentiment-digikala'
OUTPUT_PATH = '/content/bert-fa-base-uncased-sentiment-twitter/tf_model.h5'
SAVE_PATH = '/content/bert-fa-base-uncased-sentiment-twitter/best_model.h5'

os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
label2id = {label: i for i, label in enumerate(labels)}
id2label = {v: k for k, v in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'negative': 0, 'neutral': 1, 'positive': 2}
id2label: {0: 'negative', 1: 'neutral', 2: 'positive'}


In [None]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH, **{
        'label2id': label2id,
        'id2label': id2label,
    })

print(config.to_json_string())

{
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "digikala",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "neutral",
    "2": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "neutral": 1,
    "positive": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.2.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}



### Input Embeddings / Dataset

In [None]:
class InputExample:
    """One text (or text pair) with an optional label, for sequence classification.

    Attributes:
        guid: identifier of the example.
        text_a: the first (or only) text.
        text_b: the second text of a pair, or None.
        label: the target of the example, or None.
    """

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the four fields on the instance unchanged."""
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label


def make_examples(tokenizer, x, y=None, maxlen=128, output_mode="classification", is_tf_dataset=True, label_list=None):
    """Tokenize texts into BERT input features, optionally as a tf.data.Dataset.

    Args:
        tokenizer: tokenizer handed to `glue_convert_examples_to_features`.
        x: sequence of inputs; each item is a string or a pair `(text_a, text_b)`.
        y: labels aligned with `x`. Only a `list` or `np.ndarray` is recognised;
            any other value (including the default None) means "no labels".
        maxlen: sequence length passed to the feature converter.
        output_mode: "classification" (labels cast to int) or "regression"
            (labels cast to float, so fractional targets are not truncated).
        is_tf_dataset: return a `tf.data.Dataset` when True, numpy arrays otherwise.
        label_list: label values passed to the feature converter. Defaults to the
            sorted distinct labels found in `y`; pass the full list explicitly
            when a split may lack a class, otherwise the converter is given a
            shorter list for that split.

    Returns:
        `(dataset, features)` when `is_tf_dataset` is True. The dataset yields
        `(inputs, label)` pairs, or only `inputs` when no labels were given.
        `([xdata, ydata], features)` otherwise, where `xdata` is
        `[input_ids, attention_mask, token_type_ids]` as numpy arrays and `ydata`
        is the list of labels, or None when no labels were given.
    """
    has_labels = isinstance(y, (list, np.ndarray))
    targets = y if has_labels else [None] * len(x)
    to_label = float if output_mode == "regression" else int

    examples = []
    for i, (_x, _y) in tqdm(enumerate(zip(x, targets)), position=0, total=len(x)):
        guid = "%s" % i
        # Unlabelled inputs keep label=None instead of failing on int(None).
        label = None if _y is None else to_label(_y)

        if isinstance(_x, str):
            text_a, text_b = _x, None
        else:
            assert len(_x) == 2
            text_a, text_b = _x[0], _x[1]

        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

    if label_list is None:
        # Same values and order as np.unique(y); empty when there are no labels.
        label_list = sorted({example.label for example in examples if example.label is not None})

    features = glue_convert_examples_to_features(
        examples,
        tokenizer,
        maxlen,
        output_mode=output_mode,
        label_list=list(label_list))

    all_input_ids = []
    all_attention_masks = []
    all_token_type_ids = []
    all_labels = []

    for f in tqdm(features, position=0, total=len(examples)):
        if is_tf_dataset:
            all_input_ids.append(tf.constant(f.input_ids))
            all_attention_masks.append(tf.constant(f.attention_mask))
            all_token_type_ids.append(tf.constant(f.token_type_ids))
            if has_labels:
                all_labels.append(tf.constant(f.label))
        else:
            all_input_ids.append(f.input_ids)
            all_attention_masks.append(f.attention_mask)
            all_token_type_ids.append(f.token_type_ids)
            if has_labels:
                all_labels.append(f.label)

    if is_tf_dataset:
        inputs = {
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'token_type_ids': all_token_type_ids
        }
        # Without labels the dataset yields the inputs only (e.g. for predict()).
        tensors = (inputs, all_labels) if has_labels else inputs
        dataset = tf.data.Dataset.from_tensor_slices(tensors)

        return dataset, features

    xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
    ydata = all_labels if has_labels else None

    return [xdata, ydata], features

In [None]:
train_dataset_base, train_examples = make_examples(tokenizer, x_train, y_train, maxlen=MAX_LEN)
valid_dataset_base, valid_examples = make_examples(tokenizer, x_valid, y_valid, maxlen=MAX_LEN)

test_dataset_base, test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN)
[xtest, ytest], test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN, is_tf_dataset=False)

HBox(children=(FloatProgress(value=0.0, max=269387.0), HTML(value='')))





This function will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/master/examples/text-classification/run_glue.py



HBox(children=(FloatProgress(value=0.0, max=269387.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=29932.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=29932.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33258.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33258.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33258.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=33258.0), HTML(value='')))




In [None]:
for value in train_dataset_base.take(1):
    print(f'     input_ids: {value[0]["input_ids"]}')
    print(f'attention_mask: {value[0]["attention_mask"]}')
    print(f'token_type_ids: {value[0]["token_type_ids"]}')
    print(f'        target: {value[1]}')

     input_ids: [    2 62472  2998 75610  2005  3006 25913  2827  3570  5796  2005 31309
  4361 11112  6038     4     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
attention_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
token_type_ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
        target: 1


In [None]:
def get_training_dataset(dataset, batch_size, shuffle_buffer=2048):
    """Build the endless, shuffled, batched input pipeline used for training.

    The dataset is shuffled before it is repeated, so every element of one pass
    is emitted before the next pass starts; repeating first would let elements
    of neighbouring passes mix inside the shuffle buffer.

    Args:
        dataset: a `tf.data.Dataset` (anything exposing shuffle/repeat/batch).
        batch_size: number of examples per batch.
        shuffle_buffer: size of the shuffle buffer.

    Returns:
        The transformed dataset. It never ends, so `fit` needs `steps_per_epoch`.
    """
    dataset = dataset.shuffle(shuffle_buffer)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)

    return dataset

def get_validation_dataset(dataset, batch_size):
    """Return `dataset` grouped into batches of `batch_size`.

    Nothing else is applied: the data is neither shuffled nor repeated.
    """
    return dataset.batch(batch_size)

In [None]:
train_dataset = get_training_dataset(train_dataset_base, TRAIN_BATCH_SIZE)
# Validation uses the batch-only pipeline: shuffling and repeating it would make
# each evaluation run over a different, partly duplicated subset of the examples.
valid_dataset = get_validation_dataset(valid_dataset_base, VALID_BATCH_SIZE)

# Floor division: a final partial batch is not counted in the step totals.
train_steps = len(train_examples) // TRAIN_BATCH_SIZE
valid_steps = len(valid_examples) // VALID_BATCH_SIZE

print('training steps:', train_steps)
print('validation steps:', valid_steps)

training steps: 4209
validation steps: 467


## Model

In [None]:
def build_model(model_name, config, learning_rate=3e-5):
    """Load a TF BERT sequence classifier and compile it for training.

    Args:
        model_name: model identifier or path given to `from_pretrained`.
        config: model configuration given to `from_pretrained`.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        The compiled model. The loss is sparse categorical cross-entropy on
        logits and the single tracked metric is reported as 'accuracy'.
    """
    model = TFBertForSequenceClassification.from_pretrained(model_name, config=config)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )

    return model

In [None]:
EPOCHS = 3
EEVERY_EPOCH = 1000
LEARNING_RATE = 3e-5
CLIP = 0.0
# Keras reports validation metrics under the 'val_' prefix (the training cell
# reads history['val_accuracy']); 'valid_loss' is never logged, so early
# stopping could not trigger with it.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
# The "best" checkpoint is chosen on validation loss rather than training loss,
# which would simply favour the latest epoch. save_freq='epoch' replaces the
# deprecated period=1.
checkpoint = tf.keras.callbacks.ModelCheckpoint(SAVE_PATH, monitor='val_loss', verbose=1, save_best_only=True, mode='auto', save_freq='epoch')



In [None]:
model = build_model(MODEL_NAME_OR_PATH, config, learning_rate=LEARNING_RATE)

Some layers from the model checkpoint at HooshvareLab/bert-fa-base-uncased-sentiment-digikala were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased-sentiment-digikala.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


## Training Model

In [None]:
%%time

r = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    callbacks=[early_stopping,checkpoint],
    verbose=1)

final_accuracy = r.history['val_accuracy']
print('FINAL ACCURACY MEAN: ', np.mean(final_accuracy))

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 1/3


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.




The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.



Epoch 00001: loss improved from inf to 0.65514, saving model to /content/bert-fa-base-uncased-sentiment-twitter/best_model.h5
Epoch 2/3

Epoch 00002: loss improved from 0.65514 to 0.44118, saving model to /content/bert-fa-base-uncased-sentiment-twitter/best_model.h5
Epoch 3/3

Epoch 00003: loss improved from 0.44118 to 0.30962, saving model to /content/bert-fa-base-uncased-sentiment-twitter/best_model.h5
FINAL ACCURACY MEAN:  0.8162361780802408
CPU times: user 38min 13s, sys: 21min 5s, total: 59min 18s
Wall time: 2h 53min 37s


In [None]:
model.save_pretrained(os.path.dirname(OUTPUT_PATH))

In [None]:
ev = model.evaluate(test_dataset_base.batch(TEST_BATCH_SIZE))
print()
print(f'Evaluation: {ev}')
print()

predictions = model.predict(xtest)
ypred = predictions[0].argmax(axis=-1).tolist()

print()
print(classification_report(ytest, ypred, target_names=labels))
print()

print(f'F1: {f1_score(ytest, ypred, average="weighted")}')


Evaluation: [0.5140472650527954, 0.835438072681427]



The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.



              precision    recall  f1-score   support

    negative       0.88      0.91      0.90     11086
     neutral       0.75      0.84      0.79     11086
    positive       0.89      0.75      0.82     11086

    accuracy                           0.84     33258
   macro avg       0.84      0.84      0.84     33258
weighted avg       0.84      0.84      0.84     33258


F1: 0.8355315523517394
