In [2]:
# Cell 1

# Install required packages

!pip install -q transformers
!pip install -q fastapi
!pip install -q kaleido
!pip install -q python-multipart
!pip install -q uvicorn
!pip install -q hazm

!pip install -q clean-text[gpl]

In [3]:
# Cell 2

# Import required packages

import numpy as np
def dummy_npwarn_decorator_factory():
  def npwarn_decorator(x):
    return x
  return npwarn_decorator
np._no_nep50_warning = getattr(np, '_no_nep50_warning', dummy_npwarn_decorator_factory)

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

import hazm
from cleantext import clean

import plotly.express as px
import plotly.graph_objects as go

from tqdm.notebook import tqdm

import os
import re
import json
import copy
import collections

In [4]:
# Cell 3
# mount google drive

from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [5]:
# Cell 4
# Read data file

tasnim = pd.read_csv("/content/drive/MyDrive/NLP4/tasnim.csv")
print(tasnim.shape)
print(tasnim.columns)
print(tasnim.head())

(63494, 5)
Index(['category', 'title', 'abstract', 'body', 'time'], dtype='object')
  category                                              title  \
0    سیاسی      میرکاظمی: زمان حذف ارز ۴۲۰۰ تومانی مشخص نیست    
1    سیاسی   طرح ۲ فوریتی شفافیت قوای سه‌گانه با ۲۰۰ امضا ...   
2    سیاسی   رئیسی انتخاب نخست وزیر جدید پاکستان را تبریک ...   
3    سیاسی   اصلاح اساسنامه شرکت شهر فرودگاهی امام خمینی (...   
4    سیاسی   ارائه طرحی کلی و مبهم برای شفافیت فرار از مطا...   

                                            abstract  \
0   رئیس سازمان برنامه و بودجه گفت: هر زمان شرایط...   
1   نماینده نیشابور در مجلس از ارائه طرح ۲ فوریتی...   
2   رئیس جمهور کشورمان طی پیامی انتخاب نخست وزیر ...   
3   اساسنامه شرکت شهر فرودگاهی امام خمینی(ره) در ...   
4   نماینده مردم تهران گفت: در شرایطی که طرح شفاف...   

                                                body  \
0  به گزارش گروه پارلمانی  ، «مسعود میرکاظمی» رئی...   
1  احسان ارکانی نماینده مردم نیشابور در مجلس شورا...   
2  به گزارش حوزه دول

In [6]:
# Cell 5
# Remove Null data and concat title, abstract and body

tasnim = tasnim.dropna(subset=["title", "abstract", "body"], how="all")
def replace_nan(s):
    if pd.isnull(s):
        return ""
    return str(s)
tasnim["allbody"] = tasnim.apply(lambda row: replace_nan(row["title"]).strip() + ". "
                                 + replace_nan(row["abstract"]).strip() + ". "
                                 + replace_nan(row["body"]).strip(), axis=1)
tasnim["allbody_len"] = tasnim.apply(lambda row: len(hazm.word_tokenize(row["allbody"])), axis=1)
print(tasnim.shape)
print(tasnim.info())

(63493, 7)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 63493 entries, 0 to 63493
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     63493 non-null  object
 1   title        63493 non-null  object
 2   abstract     63493 non-null  object
 3   body         62558 non-null  object
 4   time         63493 non-null  object
 5   allbody      63493 non-null  object
 6   allbody_len  63493 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 3.9+ MB
None


In [7]:
# Cell 6
# Check that there aren't too many outliers in terms of text word count

def data_gl_than(data, less_than=100.0, greater_than=0.0, col='allbody_len'):
    data_length = data[col].values

    data_glt = sum([1 for length in data_length if greater_than < length <= less_than])

    data_glt_rate = (data_glt / len(data_length)) * 100

    print(f'Texts with word length of greater than {greater_than} and less than {less_than} includes {data_glt_rate:.2f}% of the whole!')

data_gl_than(tasnim, 2000, 100)

Texts with word length of greater than 100 and less than 2000 includes 88.24% of the whole!


In [8]:
# Cell 7

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


def cleaning(text):
    text = text.strip()

    # regular cleaning
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing wierd patterns
    wierd_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        # u"\u200c"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)

    text = wierd_pattern.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub("\s+", " ", text)

    return text

# tasnim['allbody'] = tasnim['allbody'].apply(cleaning)
print(tasnim.shape)
tasnim = tasnim[["allbody", "category"]]

(63493, 7)


In [9]:
# Cell 8
# check how balanced the data is

fig = go.Figure()

groupby_rate = tasnim.groupby('category')['category'].count()

fig.add_trace(go.Bar(
    x=list(sorted(groupby_rate.index)),
    y=groupby_rate.tolist(),
    text=groupby_rate.tolist(),
    textposition='auto'
))

fig.update_layout(
    title_text='Distribution of rate within comments',
    xaxis_title_text='Rate',
    yaxis_title_text='Frequency',
    bargap=0.2,
    bargroupgap=0.2)

fig.show()

labels = list(sorted(tasnim['category'].unique()))
print(tasnim.head())

                                             allbody category
0  میرکاظمی: زمان حذف ارز ۴۲۰۰ تومانی مشخص نیست. ...    سیاسی
1  طرح ۲ فوریتی شفافیت قوای سه‌گانه با ۲۰۰ امضا ت...    سیاسی
2  رئیسی انتخاب نخست وزیر جدید پاکستان را تبریک گ...    سیاسی
3  اصلاح اساسنامه شرکت شهر فرودگاهی امام خمینی (ر...    سیاسی
4  ارائه طرحی کلی و مبهم برای شفافیت فرار از مطال...    سیاسی


In [None]:
# Cell 8.5
# Base Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

tasnim['label_id'] = tasnim['category'].apply(lambda t: labels.index(t))
tasnim_small, dummy = train_test_split(tasnim, test_size=0.85, random_state=1, stratify=tasnim['category'])

corpus = list(tasnim_small['allbody'])
y = np.array(list(tasnim_small['label_id']))
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

from sklearn.metrics import classification_report

print(classification_report(y, y_pred, target_names=labels))

vectorizer.get_feature_names_out()
print(X.shape, y.shape)

In [None]:
# Cell 9
# convert str label to numeric and split train and text and specify x and y

tasnim['label_id'] = tasnim['category'].apply(lambda t: labels.index(t))

train, dummy = train_test_split(tasnim, test_size=0.85, random_state=1, stratify=tasnim['category'])
train, test = train_test_split(train, test_size=0.1, random_state=1, stratify=train['category'])
train, valid = train_test_split(train, test_size=0.1, random_state=1, stratify=train['category'])


train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

x_train, y_train = train['allbody'].values.tolist(), train['label_id'].values.tolist()
x_valid, y_valid = valid['allbody'].values.tolist(), valid['label_id'].values.tolist()
x_test, y_test = test['allbody'].values.tolist(), test['label_id'].values.tolist()

print(train.shape)
print(valid.shape)
print(test.shape)



(7713, 3)
(857, 3)
(953, 3)


In [None]:
# Cell 10

from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features

import tensorflow as tf

In [None]:
# Cell 11
# general config

MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 1
EEVERY_EPOCH = 800
LEARNING_RATE = 2e-5
CLIP = 0.0

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/bert-fa-base-uncased-sentiment-taaghceh/pytorch_model.bin'

os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
# Cell 12

label2id = {label: i for i, label in enumerate(labels)}
id2label = {v: k for k, v in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'اجتماعی': 0, 'استان\u200cها': 1, 'اقتصادی': 2, 'بین الملل': 3, 'رسانه ها': 4, 'سیاسی': 5, 'فرهنگی هنری': 6, 'ورزشی': 7}
id2label: {0: 'اجتماعی', 1: 'استان\u200cها', 2: 'اقتصادی', 3: 'بین الملل', 4: 'رسانه ها', 5: 'سیاسی', 6: 'فرهنگی هنری', 7: 'ورزشی'}


In [None]:
# Cell 13

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH, **{
        'label2id': label2id,
        'id2label': id2label,
    })

print(config.to_json_string())



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "\u0627\u062c\u062a\u0645\u0627\u0639\u06cc",
    "1": "\u0627\u0633\u062a\u0627\u0646\u200c\u0647\u0627",
    "2": "\u0627\u0642\u062a\u0635\u0627\u062f\u06cc",
    "3": "\u0628\u06cc\u0646 \u0627\u0644\u0645\u0644\u0644",
    "4": "\u0631\u0633\u0627\u0646\u0647 \u0647\u0627",
    "5": "\u0633\u06cc\u0627\u0633\u06cc",
    "6": "\u0641\u0631\u0647\u0646\u06af\u06cc \u0647\u0646\u0631\u06cc",
    "7": "\u0648\u0631\u0632\u0634\u06cc"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "\u0627\u062c\u062a\u0645\u0627\u0639\u06cc": 0,
    "\u0627\u0633\u062a\u0627\u0646\u200c\u0647\u0627": 1,
    "\u0627\u0642\u062a\u0635\u0627\u062f\u06cc": 2,
    "\u0628\u06cc\u0646 \u0627\u0644\u0645\u0644\u0644": 3,
    "\u0631\u0633\u0627\u0646\u0647 \

In [None]:
# Cell 14

class InputExample:
    """ A single example for simple sequence classification. """

    def __init__(self, guid, text_a, text_b=None, label=None):
        """ Constructs a InputExample. """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

import time
start_time = time.time()

def make_examples(tokenizer, x, y=None, maxlen=128, output_mode="classification", is_tf_dataset=True):
    examples = []
    y = y if isinstance(y, list) or isinstance(y, np.ndarray) else [None] * len(x)

    for i, (_x, _y) in tqdm(enumerate(zip(x, y)), position=0, total=len(x)):
        guid = "%s" % i
        label = int(_y)

        if isinstance(_x, str):
            text_a = _x
            text_b = None
        else:
            assert len(_x) == 2
            text_a = _x[0]
            text_b = _x[1]
        if i % 500 == 0:
            print("test a", i, "time", time.time() - start_time)

        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

    features = glue_convert_examples_to_features(
        examples,
        tokenizer,
        maxlen,
        output_mode=output_mode,
        label_list=list(np.unique(y)))

    all_input_ids = []
    all_attention_masks = []
    all_token_type_ids = []
    all_labels = []

    iteration_counter = 0
    for f in tqdm(features, position=0, total=len(examples)):
        if is_tf_dataset:
            all_input_ids.append(tf.constant(f.input_ids))
            all_attention_masks.append(tf.constant(f.attention_mask))
            all_token_type_ids.append(tf.constant(f.token_type_ids))
            all_labels.append(tf.constant(f.label))
        else:
            all_input_ids.append(f.input_ids)
            all_attention_masks.append(f.attention_mask)
            all_token_type_ids.append(f.token_type_ids)
            all_labels.append(f.label)
        iteration_counter += 1
        if iteration_counter % 500 == 0:
            print("test b", iteration_counter, "time", time.time() - start_time)

    if is_tf_dataset:
        dataset = tf.data.Dataset.from_tensor_slices(({
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'token_type_ids': all_token_type_ids
        }, all_labels))

        return dataset, features

    xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
    ydata = all_labels

    return [xdata, ydata], features

In [None]:
# Cell 15

train_dataset_base, train_examples = make_examples(tokenizer, x_train, y_train, maxlen=128)
valid_dataset_base, valid_examples = make_examples(tokenizer, x_valid, y_valid, maxlen=128)

test_dataset_base, test_examples = make_examples(tokenizer, x_test, y_test, maxlen=128)
[xtest, ytest], test_examples = make_examples(tokenizer, x_test, y_test, maxlen=128, is_tf_dataset=False)

  0%|          | 0/7713 [00:00<?, ?it/s]

test a 0 time 0.036449432373046875
test a 500 time 0.041101694107055664
test a 1000 time 0.04513740539550781
test a 1500 time 0.04897284507751465
test a 2000 time 0.0527644157409668
test a 2500 time 0.056528329849243164
test a 3000 time 0.068939208984375
test a 3500 time 0.5727591514587402
test a 4000 time 0.577150821685791
test a 4500 time 0.5832922458648682
test a 5000 time 0.5852439403533936
test a 5500 time 0.5864331722259521
test a 6000 time 0.5905027389526367
test a 6500 time 0.5931575298309326
test a 7000 time 0.5944170951843262
test a 7500 time 0.5992398262023926



This function will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py



  0%|          | 0/7713 [00:00<?, ?it/s]

test b 500 time 117.17132139205933
test b 1000 time 117.21084523200989
test b 1500 time 117.24635481834412
test b 2000 time 117.29564380645752
test b 2500 time 117.33595728874207
test b 3000 time 117.38957405090332
test b 3500 time 117.43122720718384
test b 4000 time 117.47709584236145
test b 4500 time 117.5168731212616
test b 5000 time 117.5592942237854
test b 5500 time 117.60407137870789
test b 6000 time 117.6423704624176
test b 6500 time 117.69180870056152
test b 7000 time 117.73196363449097
test b 7500 time 117.77709817886353


  0%|          | 0/857 [00:00<?, ?it/s]

test a 0 time 119.74153900146484
test a 500 time 119.74369144439697


  0%|          | 0/857 [00:00<?, ?it/s]

test b 500 time 133.1348352432251


  0%|          | 0/953 [00:00<?, ?it/s]

test a 0 time 133.35198855400085
test a 500 time 133.35402870178223


  0%|          | 0/953 [00:00<?, ?it/s]

test b 500 time 148.32686758041382


  0%|          | 0/953 [00:00<?, ?it/s]

test a 0 time 148.5662558078766
test a 500 time 148.5703809261322


  0%|          | 0/953 [00:00<?, ?it/s]

test b 500 time 162.9128348827362


In [None]:
# Cell 16

for value in train_dataset_base.take(1):
    print(f'     input_ids: {value[0]["input_ids"]}')
    print(f'attention_mask: {value[0]["attention_mask"]}')
    print(f'token_type_ids: {value[0]["token_type_ids"]}')
    print(f'        target: {value[1]}')

     input_ids: [    2 42127  1014  3140  9359  3337  2803  2786  3021  3082  2786  2958
  4314  1013 10005  4073  3663  5038  2803  7498  1012  3411  6784 27111
  3017  1014  3140  9359  3337  2803  2786  3021  3082  2786  2958  4314
  1379  6514  4329  1379  9541  2831  2964 12060  6192  3060  1012  1012
  8382 42127  2786  7391  2799  6534  5955  1348  2786  3519  8204  3439
  3333 27111  6925  3663  5038  1379  7870  2789  3014 38857  2791  3434
  3358  2789  4280  3082  5045 27111  5650  5611  3913  2996  1014  8027
  2786  4898  3358  3663  5038  3057  3805  3949  3096  4617  1379 22009
  1348  8363  3439  4420  7555  2803  3016  2897  4898  2789  6441  5373
  2830  1379  6441  2820  2786  4898  3358  2791  3607 11414 10974  1012
  2931  2786  3251  2786  3519  3434  2927     4]
attention_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

In [None]:
# Cell 17

def get_training_dataset(dataset, batch_size):
    dataset = dataset.repeat()
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(batch_size)

    return dataset

def get_validation_dataset(dataset, batch_size):
    dataset = dataset.batch(batch_size)

    return dataset

In [None]:
# Cell 18

train_dataset = get_training_dataset(train_dataset_base, TRAIN_BATCH_SIZE)
valid_dataset = get_training_dataset(valid_dataset_base, VALID_BATCH_SIZE)

train_steps = len(train_examples) // TRAIN_BATCH_SIZE
valid_steps = len(valid_examples) // VALID_BATCH_SIZE

train_steps, valid_steps

(482, 53)

In [None]:
# Cell 19

def build_model(model_name, config, learning_rate=3e-5):
    model = TFBertForSequenceClassification.from_pretrained(model_name, config=config)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    return model

In [None]:
# Cell 20

model = build_model(MODEL_NAME_OR_PATH, config, learning_rate=LEARNING_RATE)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Cell 21

print(valid_dataset)

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 128), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 128), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, 128), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>


In [22]:
%%time
# Cell 22
r = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    verbose=1)

final_accuracy = r.history['val_accuracy']
print('FINAL ACCURACY MEAN: ', np.mean(final_accuracy))



KeyboardInterrupt: 

In [23]:
# Cell 23

# save the model

model.save_pretrained(os.path.dirname("/content/drive/MyDrive/NLP4/pytorch_model.bin"))

KeyboardInterrupt: 

In [None]:
# Cell 24
ev = model.evaluate(test_dataset_base.batch(TEST_BATCH_SIZE))
print()
print(f'Evaluation: {ev}')
print()

predictions = model.predict(xtest)
ypred = predictions[0].argmax(axis=-1).tolist()

print()
print(pd.DataFrame(classification_report(ytest, ypred, output_dict=True, target_names=labels)))
print()

# print(f'F1: {f1_score(ytest, ypred, average="weighted")}')