# Install libraries

In [None]:
!pip install -q transformers
!pip install -q hazm
!pip install -q clean-text[gpl]
!pip install -q plotly
!pip install pyyaml==5.4.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Import libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

import hazm
from cleantext import clean

import plotly.express as px
import plotly.graph_objects as go

from tqdm.notebook import tqdm

import os
import re
import json
import copy
import collections

from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features

import tensorflow as tf

# Preprocessing function definition

In [None]:
# Characters deleted from the text after normalization. Compiled once at
# definition time instead of on every call.
# NOTE(review): the U+24C2..U+1F251 range is very wide (it also spans e.g. the
# CJK and Arabic presentation-form blocks); it is kept unchanged to preserve
# the existing behaviour.
_WEIRD_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'  # every code point outside the BMP
    u"\u200d"                 # zero-width joiner
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"                 # variation selector-16
    u"\u2069"                 # directional isolate controls (U+2066..U+2069)
    u"\u2066"
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)

# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
_WHITESPACE_PATTERN = re.compile(r"\s+")

# hazm normalizer, created on first use and then shared by all calls.
_NORMALIZER = None


def cleaning(text):
    """Clean and normalize one piece of text.

    Steps, in order:
      1. Strip leading/trailing whitespace.
      2. Run ``cleantext.clean``: fix unicode, lowercase, drop line breaks and
         remove URLs, e-mails, phone numbers and currency symbols (each is
         replaced by an empty string). Numbers, digits and punctuation are kept.
      3. Apply ``hazm.Normalizer().normalize``.
      4. Delete every character matched by ``_WEIRD_PATTERN``.
      5. Collapse each run of whitespace into a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. The result is not stripped again after step 5.
    """
    global _NORMALIZER

    text = text.strip()

    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # Building the normalizer is loop-invariant work, so do it only once.
    if _NORMALIZER is None:
        _NORMALIZER = hazm.Normalizer()
    text = _NORMALIZER.normalize(text)

    text = _WEIRD_PATTERN.sub(r'', text)
    text = _WHITESPACE_PATTERN.sub(" ", text)

    return text

# Load and split data

In [None]:
data = pd.read_excel('dataset/Tarabord-Questions.xlsx')

y = []
X = []

# Column 1 of every row is treated as the answer and columns 2+ as the
# questions that map to it; each question becomes one (question, answer) sample.
for _, row in data.iterrows():
    values = list(row.values)
    answer = values[1]
    # Empty spreadsheet cells are read as NaN; skip them so that rows with
    # fewer questions than the widest row do not break the text cleaning.
    questions = [q for q in values[2:] if pd.notna(q)]

    y += [answer] * len(questions)
    X += questions

X = [cleaning(question) for question in X]

# Sorted distinct answers; an answer's position in this array is its class id.
unique_answers = np.unique(y)

questions_len_by_word = [len(q.split()) for q in X]
answers_len_by_word = [len(a.split()) for a in unique_answers]

label_dict = {i: a for i, a in enumerate(unique_answers)}      # class id -> answer text
inv_label_dict = {a: i for i, a in enumerate(unique_answers)}  # answer text -> class id

X = np.array(X)
y = np.array([inv_label_dict[a] for a in y])

# 80/20 train/test split, then 80/20 train/validation split of the remainder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Plot data distribution by number of words

In [None]:
def _word_count_histogram(word_counts, title):
    """Return a plotly histogram figure of ``word_counts`` titled ``title``."""
    figure = go.Figure()
    figure.add_trace(go.Histogram(x=word_counts))
    figure.update_layout(
        title_text=title,
        xaxis_title_text='Word Count',
        yaxis_title_text='Frequency',
        bargap=0.2,
        bargroupgap=0.2)
    return figure


# Both charts share one layout, so they are built by the same helper.
fig = _word_count_histogram(
    questions_len_by_word, 'Distribution of word counts within questions')
fig.show()

fig = _word_count_histogram(
    answers_len_by_word, 'Distribution of word counts within answers')
fig.show()

# Initialization

In [None]:
# Sequence-length limit and per-split batch sizes.
# NOTE(review): MAX_LEN is presumably the maximum token count per sample -- confirm.
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

# Training schedule.
EPOCHS = 10
LEARNING_RATE = 2e-5

# Checkpoint identifier handed to the tokenizer and config loaders below.
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(MODEL_NAME_OR_PATH)
# One output class per entry of label_dict (i.e. per distinct answer).
config.num_labels = len(label_dict)

# Data loader functions

In [None]:
class InputExample:
    """Plain record for one sample: an id, one or two texts, and an optional label."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        # Every constructor argument is stored unchanged under the same name.
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label


def make_examples(tokenizer, x, y=None, maxlen=128, output_mode="classification",
                  is_tf_dataset=True, label_list=None):
    """Tokenize samples and package them as model inputs.

    Args:
        tokenizer: Tokenizer handed to ``glue_convert_examples_to_features``.
        x: Sequence of samples. Each sample is either a single string or a
            two-element sequence ``(text_a, text_b)``.
        y: Optional labels aligned with ``x``; each label is cast with ``int``.
            ``None`` means the data is unlabeled.
        maxlen: Maximum length passed to the feature converter.
        output_mode: Forwarded unchanged to the feature converter.
        is_tf_dataset: If true, return a ``tf.data.Dataset``; otherwise return
            numpy arrays.
        label_list: Complete, ordered list of labels used to build the
            label -> index mapping. Pass the full label set when encoding
            several splits so that a split which happens to miss a class does
            not get shifted class indices. Defaults to the sorted distinct
            labels found in ``y`` (the previous behaviour).

    Returns:
        ``(dataset, features)`` when ``is_tf_dataset`` is true. The dataset
        yields ``(inputs, label)`` pairs, or just ``inputs`` when ``y`` is
        ``None``; ``inputs`` is a dict with the keys ``input_ids``,
        ``attention_mask`` and ``token_type_ids``.
        Otherwise ``([xdata, ydata], features)`` where ``xdata`` is a list of
        three arrays (input ids, attention masks, token type ids) and ``ydata``
        is the list of encoded labels (all ``None`` when unlabeled).

    Raises:
        ValueError: If ``y`` is given but its length differs from ``x``.
    """
    has_labels = y is not None
    if has_labels:
        # int(None) would raise, so individual missing labels are passed through.
        labels = [None if label is None else int(label) for label in y]
        if len(labels) != len(x):
            raise ValueError(
                "x and y must have the same length, got %d and %d" % (len(x), len(labels)))
    else:
        labels = [None] * len(x)

    examples = []
    for i, (sample, label) in tqdm(enumerate(zip(x, labels)), position=0, total=len(x)):
        if isinstance(sample, str):
            text_a = sample
            text_b = None
        else:
            assert len(sample) == 2
            text_a = sample[0]
            text_b = sample[1]

        examples.append(InputExample(guid="%s" % i, text_a=text_a, text_b=text_b, label=label))

    if label_list is None:
        label_list = sorted({label for label in labels if label is not None})

    features = glue_convert_examples_to_features(
        examples,
        tokenizer,
        maxlen,
        output_mode=output_mode,
        label_list=list(label_list))

    all_input_ids = []
    all_attention_masks = []
    all_token_type_ids = []
    all_labels = []

    for f in tqdm(features, position=0, total=len(examples)):
        all_input_ids.append(f.input_ids)
        all_attention_masks.append(f.attention_mask)
        all_token_type_ids.append(f.token_type_ids)
        all_labels.append(f.label)

    if is_tf_dataset:
        inputs = {
            'input_ids': tf.constant(all_input_ids),
            'attention_mask': tf.constant(all_attention_masks),
            'token_type_ids': tf.constant(all_token_type_ids)
        }
        # Unlabeled data has no label tensor to slice, so only inputs are emitted.
        tensors = (inputs, tf.constant(all_labels)) if has_labels else inputs

        return tf.data.Dataset.from_tensor_slices(tensors), features

    xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
    ydata = all_labels

    return [xdata, ydata], features


# Encode every split against the full set of class ids so the encoded labels
# always agree with label_dict, even if a split does not contain every class.
ALL_LABELS = sorted(label_dict)

train_dataset_base, train_examples = make_examples(
    tokenizer, X_train, y_train, maxlen=MAX_LEN, label_list=ALL_LABELS)
valid_dataset_base, valid_examples = make_examples(
    tokenizer, X_valid, y_valid, maxlen=MAX_LEN, label_list=ALL_LABELS)
test_dataset_base, test_examples = make_examples(
    tokenizer, X_test, y_test, maxlen=MAX_LEN, label_list=ALL_LABELS)
# Same test split again, as plain arrays for model.predict / sklearn metrics.
[xtest, ytest], test_examples = make_examples(
    tokenizer, X_test, y_test, maxlen=MAX_LEN, is_tf_dataset=False, label_list=ALL_LABELS)

  0%|          | 0/208 [00:00<?, ?it/s]


This function will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py



  0%|          | 0/208 [00:00<?, ?it/s]

# Print an example of training data

In [None]:
# Peek at the first encoded training sample: its three input tensors and its label.
for sample_inputs, sample_target in train_dataset_base.take(1):
    print(f'     input_ids: {sample_inputs["input_ids"]}')
    print(f'attention_mask: {sample_inputs["attention_mask"]}')
    print(f'token_type_ids: {sample_inputs["token_type_ids"]}')
    print(f'        target: {sample_target}')

     input_ids: [    2 22038  1379  6395 19733  3040  7208  2800  2789  3287  3067 22360
  1379  7660  2871  1348  2965  2791  3724  4510  2789  6115  2831 22360
  3322  1348 37200  2886  1350     4     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
attention_mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [None]:
def get_training_dataset(dataset, batch_size, shuffle_buffer=2048):
    """Turn ``dataset`` into an endlessly repeating, shuffled, batched stream.

    The data is shuffled *before* it is repeated. Repeating first would let the
    shuffle buffer mix elements of consecutive passes, so one epoch's worth of
    steps could contain duplicates while missing other samples.

    Args:
        dataset: Object exposing ``shuffle``, ``repeat`` and ``batch``
            (e.g. a ``tf.data.Dataset``).
        batch_size: Number of elements per batch.
        shuffle_buffer: Size of the shuffle buffer.

    Returns:
        The transformed dataset.
    """
    dataset = dataset.shuffle(shuffle_buffer)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size)

    return dataset

def get_validation_dataset(dataset, batch_size):
    """Return ``dataset`` grouped into batches of ``batch_size``; nothing else is applied."""
    return dataset.batch(batch_size)

train_dataset = get_training_dataset(train_dataset_base, TRAIN_BATCH_SIZE)
# Validation uses the plain batching pipeline: no repeat and no shuffle, so
# every validation sample is scored exactly once per validation run.
valid_dataset = get_validation_dataset(valid_dataset_base, VALID_BATCH_SIZE)

train_steps = len(train_examples) // TRAIN_BATCH_SIZE
# Ceiling division, so the final partial validation batch is evaluated too.
valid_steps = -(-len(valid_examples) // VALID_BATCH_SIZE)

# Model definition

In [None]:
def build_model(model_name, config, learning_rate=3e-5):
    """Load a BERT sequence classifier and compile it for training.

    Args:
        model_name: Checkpoint name or path given to ``from_pretrained``.
        config: Model configuration given to ``from_pretrained``.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The compiled ``TFBertForSequenceClassification`` model. It is compiled
        with sparse categorical cross-entropy on logits and reports a sparse
        categorical accuracy metric named ``'accuracy'``.
    """
    classifier = TFBertForSequenceClassification.from_pretrained(model_name, config=config)
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )
    return classifier

# Model fitting

In [None]:
# Load the pretrained checkpoint as a classifier and compile it.
model = build_model(MODEL_NAME_OR_PATH, config, learning_rate=LEARNING_RATE)

# Fine-tune for EPOCHS epochs. The step counts are passed explicitly, so each
# epoch runs exactly `train_steps` training batches and `valid_steps`
# validation batches. `r` keeps whatever fit() returns.
r = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    verbose=1)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Model evaluation

In [None]:
# Metrics reported by Keras on the batched test split.
ev = model.evaluate(test_dataset_base.batch(TEST_BATCH_SIZE), verbose = 0)
print(f'\nEvaluation: {ev}\n')

# Per-sample predicted class id: arg-max over the first element of the
# prediction output.
predictions = model.predict(xtest)
ypred = np.argmax(predictions[0], axis=-1).tolist()

print(f'\n{classification_report(ytest, ypred)}\n')

weighted_f1 = f1_score(ytest, ypred, average="weighted")
print(f'F1: {weighted_f1}')


Evaluation: [0.7103796601295471, 0.8942307829856873]


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         2
           2       0.83      1.00      0.91         5
           3       1.00      0.50      0.67         6
           4       1.00      0.67      0.80         3
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         4
           7       0.80      1.00      0.89         4
           8       1.00      1.00      1.00         3
           9       1.00      0.75      0.86         4
          10       1.00      1.00      1.00         8
          11       0.80      1.00      0.89         4
          12       1.00      1.00      1.00         6
          13       1.00      0.75      0.86         4
          14       1.00      1.00      1.00         8
          15       0.75      0.60      0.67         5
          16       0.67  

# Chat

In [None]:
# Show every test question next to the answer text of its predicted class.
separator = "="*100
for question, predicted_id in zip(X_test, ypred):
    # One print call per sample; sep='\n' reproduces the line layout of
    # printing each piece separately.
    print('question:\n', question, '\n', 'answer:\n',
          label_dict[predicted_id], separator, sep='\n')

question:

برای ترابرد سیم کارتم به همراه اول چجوری اقدام کنم؟


answer:

برای ثبت ترابرد میتوانید از طریق تماس با سامانه پشتیبانی 9990 ( با خطوط همراه اول ) یا  09129990  ( با خطوط غیر همراه اول ) قسمت ارتباط با کارشناسان اقدام نمایید.
question:

نیازمند بررسی مغایرت در اطلاعات اعلام شده در هنگام ترابرد هستم، از چه طریقی باید اقدام کنم؟


answer:

در خصوص مشکل ثبت ترابرد از واحد ترابرد همراه اول برای بررسی مغایرت در اطلاعات اعلام شده با شما تماس گرفته می شود.
question:

آیا کد اگه کد اقتصادی نداشته باشیم میشه ترابرد کرد؟


answer:

به استحضار می رساند گزینه Delivery ترابرد از درگاه join.mci.ir  از ساعت 24:00 روز سه شنبه مورخ 990431 تا اطلاع بعدی حذف شده و متقاضیان درخواست ترابرد به همراه اول، فقط امکان ثبت درخواست و مراجعه به نقاط فروش و خدمات حضوری جهت تکمیل درخواست را خواهند داشت.
question:

از کجا میتوانم به اپراتور دیگری ترابرد کنم؟


answer:

مشترک گرامی برای ترابرد به اپراتور دیگر با پشتیبانی اپراتور مدنظرتان تماس بگیرید.
question:

نام و نام خانوادگی ام رو با تازگی تغییر دادم، 