In [345]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Preliminaries

In [18]:
!pip install parsivar

In [35]:
!wget https://raw.githubusercontent.com/ziaa/Persian-stopwords-collection/master/Stopwords/Savoy/persianST.txt

In [304]:
import tensorflow as tf
import json
import random
from collections import Counter, defaultdict
from parsivar import Normalizer, Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [57]:
# Hyper-parameters and data-split settings shared by the cells below.
config = dict(
    max_features=1000,     # vocabulary size handed to TfidfVectorizer
    num_epochs=200,        # upper bound on the number of epochs handed to model.fit
    learning_rate=1e-1,    # step size handed to the SGD optimizer
    batch_size=32,         # batch-size setting (the tf.data datasets are not batched with it)
    train_percentage=90,   # percent of samples kept for training; the rest is halved into dev/test
)

In [3]:
# Parse the dataset file (UTF-8 encoded JSON) into Python objects.
with open('../input/farsnews1399/fars-news-1399.json', encoding='utf-8') as news_file:
    farsnews = json.loads(news_file.read())

# Preprocessing

## Data Exploration

In [4]:
# Inspect the top-level container type of the parsed JSON.
type(farsnews)

In [5]:
# Report the dataset size and show the first record as an example.
print(f'The dataset has {len(farsnews)} items\nFor instance, the first object is {farsnews[0]}')

In [6]:
# Keep only the two fields used downstream: [abstract, category].
# NOTE: this rebinds `farsnews`, so the cell cannot be re-run without reloading the JSON.
farsnews = [[record['abstract'], record['cat']] for record in farsnews]

Getting some random samples from the dataset

In [7]:
# Draw five random [abstract, category] pairs and print them.
chosen_items = random.sample(farsnews, 5)
for abstract, category in chosen_items:
    print(f'Abstract of the news:\n{abstract}\nCategory:{category}\n')

In [8]:
# Collect the distinct category names (second field of every entry) and print them.
print(f'All Categories: { {entry[1] for entry in farsnews} }')

## Pruning the categories

In [9]:
# Restrict the dataset to five categories; entries of any other category are dropped.
chosen_categories = {'education', 'sports', 'politics', 'scientific-academic', 'economy'}
farsnews = list(filter(lambda entry: entry[1] in chosen_categories, farsnews))

In [10]:
# Number of articles remaining after the category filter.
print(f'The dataset contains {len(farsnews)} articles now')

In [11]:
# Count how many articles fall into each category.
news_counter = Counter(category for _, category in farsnews)

In [12]:
# Print the per-category article counts.
for category, count in news_counter.items():
    print(f'Category "{category}": {count} items\n')

## Balancing the categories in terms of the number of items belonging to each of them

The "education" category contains the fewest articles, so we randomly downsample every other category to the same number of articles that "education" has.

In [13]:
# Group the abstracts by category: category name -> list of abstracts.
news_dict = defaultdict(list)
for entry in farsnews:
    text, label = entry
    news_dict[label].append(text)

In [14]:
# Size of the smallest category; every category is randomly downsampled to this size.
min_category_count = min(news_counter.values())
for category in chosen_categories:
    news_dict[category] = random.sample(news_dict[category], min_category_count)

# Recount after downsampling and print the now-equal category sizes.
news_counter = Counter({category: len(news_dict[category]) for category in chosen_categories})
for category, count in news_counter.items():
    print(f'Category "{category}": {count} items\n')

In [15]:
# Flatten the per-category lists into two parallel lists: abstracts and their category names.
balanced_farsnews = []
balanced_farsnews_labels = []
for category in chosen_categories:
    articles = news_dict[category]
    balanced_farsnews += articles
    balanced_farsnews_labels += [category] * len(articles)

In [16]:
# Sanity check: the abstracts and labels lists should have the same length.
print(f'We have {len(balanced_farsnews)} articles and {len(balanced_farsnews_labels)} labels now\nNote that only {len(set(balanced_farsnews_labels))} labels are unique')

In [17]:
# Pick five random positions; the same positions are printed again after preprocessing.
chosen_items_idx = random.sample(range(len(balanced_farsnews)), 5)
for idx in chosen_items_idx:
    text, label = balanced_farsnews[idx], balanced_farsnews_labels[idx]
    print(f'Abstract of the news:\n{text}\nCategory:{label}\n')

## Normalization

In [20]:
# Parsivar normalizer, constructed with statistical space correction enabled.
normalizer = Normalizer(statistical_space_correction=True)

In [36]:
# Load the stop-word list downloaded by wget above (one word per line).
# The file is opened under the name wget writes on a fresh run ("persianST.txt");
# a ".1" suffix only exists when the download cell has been executed more than once.
with open('./persianST.txt', 'r', encoding='utf-8') as file:
    # Strip the trailing newline from each line and drop blank lines.
    stopwords = {line.strip() for line in file if line.strip()}
print(list(stopwords)[:10])

In [21]:
# Run every abstract through the Parsivar normalizer.
balanced_farsnews = list(map(normalizer.normalize, balanced_farsnews))

In [29]:
# Delete every run of ASCII digits from the abstracts.
# NOTE(review): [0-9] matches ASCII digits only; Persian digits, if any remain after
# normalization, are left in place — confirm this is intended.
digit_run = re.compile(r'[0-9]+')
balanced_farsnews = [digit_run.sub('', article) for article in balanced_farsnews]

In [37]:
tokenizer = Tokenizer()


def remove_stopwords(txt):
    """Tokenize `txt` into words, drop tokens found in `stopwords`, and re-join with spaces."""
    kept = (token for token in tokenizer.tokenize_words(txt) if token not in stopwords)
    return ' '.join(kept)


# Apply stop-word removal to every abstract.
balanced_farsnews = list(map(remove_stopwords, balanced_farsnews))

In [38]:
# Print the previously sampled positions again to see the effect of preprocessing.
for idx in chosen_items_idx:
    text, label = balanced_farsnews[idx], balanced_farsnews_labels[idx]
    print(f'Abstract of the news:\n{text}\nCategory:{label}\n')

## Feature Extraction

# TF-IDF

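Term Frequency: $\mathrm{tf}(t,d) = \dfrac{\text{number of times token } t \text{ appears in document } d}{\text{total number of tokens in document } d}$<br><br>
Document Frequency: $\mathrm{df}(t) = \text{number of documents in which token } t \text{ appears}$<br><br>
$w(t,d) = \mathrm{tf}(t,d)\times \log\left(\dfrac{\text{total number of documents}}{\mathrm{df}(t)}\right)$<br><br>
Note: In practice the log term is usually smoothed by adding one inside and/or outside the logarithm, so that the weight never becomes zero and a division by zero cannot occur. For example, scikit-learn's default is $\log\dfrac{1+N}{1+\mathrm{df}(t)}+1$, where $N$ is the total number of documents.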

In [39]:
# Build TF-IDF features, keeping at most config['max_features'] terms.
# NOTE(review): the vectorizer is fitted on the whole corpus before the train/dev/test
# split further below, so dev and test documents influence the vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_features=config['max_features'])
features = vectorizer.fit_transform(balanced_farsnews)

In [40]:
# Show the terms that were selected as features.
vectorizer.get_feature_names_out()

In [41]:
# Convert the vectorizer output into a dense NumPy array.
np_features = features.toarray()

In [42]:
# Shape of the dense feature matrix.
np_features.shape

In [43]:
# Map every category name to an integer id and back.
# The categories are sorted first: `chosen_categories` is a set of strings, whose iteration
# order can change between interpreter sessions, which would silently change the meaning
# of every id (and of a previously saved model's outputs).
cat_to_id = {cat: idx for idx, cat in enumerate(sorted(chosen_categories))}
id_to_cat = {idx: cat for cat, idx in cat_to_id.items()}
# Replace the category names in the labels list with their integer ids.
balanced_farsnews_labels = [cat_to_id[cat] for cat in balanced_farsnews_labels]

In [45]:
# Integer class ids as a NumPy array.
labels = np.array(balanced_farsnews_labels)

In [46]:
# Shape of the integer label array.
labels.shape

In [51]:
# First ten integer labels.
labels[:10]

## One-Hot Encoding

In [52]:
# One-hot encode the integer labels: one row per label, with a 1 at the label's index.
# The row width is taken from the category mapping instead of being hard-coded.
num_classes = len(cat_to_id)
tmp = [[1 if class_id == label else 0 for class_id in range(num_classes)] for label in labels]

In [53]:
# First ten one-hot rows.
tmp[:10]

In [54]:
# Replace the integer labels with their one-hot representation.
labels = np.array(tmp)

In [55]:
# First ten one-hot encoded labels.
labels[:10]

In [56]:
# Shape of the one-hot label array.
labels.shape

In [265]:
# Insert a singleton middle axis so every label has shape (1, num_classes), matching the
# (1, num_classes) output the model produces for a single sample. The class count is read
# from the array itself rather than hard-coded.
labels = labels.reshape(-1, 1, labels.shape[-1])

# Train/Dev/Test Split

In [266]:
# Hold out (100 - train_percentage)% of the samples, then halve the held-out part into
# test and dev sets. A fixed random_state makes both splits repeatable.
holdout_fraction = 1-config['train_percentage']/100
f_train, f_rem, l_train, l_rem = train_test_split(
    np_features, labels, test_size=holdout_fraction, random_state=50
)
f_test, f_dev, l_test, l_dev = train_test_split(
    f_rem, l_rem, test_size=0.5, random_state=50
)

In [267]:
# Feature-matrix shapes of the three splits.
print(f'train features: {f_train.shape}, dev features: {f_dev.shape}, test features: {f_test.shape}')

In [268]:
# Number of labels in each split (should match the feature row counts above).
print(f'train labels: {len(l_train)}, dev labels: {len(l_dev)}, test labels: {len(l_test)}')

# Making the Dataset(s)

In [269]:
# Wrap each split in a tf.data.Dataset of (features, label) pairs.
# NOTE: the datasets are not batched, so each element is one sample.
train_dataset, dev_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices(pair)
    for pair in ((f_train, l_train), (f_dev, l_dev), (f_test, l_test))
)

# Neural Network Definition

In [358]:
class my_neural_net(tf.keras.Model):
    """Two-layer feed-forward classifier: Dense(relu) followed by Dense(softmax).

    Args:
        num_features: length of each input feature vector (default 1000).
        hidden_units: width of the hidden layer (default 20).
        num_classes: number of output classes (default 5).
    """

    def __init__(self, num_features=1000, hidden_units=20, num_classes=5):
        super().__init__()  # initialise the Keras base class before adding layers
        self.num_features = num_features
        self.all_in_one = tf.keras.Sequential([
            tf.keras.layers.Dense(hidden_units, activation='relu', input_shape=(num_features,)),
            tf.keras.layers.Dense(num_classes, activation='softmax'),
        ])

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Return class probabilities of shape (batch, num_classes).

        `x` may be a single feature vector of shape (num_features,) or a batch of shape
        (batch, num_features); a single vector yields an output of shape (1, num_classes).
        """
        # Flatten to (batch, num_features) and keep every row of the result. Reshaping to
        # (-1, 1, num_features) and returning output[0] would silently drop all samples
        # but the first whenever a real batch is passed in.
        x = tf.reshape(x, (-1, self.num_features))
        return self.all_in_one(x)

In [359]:
# Instantiate the classifier.
model = my_neural_net()

In [360]:
# Build the model for an input of shape (1, 1000) so that summary() below can report it.
model.build((1,1000))

In [361]:
# Print the layer / parameter overview.
model.summary()

In [362]:
# Smoke-test the forward pass on a single training sample.
model(f_train[0])

# Loss and Optimizer

<div style="font-size:15pt">
$CE = -\sum_{i=1}^{C} y_i \log(\mathrm{pred}_i)$<br><br>
Note that $y$ is a one-hot vector (the true output) and $\mathrm{pred}$ is a vector of scores, each giving the predicted probability of belonging to one class; $y_i$ and $\mathrm{pred}_i$ are their $i$-th components, and $C$ is the number of classes.
</div>

In [363]:
# Categorical cross-entropy: used here with one-hot labels and softmax outputs.
loss_fn = tf.keras.losses.CategoricalCrossentropy()

In [364]:
# Plain SGD with the learning rate taken from the config.
optimizer = tf.keras.optimizers.SGD(config['learning_rate'])

In [365]:
# Metrics reported during training and evaluation.
metrics = ['accuracy']

In [366]:
# Attach optimizer, loss and metrics to the model (passed by keyword for clarity).
model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)

In [367]:
# Early stopping on the validation loss with a patience of 3 epochs; the best weights
# seen during training are restored when it triggers.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [368]:
# Train with the dev set as validation data and the early-stopping callback.
# NOTE: the datasets come from from_tensor_slices and are never batched, so each
# training step consumes a single sample; config['batch_size'] is not used here.
history = model.fit(train_dataset, epochs=config['num_epochs'], validation_data=dev_dataset, callbacks=[callback])

In [369]:
# Plot the training and validation curves, one figure per tracked quantity.
for quantity in ('accuracy', 'loss'):
    plt.plot(history.history[quantity])
    plt.plot(history.history[f'val_{quantity}'])
    plt.title(f'model {quantity}')
    plt.ylabel(quantity)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [370]:
# Evaluate on the held-out test set.
# `batch_size` is deliberately not passed: Keras documents that it must not be specified
# when the input is a tf.data.Dataset, because the dataset itself defines the batches.
test_loss, test_acc = model.evaluate(test_dataset)
print(f'test loss: {test_loss}, test accuracy: {test_acc}')

# Inference

In [371]:
# Predicted category of test sample 123: index of the highest score, mapped back to its name.
id_to_cat[np.argmax(model(f_test[123]).numpy()[0])]

In [372]:
# True category of the same test sample, decoded from its one-hot label.
id_to_cat[np.argmax(l_test[123][0])]

# Save/Load the Model

In [373]:
# Save the whole model under ./classifier.
model.save('./classifier')

In [377]:
# Save only the weights under ./classifier-weights.
model.save_weights('./classifier-weights')

## Load the weights

In [378]:
# Fresh, untrained instance of the same architecture to receive the saved weights.
new_model = my_neural_net()

In [379]:
# Restore the weights saved under ./classifier-weights into the fresh instance.
new_model.load_weights('./classifier-weights')

In [380]:
# Prediction of the weight-restored model for test sample 123.
id_to_cat[np.argmax(new_model(f_test[123]).numpy()[0])]

## Load the whole model

In [381]:
# Reload the complete model that was saved under ./classifier.
another_model = tf.keras.models.load_model('./classifier')

In [382]:
# Prediction of the fully reloaded model for test sample 123. This must query
# `another_model` (loaded with load_model), not `new_model` from the previous section.
id_to_cat[np.argmax(another_model(f_test[123]).numpy()[0])]