<a href="https://colab.research.google.com/github/BrouthenKamel/JunctionX_topic_classification_model/blob/main/JunctionX_topic_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing modules

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras as keras
import sklearn

### Building the dataset

In [11]:
# Locations of the raw corpus files on the Colab filesystem, one per topic.
education_file, entertainment_file, technology_file = (
    "/content/education.txt",
    "/content/entertainment.txt",
    "/content/technology.txt",
)

In [12]:
# Topic labels and their source files. The two lists are index-aligned:
# file_paths[i] holds the texts for names[i].
file_paths = [
    education_file,
    entertainment_file,
    technology_file,
]
names = [
    "education",
    "entertainment",
    "technology",
]

In [13]:
# Load every topic file into memory: texts[topic] -> list of raw lines.
# Lines still carry their trailing newline at this point.
texts = {}

for name, file_path in zip(names, file_paths):
    # Explicit encoding so the corpus decodes the same way on every machine
    # instead of depending on the platform's locale default.
    with open(file_path, "r", encoding="utf-8") as f:
        texts[name] = f.readlines()

In [14]:
# Remove the line terminator (and any trailing whitespace) left by readlines().
# A fixed-width slice is not safe here: text-mode reads leave a single "\n" per
# line and none on an unterminated last line, so slicing would cut real text.
# rstrip() is also idempotent, so re-running this cell does not eat more text.
for name in names:
    texts[name] = [text.rstrip() for text in texts[name]]

In [15]:
# Flatten all topics into a single "text" column, topic by topic in the
# order given by `names`.
columns = {
    "text": [text for name in names for text in texts[name]],
}

In [16]:
# One 0/1 indicator column per topic. Rows are generated in the same
# topic-by-topic order as the "text" column, so row i of every indicator
# column describes row i of the text. Deriving the columns from `names`
# keeps them in sync with the topic list instead of repeating it by hand.
for name in names:
    columns[name] = [
        int(name == source) for source in names for _ in texts[source]
    ]

In [17]:
# Sanity check: these keys are used as the DataFrame column list in the next step.
columns.keys()

dict_keys(['text', 'education', 'entertainment', 'technology'])

In [18]:
# Assemble the frame; the column order is the insertion order of the dict keys.
dataset = pd.DataFrame(data=columns, columns=list(columns))

In [19]:
# Snapshot the dataset to CSV before any cleaning is applied.
dataset.to_csv("/content/dataset.csv")

### Data cleaning

In [20]:
import string
# Display the characters that remove_punct (defined below) deletes.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
def remove_punct(text):
    """Return `text` with every character from ``string.punctuation`` deleted.

    All other characters (letters, digits, whitespace, non-ASCII symbols)
    are returned unchanged and in their original order.
    """
    # Mapping a character to None in a translation table deletes it.
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(punctuation_table)

In [22]:
# Overwrites the "text" column with its punctuation-free version; the original
# strings are no longer available in `dataset` after this cell.
dataset['text'] = dataset["text"].apply(remove_punct)

In [23]:
import nltk
# Fetch the NLTK stopword corpus that stopwords.words() reads from.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Stored as a set for the `not in stop` membership test in remove_stopwords.
stop = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [24]:
def remove_stopwords(text, stop_words=None):
    """Lowercase `text` and drop its stopwords.

    Args:
        text: Input string; it is split on whitespace.
        stop_words: Optional collection of lowercase words to discard.
            When omitted, the notebook-level `stop` set is used.

    Returns:
        The surviving words, lowercased and joined by single spaces.
    """
    if stop_words is None:
        stop_words = stop
    # Lowercase each token once and reuse it for both the filter and the output.
    lowered = (word.lower() for word in text.split())
    return " ".join(word for word in lowered if word not in stop_words)

In [25]:
# "text" keeps the punctuation-free sentences; "reduced_text" gets the
# lowercased, stopword-free version.
dataset['reduced_text'] = dataset["text"].apply(remove_stopwords)

In [26]:
# Keep only ASCII letters and spaces. `regex=True` must be explicit: without
# it, pandas >= 2.0 treats the pattern as a literal string and removes nothing
# (older versions emit a FutureWarning about the changing default).
dataset['reduced_text'] = dataset['reduced_text'].str.replace(r'[^a-zA-Z ]', '', regex=True)

  dataset['reduced_text'] = dataset['reduced_text'].str.replace(r'[^a-zA-Z ]', '')


### Vocabulary

In [27]:
from collections import Counter

def counter_word(text_col):
    """Count how often each whitespace-separated word occurs in a text column.

    Args:
        text_col: Object whose `.values` attribute is an iterable of strings.

    Returns:
        A `collections.Counter` mapping each word to its number of occurrences.
    """
    return Counter(
        word
        for text in text_col.values
        for word in text.split()
    )

In [28]:
counter = counter_word(dataset["reduced_text"])
# Vocabulary size: number of distinct words left after cleaning.
print(len(counter))
# The five most frequent words with their counts.
counter.most_common(5)

1474


[('education', 88),
 ('favorite', 71),
 ('technology', 64),
 ('learning', 57),
 ('used', 52)]

### Splitting the dataset

In [29]:
# Features: the cleaned text.
x = dataset["reduced_text"]
# Targets: every column except the two text columns, i.e. the per-topic 0/1 indicators.
y = dataset.drop(columns = ["text", "reduced_text"])

In [30]:
pip install scikit-multilearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 KB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [31]:
from skmultilearn.model_selection import iterative_train_test_split

In [32]:
# Split 80/20 with skmultilearn's iterative splitter. The text Series is
# reshaped into a single-column 2-D array for the call, so x_train / x_test
# come back 2-D as well. Results are unpacked in the order
# (train features, train labels, test features, test labels).
x_train, y_train, x_test, y_test = iterative_train_test_split(x.values.reshape(-1,1), y.values, test_size = 0.2)

In [33]:
# Share of each topic's positive examples that ended up in the test split;
# values close to test_size (0.2) mean the split kept the topics balanced.
y_fractions = y_test.sum(axis=0) / y.sum(axis=0)
print(y_fractions)

education        0.200772
entertainment    0.200000
technology       0.198697
dtype: float64


### Predictions & Evaluation metric

In [34]:
def get_predictions(prediction_probas, threshold):
  """Binarize per-label probabilities.

  Args:
    prediction_probas: Iterable of rows, each an iterable of probabilities.
    threshold: A probability strictly greater than this becomes 1, anything
      else becomes 0.

  Returns:
    A list of lists of 0/1 ints laid out like the input rows.
  """
  return [
    [1 if proba > threshold else 0 for proba in probas]
    for probas in prediction_probas
  ]

In [59]:
def metrics(y_test, predictions, labels=None):
  """Compute per-label accuracy, recall, precision and F1 for 0/1 predictions.

  Args:
    y_test: 2-D array of true 0/1 labels, one row per example and one column
      per label (must expose `.shape` and `[row][col]` indexing).
    predictions: Row-indexable collection of predicted 0/1 values with the
      same layout as `y_test`.
    labels: Optional sequence of label names, one per column, used as the
      dictionary keys. When omitted, the notebook-level `y.columns` is used.

  Returns:
    A tuple `(accuracies, recalls, precisions, f1_scores)` of dicts keyed by
    label name. A score whose denominator is zero (no rows, no positives, no
    predicted positives) is reported as 0 instead of raising.
  """
  if labels is None:
    labels = y.columns
  n_rows, n_cols = y_test.shape

  accuracies = {}
  recalls = {}
  precisions = {}
  f1_scores = {}
  for col in range(n_cols):
    label = labels[col]
    true_positives = false_positives = true_negatives = false_negatives = 0
    for row in range(n_rows):
      truth = y_test[row][col]
      guess = predictions[row][col]
      # The four outcomes are mutually exclusive; values other than 0/1 are
      # not counted in any bucket.
      if truth == 1 and guess == 1:
        true_positives += 1
      elif truth == 0 and guess == 0:
        true_negatives += 1
      elif truth == 1 and guess == 0:
        false_negatives += 1
      elif truth == 0 and guess == 1:
        false_positives += 1

    accuracies[label] = (true_positives + true_negatives) / n_rows if n_rows else 0

    actual_positives = true_positives + false_negatives
    recalls[label] = true_positives / actual_positives if actual_positives else 0

    predicted_positives = true_positives + false_positives
    precisions[label] = true_positives / predicted_positives if predicted_positives else 0

    recall_plus_precision = recalls[label] + precisions[label]
    if recall_plus_precision != 0:
      f1_scores[label] = 2 * (recalls[label] * precisions[label]) / recall_plus_precision
    else:
      f1_scores[label] = 0
  return accuracies, recalls, precisions, f1_scores

### Model

In [None]:
!pip install transformers

In [None]:
import tensorflow as tf
import transformers

# Tokenizer and model are both loaded from the 'bert-base-uncased' checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')

# Sequence-classification model with one output per label column of `y`.
model = transformers.TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=y.shape[1])

In [63]:
# Training hyperparameters.
batch_size = 64
learning_rate = 1e-4
num_epochs = 20
# Disabled experiment: callback_threshold = 0.1 with callbacks=[LossCallback(callback_threshold)]
# (LossCallback is not defined in the visible part of this notebook).

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# from_logits=True: the loss is given raw model outputs, not probabilities.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# NOTE(review): with a logits-based binary loss, the 'accuracy' shortcut may
# threshold raw logits rather than probabilities; confirm that the accuracy
# reported during training is meaningful before relying on it.
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [64]:
# x_train is 2-D (one text per row); flatten it back into a plain list of strings.
train_texts = x_train.flatten().tolist()
train_labels = y_train
# Tokenize with padding and truncation enabled so the encodings can be
# stacked into tensors.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

# Shuffle with a buffer as large as the training set, then batch.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels)).shuffle(len(train_labels)).batch(batch_size)

In [65]:
# Fine-tune on the training batches for num_epochs epochs (no validation data is passed).
model.fit(train_dataset, epochs=num_epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fd73c361ee0>

In [66]:
# Same preparation as for the training data, but without shuffling so the
# predictions stay aligned with the rows of y_test.
test_texts = x_test.flatten().tolist()
test_labels = y_test
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels)).batch(batch_size)
# The result exposes the raw scores as `.logits` (read in the next cell).
test_outputs = model.predict(test_dataset)



In [67]:
# Logits -> probabilities via sigmoid, then a fixed 0.5 cut-off per label;
# the result is a boolean array with one row per test example.
predictions_bool = tf.math.sigmoid(test_outputs.logits).numpy() > 0.5

In [68]:
# Convert the boolean decision matrix into nested lists of plain 0/1 ints,
# the format that `metrics` compares against the true labels.
# The comprehension variables stay local, so no builtin name (such as `bool`)
# gets rebound at notebook scope for later cells.
predictions = [[int(flag) for flag in row] for row in predictions_bool]

In [69]:
# Per-label scores on the held-out test split; each result is a dict keyed by label name.
accuracies, recalls, precisions, f1_scores = metrics(y_test, predictions)

In [70]:
# Print a short report per label, with every score rounded to two decimals.
for column in y.columns:
  print(f"{column} : \n--> accuracy = {accuracies[column]:.2f} // f1_scores = {f1_scores[column]:.2f} \n---[ recall = {recalls[column]:.2f} \n---[ precision = {precisions[column]:.2f} \n")

education : 
--> accuracy = 0.99 // f1_scores = 0.98 
---[ recall = 1.00 
---[ precision = 0.96 

entertainment : 
--> accuracy = 0.99 // f1_scores = 0.99 
---[ recall = 0.98 
---[ precision = 1.00 

technology : 
--> accuracy = 0.99 // f1_scores = 0.99 
---[ recall = 0.98 
---[ precision = 1.00 



In [73]:
# Persist the fine-tuned model and its tokenizer to separate directories.
model.save_pretrained("/content/model")
tokenizer.save_pretrained("/content/tokenizer")

('/content/tokenizer/tokenizer_config.json',
 '/content/tokenizer/special_tokens_map.json',
 '/content/tokenizer/vocab.txt',
 '/content/tokenizer/added_tokens.json',
 '/content/tokenizer/tokenizer.json')