<a href="https://colab.research.google.com/github/BrouthenKamel/JunctionX_topic_classification_model/blob/main/JunctionX_topic_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras as keras
import sklearn

### Building the dataset

In [3]:
# Absolute Colab paths of the raw corpora: one plain-text file per topic.
business_file = "/content/topics/business.txt"
education_file = "/content/topics/education.txt"
entertainment_file = "/content/topics/entertainment.txt"
health_file = "/content/topics/health.txt"
nature_file = "/content/topics/nature.txt"
philosophy_file = "/content/topics/philosophy.txt"
sports_file = "/content/topics/sports.txt"
technology_file = "/content/topics/technology.txt"

In [4]:
# Topic labels and their source files. The two lists are parallel:
# names[i] is the label given to every line read from file_paths[i].
names = ["business", "education", "entertainment", "health", "nature", "philosophy", "sports", "technology"]
file_paths = [business_file, education_file, entertainment_file, health_file, nature_file, philosophy_file, sports_file, technology_file]

In [5]:
# Load every topic file into its own DataFrame (columns: text, label),
# keyed by topic name. Each line of a file is one sample.
dataframes = dict()

for topic_name, file_path in zip(names, file_paths):
  # Explicit encoding so the result does not depend on the platform default.
  with open(file_path, 'r', encoding='utf-8') as f:
    # rstrip('\n') removes only the line terminator. The previous text[:-1]
    # also cut the last character of a final line without a trailing newline.
    texts = [line.rstrip('\n') for line in f]
  dataframes[topic_name] = pd.DataFrame(
      {'text': texts, 'label': [topic_name] * len(texts)},
      columns=['text', 'label'],
  )

In [14]:
# Stack the per-topic frames in `names` order with a single concat call.
# Growing a frame with pd.concat inside a loop re-copies all accumulated rows
# on every iteration, and seeding it with an empty frame is unnecessary.
# Each frame keeps its own row index, exactly as before.
dataset = pd.concat([dataframes[label_name] for label_name in names])

In [17]:
# Shuffle all rows (frac=1 keeps every row); the fixed random_state makes the order reproducible.
dataset = dataset.sample(frac=1, random_state=42)

In [18]:
# Persist the shuffled dataset as a CSV file on the Colab filesystem.
dataset.to_csv('/content/dataset.csv')

### The new dataset

In [None]:
# Display the shuffled dataset (text + label columns).
dataset

In [21]:
# One-hot encode the topic label: for every topic build a 0/1 list aligned
# with the dataset rows (1 where the row's label equals that topic).
row_labels = dataset.label.values
columns = {
    topic: [1 if row_label == topic else 0 for row_label in row_labels]
    for topic in names
}

In [25]:
# Attach one 0/1 indicator column per topic to the dataset (adds columns to `dataset` in place).
for column_name in names:
  dataset[column_name] = columns[column_name]

In [26]:
# Display the dataset again, now including the per-topic indicator columns.
dataset

Unnamed: 0,text,label,business,education,entertainment,health,nature,philosophy,sports,technology
216,The study of philosophy can help us to develop...,philosophy,0,0,0,0,0,1,0,0
244,Philosophy provides us with a way to explore t...,philosophy,0,0,0,0,0,1,0,0
40,What is the role of wilderness therapy in prom...,nature,0,0,0,0,1,0,0,0
295,Badminton is a fast-paced racquet sport that r...,sports,0,0,0,0,0,0,1,0
45,What is the role of creativity in education?,education,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
232,Philosophy can help us to understand the natur...,philosophy,0,0,0,0,0,1,0,0
8,What are some common misconceptions about nature?,nature,0,0,0,0,1,0,0,0
43,What is the relationship between natural disas...,nature,0,0,0,0,1,0,0,0
207,Nature provides a space for us to connect with...,nature,0,0,0,0,1,0,0,0


### Data cleaning

In [27]:
import string
# Show the set of characters that remove_punct strips from the text.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [28]:
def remove_punct(text):
    """Return `text` with every character listed in `string.punctuation` removed."""
    # A translation table that maps a character to None deletes that character.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(drop_table)

In [29]:
# Strip punctuation from every sample; this overwrites the original 'text' column.
dataset['text'] = dataset["text"].apply(remove_punct)

In [30]:
import nltk
# Fetch the NLTK stop-word corpus so that stopwords.words() can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words as a set, for fast membership tests in remove_stopwords.
stop = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [31]:
def remove_stopwords(text):
    """Lower-case `text`, drop every word found in `stop`, and re-join with single spaces."""
    lowered_words = (word.lower() for word in text.split())
    return " ".join(word for word in lowered_words if word not in stop)

In [32]:
# Build the model input column: lower-cased text with English stop words removed.
dataset['clean_text'] = dataset["text"].apply(remove_stopwords)

In [33]:
# Keep only ASCII letters and spaces (drops digits and any remaining symbols).
# regex=True must be explicit: in pandas >= 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would make this step a no-op.
dataset['clean_text'] = dataset['clean_text'].str.replace(r'[^a-zA-Z ]', '', regex=True)

  dataset['clean_text'] = dataset['clean_text'].str.replace(r'[^a-zA-Z ]', '')


### Vocabulary

In [35]:
from collections import Counter

def counter_word(text_col):
    """Count how often each whitespace-separated word occurs across `text_col`.

    `text_col` is a pandas column of strings (its `.values` are iterated);
    the result is a `collections.Counter` mapping word -> number of occurrences.
    """
    return Counter(
        word
        for sentence in text_col.values
        for word in sentence.split()
    )

In [36]:
# Word frequencies over the cleaned text: print the vocabulary size,
# then display the five most frequent words.
counter = counter_word(dataset["clean_text"])
print(len(counter))
counter.most_common(5)

3052


[('help', 300),
 ('businesses', 233),
 ('us', 229),
 ('sports', 179),
 ('nature', 164)]

### Splitting the dataset

In [37]:
# Features: the cleaned text column. Targets: everything that is left once the
# raw text, the cleaned text and the string label are dropped, i.e. the
# per-topic 0/1 indicator columns.
x = dataset.loc[:, "clean_text"]
y = dataset.drop(["text", "clean_text", "label"], axis=1)

In [None]:
pip install scikit-multilearn

In [41]:
from skmultilearn.model_selection import iterative_train_test_split

In [42]:
# Hold out 20% of the samples. The texts are reshaped to a single-column 2-D
# array before being passed to iterative_train_test_split, and the results are
# unpacked in the order x_train, y_train, x_test, y_test.
x_train, y_train, x_test, y_test = iterative_train_test_split(x.values.reshape(-1,1), y.values, test_size = 0.2)

In [43]:
# Sanity check of the split: per label, the share of positive samples that
# landed in the test set (expected to be close to test_size = 0.2).
y_fractions = y_test.sum(axis=0) / y.sum(axis=0)
print(y_fractions)

business         0.198606
education        0.200772
entertainment    0.200000
health           0.199324
nature           0.200627
philosophy       0.200000
sports           0.199461
technology       0.198697
dtype: float64


### Predictions & Evaluation metric

In [44]:
def get_predictions(prediction_probas, threshold):
  """Binarise per-label probabilities.

  For every row of `prediction_probas`, each probability strictly greater than
  `threshold` becomes 1 and every other one becomes 0. Returns a list of lists.
  """
  return [
      [1 if proba > threshold else 0 for proba in row]
      for row in prediction_probas
  ]

In [45]:
def metrics(y_test, predictions, label_names=None):
  """Compute per-label accuracy, recall, precision and F1 for multi-label predictions.

  Args:
    y_test: 2-D array of 0/1 ground-truth indicators, one row per sample and
      one column per label.
    predictions: 2-D array-like of 0/1 predicted indicators, same layout.
    label_names: optional sequence with one name per column, used as the keys
      of the returned dicts. Defaults to the columns of the notebook-level
      label frame `y`, which was previously the only (implicit) option.

  Returns:
    (accuracies, recalls, precisions, f1_scores): four dicts keyed by label
    name. Any ratio whose denominator is zero is reported as 0.
  """
  if label_names is None:
    # Backward-compatible default: fall back to the global label frame.
    label_names = y.columns

  truth = np.asarray(y_test)
  predicted = np.asarray(predictions)
  if predicted.size == 0:
    # An empty prediction list has no column axis; give it the truth's layout.
    predicted = predicted.reshape(truth.shape)
  n_samples = truth.shape[0]

  accuracies, recalls, precisions, f1_scores = {}, {}, {}, {}
  for column_index in range(truth.shape[1]):
    name = label_names[column_index]
    actual_pos = truth[:, column_index] == 1
    actual_neg = truth[:, column_index] == 0
    predicted_pos = predicted[:, column_index] == 1
    predicted_neg = predicted[:, column_index] == 0

    # Confusion-matrix counts, converted to plain ints so the ratios are Python floats.
    true_positives = int(np.sum(actual_pos & predicted_pos))
    true_negatives = int(np.sum(actual_neg & predicted_neg))
    false_negatives = int(np.sum(actual_pos & predicted_neg))
    false_positives = int(np.sum(actual_neg & predicted_pos))

    accuracies[name] = (true_positives + true_negatives) / n_samples if n_samples else 0

    actual_total = true_positives + false_negatives
    recall = true_positives / actual_total if actual_total else 0
    predicted_total = true_positives + false_positives
    precision = true_positives / predicted_total if predicted_total else 0

    recalls[name] = recall
    precisions[name] = precision
    # Harmonic mean of precision and recall; 0 when both are 0.
    f1_scores[name] = 2 * (recall * precision) / (recall + precision) if (recall + precision) != 0 else 0
  return accuracies, recalls, precisions, f1_scores

### Model

In [None]:
!pip install transformers

In [47]:
import tensorflow as tf
import transformers

# Tokenizer matching the pretrained 'bert-base-uncased' checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')

# Same checkpoint with a sequence-classification head that has one output per label column of `y`.
model = transformers.TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=y.shape[1])

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# Training hyperparameters.
batch_size = 64
learning_rate = 1e-4
num_epochs = 20
# Disabled early-stopping idea, kept for reference:
# callback_threshold = 0.1 // callbacks=[LossCallback(callback_threshold)]

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# from_logits=True: the loss receives the model's raw logits; a sigmoid is
# applied separately when predictions are computed.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [49]:
# x_train is a single-column 2-D array; flatten it to a plain list of strings for the tokenizer.
train_texts = x_train.flatten().tolist()
train_labels = y_train
# Truncate over-long texts and pad the rest so all sequences share one length.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

# (features dict, labels) pairs, shuffled with a buffer covering the whole training set, then batched.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels)).shuffle(len(train_labels)).batch(batch_size)

In [50]:
# Fine-tune the model on the training batches for num_epochs epochs.
model.fit(train_dataset, epochs=num_epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f11481c3ac0>

In [51]:
# Tokenize the held-out texts the same way as the training texts and batch them (no shuffling).
test_texts = x_test.flatten().tolist()
test_labels = y_test
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels)).batch(batch_size)
# The result exposes the raw per-label scores as `.logits`.
test_outputs = model.predict(test_dataset)



In [52]:
# Turn logits into per-label probabilities with a sigmoid, then threshold at 0.5 to get a boolean matrix.
predictions_bool = tf.math.sigmoid(test_outputs.logits).numpy() > 0.5

In [53]:
# Convert the boolean decision matrix into nested lists of 0/1 ints, the format
# passed to metrics(). Done without an explicit loop: the previous version used
# a loop variable named `bool`, which rebound the builtin at notebook scope.
predictions = predictions_bool.astype(int).tolist()

In [54]:
# Per-label evaluation on the held-out set; each result is a dict keyed by label name.
accuracies, recalls, precisions, f1_scores = metrics(y_test, predictions)

In [None]:
# Print accuracy, F1, recall and precision for every label, rounded to two decimals.
for column in y.columns:
  print(f"{column} : \n--> accuracy = {accuracies[column]:.2f} // f1_scores = {f1_scores[column]:.2f} \n---[ recall = {recalls[column]:.2f} \n---[ precision = {precisions[column]:.2f} \n")

In [73]:
# NOTE(review): looks like a leftover debugging inspection; consider removing this cell.
type(x_test)

numpy.ndarray

In [None]:
pip install openai

In [120]:
import openai

In [121]:
import os
from getpass import getpass

# SECURITY: credentials must never be hard-coded in a notebook. The API key that
# used to be committed in this cell is exposed and should be revoked.
# Both values are now read from the environment; if no key is set, the user is
# prompted for one without echoing it.
openai.organization = os.environ.get("OPENAI_ORGANIZATION") or None
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [96]:
def get_label(string):
  """Predict the single most likely topic name for one input text.

  Args:
    string: the text to classify. (The parameter name shadows the `string`
      module inside this function; it is kept so existing callers still work.)

  Returns:
    The entry of `names` whose predicted probability is the highest.
  """
  test_texts = np.array([string]).flatten().tolist()
  # Dummy all-zero labels so the dataset has the same (features, labels)
  # structure as the one used for the test-set predictions.
  test_labels = [[0 for _ in y.columns]]

  test_encodings = tokenizer(test_texts, truncation=True, padding=True)
  test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels)).batch(batch_size)
  test_outputs = model.predict(test_dataset)

  # Per-label probabilities of the single input row.
  probabilities = tf.math.sigmoid(test_outputs.logits).numpy()[0]

  # Take the argmax of the probabilities themselves. Thresholding at 0.5 first
  # and taking the argmax of the resulting 0/1 vector returned names[0] whenever
  # no label passed the threshold, and the first passing label (not the most
  # likely one) whenever several did.
  target = names[int(np.argmax(probabilities))]

  # NOTE(review): the training text went through remove_punct / remove_stopwords,
  # while the raw input is classified here as-is; confirm this is intended.
  return target

In [149]:
def program():
  """Interactive demo: classify a user prompt, then ask the chat model for a conversation opener.

  Prints the supported topics, reads one line from stdin, prints the topic
  returned by get_label(), sends a prompt built from that topic to
  `openai.ChatCompletion` and prints the reply. Returns None.

  NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 client
  interface; confirm it matches the installed openai version.
  """
  print("Hello user\n")
  print("Wanna play with AI ? here are topics I can detect you are talking about :\n")
  print(names)
  print("\nNow, you can give me a prompt and I can detect the topic !\n")

  label = get_label(input())
  print(f"\nThe topic you are talking about is : {label}\n")
  print("Now I will generate a discussion about that : \n")

  # The detected topic is injected into the user message; the system message stays generic.
  user_prompt = f"Imagine you will start talking to a person interested in {label}, Start a conversation in a friendly way introducing an information directly that may interest the user, Don't put the person's interest explicit. be as human as possible, try to include an information for the user to learn in a funny way"
  chat_messages = [
      {"role": "system", "content": "You are a chatbot."},
      {"role": "user", "content": user_prompt},
  ]
  response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=chat_messages)

  # Only the first returned choice is shown.
  reply = response["choices"][0]["message"]["content"]
  print(f"ChatBot :\n{reply}")

In [150]:
# Run the interactive demo (waits for one line of user input).
program()

Hello user

Wanna play with AI ? here are topics I can detect you are talking about :

['business', 'education', 'entertainment', 'health', 'nature', 'philosophy', 'sports', 'technology']

Now, you can give me a prompt and I can detect the topic !

Gaming

The topic you are talking about is : technology

Now I will generate a discussion about that : 

ChatBot :
Hi there! Did you know that studies have shown that the average person spends about 6 hours a day online? That's a lot of time spent on the internet! With such a big portion of our lives spent in the digital world, it's no wonder that technology is advancing at a rapid pace. Do you have any favorite tech gadgets or software that you love to use? I'm always looking for fun ways to learn more about the latest tech trends.


In [151]:
# Persist the fine-tuned model and its tokenizer to separate directories on the Colab filesystem.
model.save_pretrained("/content/model")
tokenizer.save_pretrained("/content/tokenizer")

('/content/tokenizer/tokenizer_config.json',
 '/content/tokenizer/special_tokens_map.json',
 '/content/tokenizer/vocab.txt',
 '/content/tokenizer/added_tokens.json',
 '/content/tokenizer/tokenizer.json')