In [None]:
# enable the IPython autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to the src.* files are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

In [None]:
import ast
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.preprocessing import LabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from torch.utils.data import TensorDataset, DataLoader

from src.evaluation import evaluate
from src.model import get_embedding, DivideAndConquerFNN, GPT_3, classifiers_predict
from src.train import train_fnns, classifiers_fit
from src.utils import load_pickle, save_pickle, split_data

warnings.filterwarnings('ignore')

In [None]:
# PATHS
# pickled raw dataset, loaded at the start of the EDA section
RAW_DATA_PATH = '../data/raw/text_topics.pkl'
# pickled dataframe whose "embedding" and "target" columns are read in the Modeling section
PROCESSED_DATA_PATH = '../data/processed/embedded.pkl'
# JSON export of prompt/completion records written for GPT-3 fine-tuning
FINETUNING_JSON_PATH = '../data/processed/data.json'
# model name passed to GPT_3(..., model=...); only referenced by the commented-out inference cell
FINETUNED_MODEL_NAME = 'ada:ft-personal-2023-03-11-12-31-10'

## EDA & Data preprocessing

In [None]:
# unpickle the data and preview the first rows
# NOTE(review): presumably load_pickle wraps pickle.load — only open pickle files from a trusted source
raw_data = load_pickle(RAW_DATA_PATH)
raw_data.head()

In [None]:
# Discard the "date" and "id" columns.
# They say nothing about the topic of a tweet and would only bias the model:
# at inference time their values fall outside the training distribution.
df = raw_data.drop(["date", "id"], axis="columns")

In [None]:
# flatten the per-sample topic lists into one long list of labels
topics = [label for sample_topics in df.topics for label in sample_topics]
# np.unique returns the distinct labels in sorted order
unique_topics = np.unique(topics)
print(unique_topics, f"there are {len(unique_topics)} unique topics")

In [None]:
# for every topic, count the samples tagged with it, then show the counts as a bar chart
topic_count = [
    int(df.topics.apply(lambda labels: topic in labels).sum())
    for topic in unique_topics
]
plt.figure(figsize=(10, 5))
sns.barplot(x=unique_topics, y=topic_count)
plt.title("Number of samples per topic")
plt.xlabel("Topic")
plt.ylabel("Number of samples")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Build the target column: every topic of a sample is one-hot encoded and the
# resulting rows are summed into a single vector per sample.
lb = LabelBinarizer().fit(unique_topics)
df["target"] = df.topics.apply(lambda labels: lb.transform(labels).sum(axis=0))

In [None]:
# display the unique topic labels as a plain Python list
unique_topics.tolist()

In [None]:
# display the array of unique topic labels
unique_topics

In [None]:
# how many topics is a sample usually tagged with?
plt.figure(figsize=(10, 5))
sns.histplot(x=df.topics.apply(len))
plt.title("Distribution of the number of topics per sample")
plt.xlabel("Number of topics")
plt.ylabel("Number of samples")
plt.show()

In [None]:
# distribution of the number of topics in a sample, one histogram per topic
# The grid is sized from the number of topics instead of being hard-coded to 5x4
# (which is only right for exactly 19 topics), so the cell keeps working when
# the label set changes. With 19 topics the figure is the same as before.
n_cols = 4
n_rows = max(1, -(-len(unique_topics) // n_cols))  # ceiling division, at least one row
# squeeze=False always returns a 2D array of axes, even for a single row
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 3 * n_rows), squeeze=False)
axes = axs.flatten()
for topic, ax in zip(unique_topics, axes):
    has_topic = df.topics.apply(lambda labels: topic in labels)
    sns.histplot(x=df.loc[has_topic, "topics"].apply(len), ax=ax)
    ax.set(xlabel=f"{topic}", ylabel='samples count')
# remove every unused axis left at the end of the grid
for unused_ax in axes[len(unique_topics):]:
    unused_ax.remove()
plt.show()


In [None]:
# preview the dataframe after dropping the unused columns and adding the target column
df.head()

In [None]:
# TODO: data cleaning (not implemented yet)
#   - remove usernames and URLs from the text
#   - remove emojis

### Embeddings

One idea is to use a pre-trained LLM to embed each tweet; we would then use these embeddings to train a classifier (e.g. a feed-forward neural network or an SVM).

In [None]:
# ##########################################################
# ## uncomment this if you want to get your own embeddings##
# ##########################################################

# # get the embedding for each sample
# embeddings = []
# for text in df.text:
#     try:
#         embeddings.append(get_embedding(text))
#     except Exception as err:
#         # report the failure instead of hiding it behind a bare `except:`
#         print("error !", err, "total embedded samples:", len(embeddings))
#         # save the embeddings computed so far so the work is not lost
#         # (save_pickle is used because `pickle` itself is not imported in this notebook)
#         save_pickle(embeddings, f'../data/processed/{len(embeddings)}_embeddings.pkl')
#         break

# # check that the number of embeddings is equal to the number of samples
# # (this fails on purpose if the loop above stopped early)
# assert len(embeddings) == len(df)
# # add the embeddings to the dataframe
# df["embedding"] = embeddings
# # save the processed data
# save_pickle(df, PROCESSED_DATA_PATH)

## Modeling

In [None]:
# read the processed dataframe and turn its embedding / target columns into float32 arrays
df_processed = load_pickle(PROCESSED_DATA_PATH)
x = np.array(df_processed.embedding.to_list(), dtype=np.float32)
y = np.array(df_processed.target.to_list(), dtype=np.float32)
# split into train / validation / test sets, then convert every split to a torch tensor
x_train, x_val, x_test, y_train, y_val, y_test = (
    torch.tensor(split) for split in split_data(x, y, test_size=0.2, val_size=0.1)
)

Task : multi-label classification (one-vs-rest classification)

experiment : feedforward neural network on top of ada-002 embeddings

In [None]:
# model and hyperparameters
input_dim = x_train.shape[1]  # size of one embedding vector
hidden_dim = 16
number_of_labels = len(unique_topics)

classifier = DivideAndConquerFNN(input_dim, hidden_dim, number_of_labels)

batch_size = 16
num_epochs = 10

# training batches are reshuffled every epoch
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Shuffling brings nothing to a validation pass (presumably train_fnns only
# scores these batches); a fixed order keeps the validation loss deterministic.
val_dataset = TensorDataset(x_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

learning_rate = 3e-4
optimizer = torch.optim.Adam(classifier.parameters(), lr=learning_rate)
# one binary cross-entropy loss per label
criterions = [torch.nn.BCELoss() for _ in range(number_of_labels)]

# use the first GPU when available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# train the classifier
# train_fnns receives the model, both data loaders, the optimizer, the per-label
# losses, the number of epochs and the target device
train_fnns(classifier, train_loader, val_loader, optimizer, criterions, num_epochs, device)

In [None]:
# inference and evaluation on the held-out test set
# (this cell used to score the training split, which overstates performance)
classifier.eval()  # switch layers such as dropout / batch-norm to inference behaviour
# run on whatever device the model currently lives on (train_fnns may have moved it)
model_device = next(classifier.parameters()).device
with torch.no_grad():  # no autograd graph is needed for prediction
    outputs = classifier(x_test.to(model_device))
    # presumably one tensor of per-sample probabilities for each label — TODO confirm
    y_pred = [[pred.item() for pred in res] for res in outputs]
# threshold at 0.5, then transpose so the layout matches y_truth
y_pred = (np.array(y_pred) >= 0.5).astype(int).T
y_truth = np.array(y_test).astype(int)

evaluate(y_truth, y_pred)

--------

Task : few-shot learning

Experiment : GPT-3

In [None]:
# few-shot prompt
def prompt(text):
    """Build the few-shot classification prompt for a single tweet.

    The prompt lists the available topics, shows three labelled example tweets
    and ends with `text` followed by an opening bracket for the model to complete.
    """
    return f"""
classify tweets to their topics, available topics are: {str(unique_topics)}
tweet : Barbara Bailey is the queen of broadcast news in central & eastern Kentucky  After growing up watching her anchor the news, it was a great honor to call her coworker & friend. I loved saying “back to you, Barb” while reporting.  Cheers, {{USERNAME}} {{URL}} 
output : ['film_tv_&_video','news_&_social_concern']
tweet : Saints defense was dinged for 4 missed tackles in Week 1, tying the Texans and Colts for the 6th fewest in the NFL  Saints defense credited with 8 pressures (3 sacks), a pressure rate of 20%. Ties Rams for 14th worst in Week 1  per ProFootballReference
output : ['sports']
tweet : Earning gift cards for selling my stuff on @Listia@ ! Join me using code  BBDBPG  for an extra 250 PTS. I just listed this: LARGE KANSAS CITY LOGO MAGNET (PLEASE READ DESCRIPTION) URL
output : ['business_&_entrepreneurs']
tweet : {text}
output : [
"""

In [None]:
# draw a small evaluation set for the few-shot experiment
# a fixed random_state keeps the sampled tweets, and therefore the reported
# metrics, identical from one run of the notebook to the next
gpt_test_set = df.sample(10, random_state=42)
gpt_test_set_x = gpt_test_set.text.to_list()
# targets are stored as arrays; convert each one to a plain list
gpt_test_set_y = gpt_test_set.target.apply(list).to_list()

In [None]:
# inference
# the loop variable is deliberately not called `x`: that name holds the feature
# matrix built in the Modeling section and would be overwritten here
responses = []
for tweet in gpt_test_set_x:
    prediction = GPT_3(prompt(tweet))
    # The completion is model-generated text: parse it as a Python literal
    # rather than eval()-ing it, so it can never execute code.
    try:
        prediction = ast.literal_eval("[" + prediction + "]")
    except (ValueError, SyntaxError) as err:
        raise ValueError(
            f"could not parse the model output as a list of topics: {prediction!r}"
        ) from err
    responses.append(prediction)

In [None]:
# encode every predicted topic list as a single vector (sum of its one-hot rows)
gpt_pred_y = [
    list(lb.transform(response).sum(axis=0))
    for response in responses
]

In [None]:
# evaluation: compare the encoded ground truth of the sampled tweets with the encoded GPT-3 predictions
evaluate(gpt_test_set_y, gpt_pred_y)

Higher-quality examples should be prepared for the few-shot prompt; we could also try chain-of-thought prompting.

--------------

Task : text generation

Experiment : fine-tuning GPT-3 

report on : https://api.wandb.ai/links/dsia/xxv7s8nz

In [None]:
# export prompt/completion records (tweet text -> stringified topic list) for fine-tuning later on
df_ = pd.DataFrame({"prompt": df.text, "completion": df.topics.apply(str)})
df_.to_json(FINETUNING_JSON_PATH, orient="records")

Check the README for the steps to fine-tune GPT-3 with the OpenAI API.

In [None]:
# inference with the fine-tuned model
# disabled: the fine-tuned model's completions do not follow the expected list syntax, so they cannot be parsed
# NOTE(review): if this is re-enabled, prefer a literal parser over eval() — the completion is model-generated text
# responses = []
# for x in gpt_test_set_x:
#     prediction = GPT_3(x + "-> [", model=FINETUNED_MODEL_NAME)
#     prediction = eval("[" + prediction + "]")
#     responses.append(prediction)

<p style="color:#990000"> The fine-tuned model does not respect the expected output syntax, so its completions cannot be parsed :p </p>

--------

Task : one vs rest classification

Experiment : different sklearn classifiers

In [None]:
# one shallow decision tree per topic, all handed to classifiers_fit with the training split
classifiers = [DecisionTreeClassifier(max_depth=3) for _ in range(len(unique_topics))]
classifiers_fit(classifiers, x_train, y_train)

In [None]:
# predict on the held-out test split with the fitted per-topic classifiers
y_pred = classifiers_predict(classifiers, x_test)

In [None]:
# score the decision-tree predictions against the test labels
evaluate(y_test, y_pred)