# Generative AI Text Classification

In [None]:
# imports

import matplotlib.pyplot as plt
import seaborn as sns
import opendatasets as od
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from openai import OpenAI
import os
from dotenv import load_dotenv
import s3fs
import fs_s3fs
import fsspec
import json
from llama_index.core import TreeIndex, SimpleDirectoryReader
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
import tensorflow as tf
import keras
import transformers
import mlflow
import hyperopt as hp
import sphinx
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from tensorflow.keras.optimizers import SGD

In [None]:
# download stopwords

#nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` (imported above from nltk.corpus)
# to a plain list of words. Later cells use the list, so the name is kept.
stopwords = nltk.corpus.stopwords.words('english')
newStopwords = ['b','lt','gt','n','u','ap','reuters'] # Add stopwords

# Add all extra tokens in one call instead of appending them one by one.
stopwords.extend(newStopwords)

In [None]:
stopwords

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
# Download dataset from Kaggle

dataset = "https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset/data"
od.download(dataset)

In [None]:
# Read dataset, import only 30000 rows of data

df = pd.read_csv(r'C:\Users\nickr\OneDrive\Documents\GitHub\generative-ai-text-summarization\data\ag-news-classification-dataset\ag_news.csv',nrows=30000)

In [None]:
# Check shape of dataframe

df.shape

In [None]:
# Confirm importation

df.head(8)

In [None]:
# Confirm shape

df.shape

In [None]:
# Drop Description column

df = df.drop('Description',axis=1)

# Data Cleaning and Preprocessing

In [None]:
# find null values and datatypes

df.info(memory_usage='deep')

There are no null values in the df dataset.

In [None]:
# check for duplicates

df.duplicated().sum()

There are 1354 duplicate rows in the df dataset.

In [None]:
# Drop duplicates

df = df.drop_duplicates()

In [None]:
# Cleaning data set html, special, and non-textual characters

def cleaning_text(text):
    """
    Strip HTML tags, non-letter characters and isolated single letters.

    in -> text (str): raw text
    out -> str: text with HTML tags deleted; every character that is neither
           an ASCII letter nor whitespace is replaced by one space, and a
           one-letter word together with one adjacent space is replaced by
           one space. Runs of spaces are NOT collapsed.
    """
    # Remove HTML tags (non-greedy, so each tag is matched on its own)
    without_tags = re.sub('<.*?>', '', text)
    # Remove special characters, non-textual characters and single letters.
    # Inside a raw string the word boundary must be written \b; a doubled
    # backslash would match a literal backslash followed by the letter "b".
    return re.sub(r'[^a-zA-Z\s]|\b[A-Za-z] \b|\b [A-Za-z]\b', ' ', without_tags)

In [None]:
# Apply text cleaning to the Title column (Description was dropped earlier)

df['Title'] = df['Title'].apply(cleaning_text)

In [None]:
# Check that the function worked

df.head(7)

Note that in this data set, 1 represents World News, 2 represents Sports News, 3 represents Business News, and 4 represents Sci/Tech News.

In [None]:
# Create a function to remove stop words

stop_words = set(stopwords)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Lowercase the text, tokenize it and drop every stop word.

    Relies on the module-level `stop_words` set and NLTK's `word_tokenize`.

    in -> text (str)
    out -> str: the remaining tokens joined by single spaces
    """
    lowered = text.lower()
    # Keep only the tokens that are not listed in the stop-word set
    kept_tokens = filter(lambda token: token not in stop_words, word_tokenize(lowered))
    return ' '.join(kept_tokens)

In [None]:
# apply preprocessing to text in Title

df['Title'] = df['Title'].apply(preprocess_text)

In [None]:
# Check that the function worked

df.head(7)

In [None]:
# convert to CSV for ease of use in future

cleaned_data_file = r'C:\Users\nickr\OneDrive\Desktop\CapstoneTechX\ag_news_cleaned\cleaned_ag_news.csv'
df.to_csv(cleaned_data_file, index=False)

# Data Splitting

In [None]:
# Split training data into training and validation data 

df_train, df_val = train_test_split(df, test_size=.15, random_state=42)

In [None]:
# Write the training and validation splits to CSV (the validation split is saved as test.csv)

df_train.to_csv(os.path.join(r'C:\Users\nickr\OneDrive\Desktop\CapstoneTechX\ag_news_cleaned', 'train.csv'), index=False)
df_val.to_csv(os.path.join(r'C:\Users\nickr\OneDrive\Desktop\CapstoneTechX\ag_news_cleaned', 'test.csv'), index=False)

In [None]:
# Create feature data directory

feature_data_dir = r'C:\Users\nickr\OneDrive\Desktop\CapstoneTechX\features'
os.makedirs(feature_data_dir, exist_ok=True)

In [None]:
# TF-IDF Vectorization for Title

tfidf_vectorizer = TfidfVectorizer(max_features=3000)  # we can play around with this. This was an arbitrary value
train_title_features = tfidf_vectorizer.fit_transform(df_train['Title'])
test_title_features = tfidf_vectorizer.transform(df_val['Title'])

In [None]:
# look at one of the matrices the vectorizer produces

# print(df_train['Title'][98])
# print(train_title_features.toarray()[98]) 

Note: the vectorizer assigns each word in a title a TF-IDF weight between 0 and 1. The closer the weight is to 1, the more distinctive that word is for the title relative to the rest of the corpus.

In [None]:
# Print our features

features = tfidf_vectorizer.get_feature_names_out()
print(tfidf_vectorizer.vocabulary_, end=' ')

In [None]:
# Confirm feature number

print(len(features))

In [None]:
# Save the TF-IDF feature matrices

#pd.DataFrame(train_desc_features.toarray()).to_csv(os.path.join(feature_data_dir, 'train_desc_features.csv'), index=False)
#pd.DataFrame(test_desc_features.toarray()).to_csv(os.path.join(feature_data_dir, 'test_desc_featuress.csv'), index=False)
#pd.DataFrame(train_title_features.toarray()).to_csv(os.path.join(feature_data_dir, 'train_title_features.csv'), index=False)
#pd.DataFrame(test_title_features.toarray()).to_csv(os.path.join(feature_data_dir, 'test_title_featuress.csv'), index=False)

# EDA

In [None]:
# Sum the counts of each index
class_counts = df_train['Class Index'].value_counts().reset_index()

# Visualize class distribution
plt.figure(figsize=(10, 6))
sns.barplot(data=class_counts, x='Class Index', y='count', hue='count')
plt.title('Class Distribution')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Summary statistics of class distribution

class_balance = class_counts.describe()
print("Class Balance:")
print(class_balance)

Note that there is a fairly even distribution of categories in our training dataset. No further resampling techniques needed.

In [None]:
# Splitting dataframes by category

index_one = df_train['Class Index'] == 1
df_index_one = df_train[index_one]

index_two = df_train['Class Index'] == 2
df_index_two = df_train[index_two]

index_three = df_train['Class Index'] == 3
df_index_three = df_train[index_three]

index_four = df_train['Class Index'] == 4
df_index_four = df_train[index_four]

In [None]:
# Gather most common words for category World News

index_list_one = ' '.join(df_index_one['Title']).split()
word_counts_one = Counter(index_list_one)
one_common_words = word_counts_one.most_common(30)
print("\nWorld News - Most Common Words:")
print(one_common_words)

In [None]:
# Gather most common words for category Sports News

index_list_two = ' '.join(df_index_two['Title']).split()
word_counts_two = Counter(index_list_two)
two_common_words = word_counts_two.most_common(30)
print("\nSports News - Most Common Words:")
print(two_common_words)

In [None]:
# Gather most common words for category Business News

index_list_three = ' '.join(df_index_three['Title']).split()
word_counts_three = Counter(index_list_three)
three_common_words = word_counts_three.most_common(30)
print("\nBusiness News - Most Common Words:")
print(three_common_words)

In [None]:
# Gather most common words for category Sci/Tech News

index_list_four = ' '.join(df_index_four['Title']).split()
word_counts_four = Counter(index_list_four)
four_common_words = word_counts_four.most_common(30)
print("\nSci/Tech News - Most Common Words:")
print(four_common_words)

In [None]:
# Visualize word frequency for World News

plt.figure(figsize=(10, 6))
sns.barplot(x=[word[0] for word in one_common_words], y=[word[1] for word in one_common_words])
plt.title('Most Common Words in World News')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.xticks(rotation=60)
plt.show()

In [None]:
# Visualize word frequency for Sports News

plt.figure(figsize=(10, 6))
sns.barplot(x=[word[0] for word in two_common_words], y=[word[1] for word in two_common_words])
plt.title('Most Common Words in Sports News')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.xticks(rotation=60)
plt.show()

In [None]:
# Visualize word frequency for Business News

plt.figure(figsize=(10, 6))
sns.barplot(x=[word[0] for word in three_common_words], y=[word[1] for word in three_common_words])
plt.title('Most Common Words in Business News')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.xticks(rotation=60)
plt.show()

In [None]:
# Visualize word frequency for Sci/Tech News

plt.figure(figsize=(10, 6))
sns.barplot(x=[word[0] for word in four_common_words], y=[word[1] for word in four_common_words])
plt.title('Most Common Words in Sci/Tech News')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.xticks(rotation=60)
plt.show()

In [None]:
# Get title lengths for each row
df_train['Title Length'] = df_train['Title'].apply(lambda x: len(x.split()))
print(df_train['Title Length'])

# Modifying df_val for future use
df_val['Title Length'] = df_val['Title'].apply(lambda x: len(x.split()))

In [None]:
# Title length analysis

plt.figure(figsize=(10, 6))
sns.histplot(df_train['Title Length'], bins=10)
plt.title('Distribution of Title Lengths')
plt.xlabel('Title Length')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

Note that the distribution of our title lengths is right-skewed.

In [None]:
#get TF IDF values as a dataframe

tfidf_df = pd.DataFrame(train_title_features.toarray())

In [None]:
# Transform matrix to array, flatten, and removes zeros

# The TF-IDF features are stored as a sparse matrix, so the explicitly stored
# entries (`.data`) are the only candidates for non-zero scores. Reading them
# directly avoids building the full dense (rows x features) array first.
# The order of the values may differ from a dense flatten, which does not
# matter for the histogram drawn from them.
tfidf_df = train_title_features.data
tfidf_df = tfidf_df[tfidf_df != 0]  # drop any explicitly stored zeros

In [None]:
# Distribution of non-zero TF-IDF Scores

sns.histplot(tfidf_df, bins=10, kde=True)
plt.xlabel("TF-IDF Score")
plt.ylabel("Number of Words")
plt.title("Distribution of TF-IDF Scores in the Corpus")
plt.xticks(rotation=45)  # Optional: Rotate x-axis labels for long feature names
plt.show()

Note that our TF-IDF score distribution is right-skewed.

In [None]:
from feature_engine.outliers import Winsorizer

In [None]:
# Plot boxplot to find skewness

sns.boxplot(df_train['Title Length'], orient='h')
plt.xlabel("Title Length")
plt.title("Boxplot of Title Length")  # Optional: Rotate x-axis labels for long feature names
plt.show()

This confirms the right skew seen in the title-length histogram above.

In [None]:
# Winsorize the text length to handle outliers

# Cap only 'Title Length'. When `variables` is omitted, Winsorizer selects
# every numerical column of the frame it is fitted on, which here would also
# include the 'Class Index' labels.
capper = Winsorizer(capping_method='gaussian', tail='right', fold=2, variables=['Title Length'])
capper.fit(df_train)

In [None]:
# Check where the right tail will be capped

capper.right_tail_caps_

In [None]:
# Transform both train and validation data frames

train_t = capper.transform(df_train)
test_t = capper.transform(df_val)

In [None]:
# Check to see if the transform worked appropriately

sns.boxplot(train_t['Title Length'], orient='h')
plt.xlabel("Title Length")
plt.title("Boxplot of Title Length")  # Optional: Rotate x-axis labels for long feature names
plt.show()

Right-tail outliers are no longer present in the winsorized training data.

# Transformer Model Architecture

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, GlobalAveragePooling1D, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
from transformers import DistilBertTokenizer

In [None]:
# Positional Encoding
def get_positional_encoding(seq_length, d_model):
    """
    Build a sinusoidal positional-encoding tensor.

    in -> seq_length: number of positions, d_model: encoding width
    out -> float32 tensor of shape (1, seq_length, d_model)

    The sine terms (even columns of the angle matrix) and the cosine terms
    (odd columns) are concatenated as two halves along the last axis; they
    are not interleaved.
    """
    depth = tf.cast(d_model, tf.float32)
    pos_column = tf.range(seq_length, dtype=tf.float32)[:, tf.newaxis]  # (seq_length, 1)
    dim_row = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]        # (1, d_model)

    # Each pair of columns (2k, 2k+1) shares the same frequency
    exponents = (2 * (dim_row // 2)) / depth
    angles = pos_column * (1 / tf.pow(10000, exponents))                # (seq_length, d_model)

    sin_part = tf.math.sin(angles[:, 0::2])
    cos_part = tf.math.cos(angles[:, 1::2])

    # Leading axis of size 1 so the encoding broadcasts over the batch
    return tf.concat([sin_part, cos_part], axis=-1)[tf.newaxis, ...]


def transformer_block(x, num_heads, d_model, dff, rate, training):
    """
    Apply one encoder block: self-attention, then a two-layer feed-forward
    network, each followed by dropout, a residual add and layer normalization.

    in -> x: input tensor; num_heads: attention heads; d_model: width of the
          block output; dff: hidden width of the feed-forward network;
          rate: dropout rate; training: forwarded to both Dropout calls
    out -> tensor produced by the final LayerNormalization

    New layers are created on every call, so blocks do not share weights.
    """
    # Self-attention sub-layer (query and value are both x).
    # NOTE(review): key_dim is the per-head size; d_model // num_heads is the
    # more common choice - confirm that key_dim=d_model is intended.
    attended = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(x, x)
    attended = Dropout(rate)(attended, training=training)
    attn_residual = LayerNormalization(epsilon=1e-6)(x + attended)

    # Position-wise feed-forward sub-layer
    hidden = Dense(dff, activation='relu')(attn_residual)
    projected = Dense(d_model)(hidden)
    projected = Dropout(rate)(projected, training=training)
    return LayerNormalization(epsilon=1e-6)(attn_residual + projected)


# Build the Transformer Model for Text Classification
def build_model(max_len_input, vocab_size, num_heads=8, d_model=128, dff=512, rate=0.1,
                num_layers=4, num_classes=1):
    """
    Build and compile a Transformer-encoder text classifier.

    in -> max_len_input: length of the (padded) token-id sequences
          vocab_size: number of rows in the embedding table; must be larger
                      than the highest token id fed to the model
          num_heads, d_model, dff, rate: forwarded to transformer_block
          num_layers: number of stacked encoder blocks
          num_classes: 1 builds a single sigmoid unit trained with
                       binary_crossentropy (the previous behaviour);
                       >1 builds a softmax over num_classes trained with
                       sparse_categorical_crossentropy
    out -> compiled keras Model

    With num_classes > 1 the labels must be integer class ids in
    [0, num_classes). The Class Index column of this data set runs from
    1 to 4, so it has to be shifted to 0-3 before training.
    """
    # Input
    inputs = Input(shape=(max_len_input,), name="input")
    embedding = Embedding(vocab_size, d_model, name="embedding")(inputs)
    pos_encoding = get_positional_encoding(max_len_input, d_model)
    embedding += pos_encoding

    # Transformer Encoder
    encoder_output = embedding
    for _ in range(num_layers):
        # training=None lets Keras choose the mode: dropout is active during
        # fit() and disabled for inference. Hard-coding True kept dropout on
        # at prediction time as well.
        encoder_output = transformer_block(encoder_output, num_heads, d_model, dff, rate, training=None)

    # Global Average Pooling
    pooled_output = GlobalAveragePooling1D()(encoder_output)

    # Output layer and matching loss
    if num_classes == 1:
        outputs = Dense(1, activation="sigmoid")(pooled_output)
        loss = "binary_crossentropy"
    else:
        outputs = Dense(num_classes, activation="softmax")(pooled_output)
        loss = "sparse_categorical_crossentropy"

    # Define the model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer="adam", loss=loss, metrics=["accuracy"])

    return model

# Parameters
max_len_input = 100
# The prediction cell feeds this model token ids produced by the
# distilbert-base-uncased tokenizer, so the embedding table must cover that
# vocabulary (30522 entries - TODO confirm against tokenizer.vocab_size).
vocab_size = 30522
num_classes = 4  # World, Sports, Business, Sci/Tech

# Create the model
model = build_model(max_len_input, vocab_size, num_classes=num_classes)
model.summary()


In [1]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_data(input_text, tokenizer, max_len_input):
    """
    Encode a text into a fixed-length list of token ids.

    in -> input_text: text to encode
          tokenizer: object whose encode(text, max_length=..., truncation=True)
                     returns a list of ids
          max_len_input: target length
    out -> list of ids, truncated by the tokenizer to max_len_input and
           right-padded with 0 up to max_len_input
    """
    token_ids = tokenizer.encode(input_text, max_length=max_len_input, truncation=True)
    missing = max_len_input - len(token_ids)
    # A non-positive `missing` yields an empty padding list
    return token_ids + [0 for _ in range(missing)]
    
def predict_class(input_text, tokenizer, model, max_len_input):
    """
    Predict the class index of a single text.

    in -> input_text: text to classify
          tokenizer, max_len_input: forwarded to preprocess_data
          model: callable model returning one row of scores per input row
    out -> numpy integer: index of the highest score, or 0/1 obtained by
           thresholding at 0.5 when the model has a single output unit

    The returned value is a 0-based position in the model output, not the
    1-4 Class Index used in the data set.
    """
    input_ids_padded = preprocess_data(input_text, tokenizer, max_len_input)
    # Wrapping in a list adds the batch dimension
    input_ids_tensor = tf.convert_to_tensor([input_ids_padded])
    # Call the model explicitly in inference mode
    scores = model(input_ids_tensor, training=False)[0]
    if scores.shape[-1] == 1:
        # argmax over a single unit is always 0, so threshold the score instead
        return tf.cast(scores[0] > 0.5, tf.int64).numpy()
    return tf.argmax(scores).numpy()

predicted_class = predict_class("The US is in running out of oil", tokenizer, model, max_len_input)
print(predicted_class)

NameError: name 'DistilBertTokenizer' is not defined

In [None]:
from transformers import TrainingArguments
from keras.losses import SparseCategoricalCrossentropy
import tftrainer
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DataCollatorWithPadding
from torch import cuda
from datasets import Dataset

In [None]:
# Remove title length from df_val
# Only use if the Winsorizer was utilized in Feature Analysis

# df_val = df_val.drop(['Title Length'], axis=1)

In [None]:
# Remove title length from df_train
# Only use if the Winsorizer was utilized in Feature Analysis

# df_train = df_train.drop(['Title Length'], axis=1)

In [None]:
# Rename columns

df_train.rename(columns = {'Title':'text','Class Index':'labels'}, inplace = True)
df_val.rename(columns = {'Title':'text','Class Index':'labels'}, inplace = True)

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Initialize model and tokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4, problem_type="multi_label_classification").to(device)

In [None]:
# Example DistilBert

inputs = tokenizer("Hello, my dog is cute", padding='longest', return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
# Print logits
logits

In [None]:
# Print predicted class IDs

predicted_class_ids = torch.arange(0, logits.shape[-1])[torch.sigmoid(logits).squeeze(dim=0) > 0.5]
print(predicted_class_ids)

In [None]:
labels = torch.sum(
    torch.nn.functional.one_hot(predicted_class_ids[None, :].clone(), num_classes=4), dim=1
).to(torch.float)
loss = model(**inputs, labels=labels).loss

In [None]:
# Convert df_train and df_val to a HuggingFace dataset for easier tokenization

hugging_train = Dataset.from_pandas(df_train)
hugging_val = Dataset.from_pandas(df_val)

In [None]:
hugging_train

In [None]:
hugging_val

In [None]:
def preprocess_function(examples):
    """
    Tokenize the "text" entry of a batch with the module-level tokenizer.

    Truncation is enabled; padding is not requested here.

    in -> examples: mapping with a "text" entry (a batch of rows when called
          through Dataset.map(batched=True))
    out -> the tokenizer output for those texts
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [None]:
tokenized_train = hugging_train.map(preprocess_function, batched=True)
tokenized_val = hugging_val.map(preprocess_function, batched=True)

In [None]:
tokenized_val.shape

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """
    Compute accuracy and macro-averaged F1 for one evaluation pass.

    in -> eval_pred: unpacks into (logits, labels)
    out -> dict with the keys "accuracy" and "f1_macro"

    Assumes `labels` holds one integer class id per example - TODO confirm
    against the model's problem_type.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_macro": f1_score(labels, predictions, average="macro"),
    }


# NOTE(review): eval_steps presumably only takes effect when a step-based
# evaluation strategy is configured - confirm for the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=1e-5,
    logging_dir='./logs',
    eval_steps=100,
)

# compute_metrics must be a callable; Trainer calls it with the evaluation
# predictions, so passing a string fails as soon as evaluation runs.
trainer = transformers.Trainer(model=model,
                args=training_args,
                train_dataset=tokenized_train,
                eval_dataset=tokenized_val,
                tokenizer=tokenizer,
                data_collator=data_collator,
                compute_metrics=compute_metrics,
                #compute_loss='sparsecategoricalcrossentropy',
                #optimizers='SGD')
                              )

In [None]:
trainer.train()

In [None]:
optimizer = tf.keras.optimizers.SGD(learning_rate = 1e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)