In [None]:
# Author: Kai Cobb
# Purpose: Classifier test script
# Technique: SMOTE (Synthetic Minority Over-sampling Technique)
# Approach: Apply SMOTE to the training data, then train a random forest classifier

In [1]:
# Import necessary packages
!pip install pyarrow
!pip install imblearn

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import numpy as np

# Fetch the NLTK stopword corpus, then collect the English stopwords into a
# set for fast membership tests while cleaning text
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}



[nltk_data] Downloading package stopwords to
[nltk_data]     D:\Users\kaiecobb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load dataset
DATA_PATH = "hugging_face_chat_data.parquet"  # Update file path
df = pd.read_parquet(DATA_PATH)

In [3]:

# Convert rankings into categorical labels
def map_ranking_to_label(rank):
    """Bucket a numeric rating into "low", "medium" or "high".

    Ratings below 3 are "low", ratings in [3, 4) are "medium", and anything
    else is "high". A NaN rating fails both comparisons and therefore also
    ends up as "high".
    """
    if rank < 3:
        return "low"
    # Reaching this point means the "low" test failed, so only the upper
    # bound of the medium band still needs checking.
    if rank < 4:
        return "medium"
    return "high"

# Derive the categorical label column from each row's average rating
df['label'] = df['avg_rating'].map(map_ranking_to_label)

In [4]:

# Preprocess text
def clean_text(text):
    """Normalise a string for vectorisation.

    Lower-cases the input, replaces every run of non-word characters with a
    single space, and drops tokens found in the module-level `stop_words`
    set. Returns the surviving tokens joined by single spaces.
    """
    tokens = re.sub(r'\W+', ' ', text.lower()).split()
    return ' '.join(token for token in tokens if token not in stop_words)

# Clean every prompt; astype(str) lets clean_text run on non-string values too
df['cleaned_prompt'] = df['prompt'].astype(str).apply(clean_text)

# Convert labels to numerical values.
# The codes are rebuilt from avg_rating instead of remapping df['label'] in
# place: Series.map turns every value missing from the dict into NaN, so
# mapping the already-numeric column on a second run of this cell would wipe
# out all labels.
label_mapping = {'low': 0, 'medium': 1, 'high': 2}
df['label'] = df['avg_rating'].apply(map_ranking_to_label).map(label_mapping)

In [5]:


# Train-test split (80% train / 20% test, fixed seed)
# NOTE(review): the split is not stratified; given the class imbalance,
# consider passing stratify=df['label'] -- doing so will change the scores.
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_prompt'], df['label'], test_size=0.2, random_state=42)

# Convert text to numerical features using TF-IDF.
# The vocabulary is fitted on the training split only and reused for the test split.
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Apply SMOTE to balance classes (training data only; the test split is left untouched)
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)

# **Check class distribution after SMOTE**
unique, counts = np.unique(y_train_balanced, return_counts=True)
print("Class distribution after SMOTE:", dict(zip(unique, counts)))

# Train Random Forest classifier on the resampled training data
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train_balanced, y_train_balanced)

# Evaluate the model on the original (unbalanced) test split.
# labels is passed explicitly so the three target_names stay aligned with the
# 0/1/2 encoding even when a class is absent from both y_test and y_pred;
# without it classification_report rejects the class-count mismatch.
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, labels=[0, 1, 2], target_names=["Low", "Medium", "High"]))


Class distribution after SMOTE: {0: 4814, 1: 4814, 2: 4814}
              precision    recall  f1-score   support

         Low       0.25      0.24      0.24       305
      Medium       0.39      0.37      0.38       594
        High       0.68      0.71      0.69      1168

    accuracy                           0.54      2067
   macro avg       0.44      0.44      0.44      2067
weighted avg       0.53      0.54      0.54      2067

