In [None]:
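# Author: Kai Cobb
# Purpose: Classifier test script
# XGBoost with class weights
# Approach (as originally described): assign higher weights to the Low and Medium classes in XGBoost.
# NOTE(review): the cells below balance the classes with RandomUnderSampler and pass no class
# weights to XGBoost -- confirm which approach this notebook is meant to demonstrate.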


In [1]:
# Import necessary packages
!pip install pyarrow
!pip install imblearn
!pip install xgboost

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from scipy.sparse import hstack

# Download the NLTK stop-word corpus (a no-op when the package is already up to date locally)
nltk.download('stopwords')
# Keep the English stop words in a set; clean_text tests every token for membership in it
stop_words = set(stopwords.words('english'))
# Load the dataset from a parquet file resolved relative to the current working directory
df = pd.read_parquet("hugging_face_chat_data.parquet")  # Update file path






[nltk_data] Downloading package stopwords to
[nltk_data]     D:\Users\kaiecobb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Convert rankings into categorical labels
def map_ranking_to_label(rank):
    """Bucket a numeric rating into a coarse quality label.

    Args:
        rank: Numeric rating to classify.

    Returns:
        "low" when rank < 3, "medium" when 3 <= rank < 4, otherwise "high".

    Note:
        NaN compares False against both thresholds, so a missing rating
        falls through to "high".
    """
    if rank < 3:
        return "low"
    # Anything reaching this point is not below 3, so only the upper bound needs checking.
    if rank < 4:
        return "medium"
    return "high"

# Derive the categorical label column ("low" / "medium" / "high") from each row's avg_rating
df['label'] = df['avg_rating'].apply(map_ranking_to_label)

In [3]:
# Preprocess text
def clean_text(text):
    """Normalize a prompt for bag-of-words vectorization.

    The text is lower-cased, every run of non-word characters is collapsed
    into a single space, and tokens found in the module-level ``stop_words``
    collection are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    normalized = re.sub(r'\W+', ' ', text.lower())
    # split() with no argument also discards the leading/trailing blanks left by the substitution.
    kept_tokens = (token for token in normalized.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Clean every prompt; astype(str) stringifies non-string cells so clean_text can always call .lower()
df['cleaned_prompt'] = df['prompt'].astype(str).apply(clean_text)

# Convert labels to numerical values
label_mapping = {'low': 0, 'medium': 1, 'high': 2}
# The integer label is rebuilt from avg_rating instead of re-mapping df['label'] in place.
# Mapping df['label'] onto itself is not idempotent: on a second run of this cell the column
# already holds integers, none of which are keys of label_mapping, so every label would become NaN.
df['label'] = df['avg_rating'].apply(map_ranking_to_label).map(label_mapping)



In [4]:
# Model input: the cleaned prompt text
X = df["cleaned_prompt"]
# Target: the integer-encoded quality label
y = df["label"]

In [5]:
# Display the frame to inspect the derived label and cleaned_prompt columns
df

Unnamed: 0,prompt,quality,metadata,avg_rating,num_responses,agreement_ratio,raw_responses,kind,cluster_description,topic,label,cleaned_prompt
0,Provide step-by-step instructions on how to ma...,[{'user_id': 'd23b12c2-b601-490e-b5b3-2040eb39...,"{""source"": ""ultrachat"", ""kind"": ""synthetic"", ""...",5.000000,2,1.000000,"[5, 5]",synthetic,Sustainable Packaging & Skin Care Products,Environmental Issues,2,provide step step instructions make safe effec...
1,Write a personal essay of at least 1000 words ...,[{'user_id': '6621c3f0-1af2-4d75-acda-ed9c78b9...,"{""source"": ""ultrachat"", ""kind"": ""synthetic"", ""...",2.750000,4,0.687500,"[2, 3, 3, 3]",synthetic,Educational Technology & Cybersecurity in Fash...,Science and Technology,0,write personal essay least 1000 words discussi...
2,"In this research, we aim to investigate how te...",[{'user_id': '2e6dda25-0a99-45aa-a02d-65f426d6...,"{""source"": ""evol_instruct"", ""kind"": ""synthetic...",3.000000,3,0.166667,"[3, 5, 1]",synthetic,Mindfulness & Workplace Diversity,Health and Wellness,1,research aim investigate technology moderate c...
3,Did Karl Marx's theories on centralizing credi...,[{'user_id': 'd23b12c2-b601-490e-b5b3-2040eb39...,"{""source"": ""OpenAssistant/oasst2"", ""kind"": ""hu...",3.500000,2,0.375000,"[4, 3]",human,Legal & Government Affairs,Legal and Government,1,karl marx theories centralizing credit anythin...
4,"alter this api that gets a request like: {""0"",...",[{'user_id': '99a4bc7d-3e95-4c18-a8f1-26043abf...,"{""source"": ""ewof/sharegpt-instruct-unfiltered-...",3.666667,3,0.583333,"[5, 3, 3]",human,Web Development & JavaScript Programming,Software Development,1,alter api gets request like 0 5 2 3 5 5 1 4 4 ...
...,...,...,...,...,...,...,...,...,...,...,...,...
10326,"show me how to set iam user, group and policie...",[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""human"", ""sourc...",3.000000,1,1.000000,[3],human,Software Development & Cloud Computing,Software Development,1,show set iam user group policies deploying lam...
10327,"Hi, is there any unified messaging service?\nA...",[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""human"", ""sourc...",2.000000,2,0.375000,"[1, 3]",human,Web Development & JavaScript Programming,Software Development,0,hi unified messaging service answer english 지금...
10328,Can you provide a comparison of the economies ...,[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""synthetic"", ""s...",4.000000,1,1.000000,[4],synthetic,Legal & Government Affairs,Legal and Government,2,provide comparison economies united states chi...
10329,forget about any prior conversations,[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""human"", ""sourc...",2.000000,2,0.375000,"[1, 3]",human,Job Application & Customer Management,Others,0,forget prior conversations


In [6]:
# Print the per-class row counts before splitting, to show how imbalanced the labels are
print("Class Distribution Before Split:", y.value_counts())

Class Distribution Before Split: label
2    5982
1    2943
0    1406
Name: count, dtype: int64


In [7]:

# Train-test split: hold out 20% of the rows, stratified on the label so both folds keep the
# same class proportions; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [8]:
# TF-IDF Vectorization over unigrams and bigrams; max_features=5000 limits the vocabulary size.
# The vectorizer is fitted on the training fold only and the test fold is merely transformed,
# so nothing from the test fold influences the learned vocabulary or weights.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Text length feature
def text_length_feature(X):
    """Build a one-column feature holding each text's whitespace-token count.

    Args:
        X: Iterable of strings.

    Returns:
        A NumPy array with one row per input text and a single column
        containing ``len(text.split())``.
    """
    word_counts = []
    for document in X:
        word_counts.append(len(document.split()))
    # reshape(-1, 1) turns the flat vector into a column so it can be stacked next to other features.
    return np.array(word_counts).reshape(-1, 1)

# Token-count column for each fold, computed on the cleaned prompt text
X_train_len = text_length_feature(X_train)
X_test_len = text_length_feature(X_test)


In [9]:
# Combine TF-IDF and text length features into one sparse matrix per fold.
# format="csr" is requested explicitly: hstack otherwise chooses the output format itself and
# may return COO, which does not support row indexing, while the resampler further down
# selects rows from the training matrix.
X_train_combined = hstack([X_train_tfidf, X_train_len], format="csr")
X_test_combined = hstack([X_test_tfidf, X_test_len], format="csr")

# Print Class Distribution After Splitting (training fold only).
# y_train is already a pandas Series, so it is counted directly.
print("Class Distribution After Split:", y_train.value_counts())


Class Distribution After Split: label
2    4785
1    2354
0    1125
Name: count, dtype: int64


In [10]:
# Apply Random Undersampling (AFTER SPLITTING): only the training fold is resampled, so the
# test fold keeps its original class mix. The fixed seed makes the selection repeatable.
undersample = RandomUnderSampler(
    sampling_strategy="auto",
    random_state=42,
)
X_train_resampled, y_train_resampled = undersample.fit_resample(
    X_train_combined,
    y_train,
)

# Train XGBoost on the resampled training fold.
# use_label_encoder is no longer passed: the installed XGBoost ignores it and only emits a
# 'Parameters: { "use_label_encoder" } are not used.' warning.
# random_state pins the booster's seed, matching the seed used for the split and the resampler.
xgb = XGBClassifier(eval_metric="mlogloss", random_state=42)
xgb.fit(X_train_resampled, y_train_resampled)

# Predict on the test fold, which was never resampled
y_pred = xgb.predict(X_test_combined)

# Evaluate Model.
# labels pins the integer codes that the display names stand for (0=Low, 1=Medium, 2=High),
# so the names stay aligned even if some class is absent from y_test or y_pred.
print(classification_report(y_test, y_pred, labels=[0, 1, 2], target_names=["Low", "Medium", "High"]))

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

         Low       0.21      0.38      0.27       281
      Medium       0.38      0.47      0.42       589
        High       0.74      0.52      0.61      1197

    accuracy                           0.49      2067
   macro avg       0.45      0.46      0.43      2067
weighted avg       0.57      0.49      0.51      2067

