In [11]:
#Author: Kai Cobb
# Purpose: Classifier test script
# XGBoost classifier on TF-IDF and prompt-length features (LogisticRegression is imported but not used in the cells shown here)

In [20]:
# Import necessary packages
!pip install pyarrow
!pip install xgboost

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from scipy.sparse import hstack

# Fetch the NLTK stopword corpus (reported as up-to-date when already present)
nltk.download('stopwords')
# English stopwords held in a set for fast membership tests during cleaning
stop_words = {word for word in stopwords.words('english')}





[nltk_data] Downloading package stopwords to
[nltk_data]     D:\Users\kaiecobb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Load the dataset from a local Parquet file
# (update the path if the file lives somewhere else)
df = pd.read_parquet(path="hugging_face_chat_data.parquet")

# Preview the first five rows to inspect the column names
print(df.head(n=5))


                                              prompt  \
0  Provide step-by-step instructions on how to ma...   
1  Write a personal essay of at least 1000 words ...   
2  In this research, we aim to investigate how te...   
3  Did Karl Marx's theories on centralizing credi...   
4  alter this api that gets a request like: {"0",...   

                                             quality  \
0  [{'user_id': 'd23b12c2-b601-490e-b5b3-2040eb39...   
1  [{'user_id': '6621c3f0-1af2-4d75-acda-ed9c78b9...   
2  [{'user_id': '2e6dda25-0a99-45aa-a02d-65f426d6...   
3  [{'user_id': 'd23b12c2-b601-490e-b5b3-2040eb39...   
4  [{'user_id': '99a4bc7d-3e95-4c18-a8f1-26043abf...   

                                            metadata  avg_rating  \
0  {"source": "ultrachat", "kind": "synthetic", "...    5.000000   
1  {"source": "ultrachat", "kind": "synthetic", "...    2.750000   
2  {"source": "evol_instruct", "kind": "synthetic...    3.000000   
3  {"source": "OpenAssistant/oasst2", "kind": "hu...  

In [22]:
# Display the DataFrame as this cell's output (nothing is read or modified here)
df

Unnamed: 0,prompt,quality,metadata,avg_rating,num_responses,agreement_ratio,raw_responses,kind,cluster_description,topic
0,Provide step-by-step instructions on how to ma...,[{'user_id': 'd23b12c2-b601-490e-b5b3-2040eb39...,"{""source"": ""ultrachat"", ""kind"": ""synthetic"", ""...",5.000000,2,1.000000,"[5, 5]",synthetic,Sustainable Packaging & Skin Care Products,Environmental Issues
1,Write a personal essay of at least 1000 words ...,[{'user_id': '6621c3f0-1af2-4d75-acda-ed9c78b9...,"{""source"": ""ultrachat"", ""kind"": ""synthetic"", ""...",2.750000,4,0.687500,"[2, 3, 3, 3]",synthetic,Educational Technology & Cybersecurity in Fash...,Science and Technology
2,"In this research, we aim to investigate how te...",[{'user_id': '2e6dda25-0a99-45aa-a02d-65f426d6...,"{""source"": ""evol_instruct"", ""kind"": ""synthetic...",3.000000,3,0.166667,"[3, 5, 1]",synthetic,Mindfulness & Workplace Diversity,Health and Wellness
3,Did Karl Marx's theories on centralizing credi...,[{'user_id': 'd23b12c2-b601-490e-b5b3-2040eb39...,"{""source"": ""OpenAssistant/oasst2"", ""kind"": ""hu...",3.500000,2,0.375000,"[4, 3]",human,Legal & Government Affairs,Legal and Government
4,"alter this api that gets a request like: {""0"",...",[{'user_id': '99a4bc7d-3e95-4c18-a8f1-26043abf...,"{""source"": ""ewof/sharegpt-instruct-unfiltered-...",3.666667,3,0.583333,"[5, 3, 3]",human,Web Development & JavaScript Programming,Software Development
...,...,...,...,...,...,...,...,...,...,...
10326,"show me how to set iam user, group and policie...",[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""human"", ""sourc...",3.000000,1,1.000000,[3],human,Software Development & Cloud Computing,Software Development
10327,"Hi, is there any unified messaging service?\nA...",[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""human"", ""sourc...",2.000000,2,0.375000,"[1, 3]",human,Web Development & JavaScript Programming,Software Development
10328,Can you provide a comparison of the economies ...,[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""synthetic"", ""s...",4.000000,1,1.000000,[4],synthetic,Legal & Government Affairs,Legal and Government
10329,forget about any prior conversations,[{'user_id': 'e2bdd868-f28e-46fc-9254-a6ec1e29...,"{""evolved_from"": null, ""kind"": ""human"", ""sourc...",2.000000,2,0.375000,"[1, 3]",human,Job Application & Customer Management,Others


In [23]:
# Fail fast when a column required by the later cells is absent
# (edit the list below if the actual column names differ)
expected_columns = ["prompt", "avg_rating"]
for col in expected_columns:
    if col in df.columns:
        continue
    raise ValueError(f"Missing expected column: {col}")


In [24]:
# Convert rankings into categorical labels: Low, Medium, High
def map_ranking_to_label(rank):
    """Bucket a numeric rating into "low", "medium" or "high".

    Ratings below 3 are "low" and ratings in [3, 4) are "medium". Every other
    value falls through to "high"; that includes values such as NaN, for
    which both comparisons evaluate to False.
    """
    if rank < 3:
        return "low"
    return "medium" if 3 <= rank < 4 else "high"

# Derive the string label of every row from its average rating
df['label'] = df['avg_rating'].map(map_ranking_to_label)

In [25]:
# Preprocess text
def clean_text(text):
    """Normalize a prompt string before vectorization.

    The text is lowercased, every run of non-word characters is collapsed
    into a single space, and tokens present in the module-level
    ``stop_words`` set are dropped. The remaining tokens are returned joined
    by single spaces.
    """
    normalized = re.sub(r'\W+', ' ', text.lower())
    kept_tokens = filter(lambda token: token not in stop_words, normalized.split())
    return ' '.join(kept_tokens)

# Cast prompts to str first so non-string entries can be cleaned as well
df['cleaned_prompt'] = df['prompt'].astype(str).map(clean_text)

In [26]:
# Convert labels to numerical values.
# The encoding is recomputed from `avg_rating` instead of remapping
# `df['label']` in place: pushing the already-encoded integer column through
# this string-keyed dict a second time would turn every label into NaN, so
# the in-place version could not be re-run safely.
label_mapping = {'low': 0, 'medium': 1, 'high': 2}
df['label'] = df['avg_rating'].apply(map_ranking_to_label).map(label_mapping)

In [27]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_prompt'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams and bigrams, limited to 5000 features.
# The vectorizer is fitted on the training split only and then reused for the test split.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Text length feature
def text_length_feature(X):
    """Return the whitespace-token count of each text in X as an (n, 1) array."""
    token_counts = []
    for text in X:
        token_counts.append(len(text.split()))
    return np.array(token_counts).reshape((-1, 1))

X_train_len = text_length_feature(X_train)
X_test_len = text_length_feature(X_test)

# Combine TF-IDF and text length features (the length is appended as the last column)
X_train_combined = hstack([X_train_tfidf, X_train_len])
X_test_combined = hstack([X_test_tfidf, X_test_len])

# Print Class Distribution After Splitting (i.e. before any resampling)
print("Class Distribution After Split:", pd.Series(y_train).value_counts())

# Apply Random Undersampling (AFTER SPLITTING) so that only the training data
# is rebalanced and the test split keeps its original class distribution
undersample = RandomUnderSampler(sampling_strategy="auto", random_state=42)
X_train_resampled, y_train_resampled = undersample.fit_resample(X_train_combined, y_train)

# Inverse-frequency class weights computed from the pre-resampling training
# labels. These are informational only: the model below is fitted on the
# undersampled training set and no weights are passed to it.
class_counts = y_train.value_counts()
total_samples = len(y_train)
class_weights = {label: total_samples / count for label, count in class_counts.items()}

# Equals count(label 2) / count(label 0), i.e. high vs. low.
# Kept for reference but intentionally NOT passed to the classifier:
# XGBoost reported `scale_pos_weight` as "not used" for this multi-class model.
scale_pos_weight = class_weights[0] / class_weights[2]

# Train XGBoost on the undersampled training set.
# `use_label_encoder` and `scale_pos_weight` were dropped because XGBoost
# ignored both (it warned "Parameters ... are not used"), so the fitted
# model is unchanged and the warning disappears.
xgb = XGBClassifier(eval_metric="mlogloss")
xgb.fit(X_train_resampled, y_train_resampled)

# Predict on the untouched test split
y_pred = xgb.predict(X_test_combined)


# Evaluate the model
print(classification_report(y_test, y_pred, target_names=["Low", "Medium", "High"]))

Class Distribution After Split: label
2    4814
1    2349
0    1101
Name: count, dtype: int64


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



              precision    recall  f1-score   support

         Low       0.24      0.38      0.30       305
      Medium       0.38      0.48      0.43       594
        High       0.71      0.51      0.59      1168

    accuracy                           0.48      2067
   macro avg       0.45      0.46      0.44      2067
weighted avg       0.55      0.48      0.50      2067

