In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
from tqdm import tqdm

# Enable tqdm for pandas
tqdm.pandas()

# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Load the data
print("Loading data...")
train_df = pd.read_csv('/Users/akshatsharma/Downloads/CS506 Midterm Fall 2024 (1)/train.csv')
test_df = pd.read_csv('/Users/akshatsharma/Downloads/CS506 Midterm Fall 2024 (1)/test.csv')

# Reduce dataset size for faster processing
train_df = train_df.sample(frac=0.5, random_state=42)

# Preprocess text
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = re.sub('<[^<]+?>', '', str(text))
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

print("Preprocessing data...")
for df in [train_df, test_df]:
    for col in ['Summary', 'Text']:
        if col not in df.columns:
            df[col] = ''
        tqdm.pandas(desc=f"Processing {col}")
        df[f'processed_{col.lower()}'] = df[col].progress_apply(preprocess_text)

# Feature engineering
def engineer_features(df):
    print("Engineering features...")
    for col in ['HelpfulnessNumerator', 'HelpfulnessDenominator']:
        if col not in df.columns:
            df[col] = 0
    df['helpfulness_ratio'] = df['HelpfulnessNumerator'] / (df['HelpfulnessDenominator'] + 1)
    df['text_length'] = df['Text'].fillna('').str.len().astype('int32')
    df['caps_ratio'] = df['Text'].fillna('').apply(lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)
    df['exclamation_count'] = df['Text'].fillna('').str.count('!')
    return df

train_df = engineer_features(train_df)
test_df = engineer_features(test_df)

# Clean the target variable
print("Cleaning target variable...")
train_df = train_df.dropna(subset=['Score'])
train_df['Score'] = train_df['Score'].astype(int)

# Prepare features and target
features = ['helpfulness_ratio', 'text_length', 'caps_ratio', 'exclamation_count']
X = train_df[features].astype('float32')
y = train_df['Score'] - 1  # Adjust scores to start from 0
train_text = train_df['processed_text']

# Split the data
print("Splitting data...")
X_train, X_val, y_train, y_val, train_text, val_text = train_test_split(
    X, y, train_text, test_size=0.2, random_state=42
)

# TF-IDF vectorization
print("Performing TF-IDF vectorization...")
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(train_text)
X_val_tfidf = tfidf.transform(val_text)

# Combine features
X_train_combined = hstack((X_train_tfidf, X_train)).tocsr()
X_val_combined = hstack((X_val_tfidf, X_val)).tocsr()

# Train XGBoost model
print("Training XGBoost model...")
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric="mlogloss",
    n_jobs=-1  # Use all available cores
)
xgb_model.fit(X_train_combined, y_train)

# Make predictions on validation set
print("Making predictions on validation set...")
y_pred = xgb_model.predict(X_val_combined)

# Calculate accuracy and other metrics
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")
print("Classification Report:\n", classification_report(y_val, y_pred))

# Prepare test data
print("Preparing test data...")
X_test = test_df[features].astype('float32')
X_test_tfidf = tfidf.transform(test_df['processed_text'])
X_test_combined = hstack((X_test_tfidf, X_test)).tocsr()

# Make predictions on test set
print("Making predictions on test set...")
test_predictions = xgb_model.predict(X_test_combined)

# Prepare submission file
print("Creating submission file...")
submission = pd.DataFrame({'Id': test_df['Id'], 'Score': test_predictions + 1})  # Add 1 to shift back to original score range
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")

Loading data...
Preprocessing data...


Processing Summary: 100%|██████████| 848766/848766 [00:45<00:00, 18455.88it/s]
Processing Text: 100%|██████████| 848766/848766 [14:57<00:00, 945.70it/s] 
Processing Summary: 100%|██████████| 212192/212192 [00:01<00:00, 205107.41it/s]
Processing Text: 100%|██████████| 212192/212192 [00:01<00:00, 205673.97it/s]


Engineering features...
Engineering features...
Cleaning target variable...
Splitting data...
Performing TF-IDF vectorization...
Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.



Making predictions on validation set...
Validation Accuracy: 0.6006301799648553
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.42      0.49      9186
           1       0.39      0.09      0.15      8887
           2       0.47      0.14      0.22     17568
           3       0.44      0.20      0.27     33546
           4       0.63      0.95      0.76     79342

    accuracy                           0.60    148529
   macro avg       0.50      0.36      0.38    148529
weighted avg       0.55      0.60      0.53    148529

Preparing test data...
Making predictions on test set...
Creating submission file...
Submission file created successfully.
