In [1]:
# Cell 1: Data Loading and Feature Extraction Based on Id

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import scipy.sparse as sp
import numpy as np

print("Loading training and test data...")
train_data = pd.read_csv('data/train.csv')
test_ids = pd.read_csv('data/test.csv')['Id']

print(f"Train data shape: {train_data.shape}")
print(f"Test IDs shape: {test_ids.shape}")

test_data = train_data[train_data['Id'].isin(test_ids)].copy()
train_data_with_score = train_data[~train_data['Id'].isin(test_ids)].copy()

print(f"Filtered train data shape: {train_data_with_score.shape}")
print(f"Filtered test data shape: {test_data.shape}")

train_data_with_score['Text'] = train_data_with_score['Text'].fillna('').astype(str)
train_data_with_score['Summary'] = train_data_with_score['Summary'].fillna('').astype(str)
combined_text_train = train_data_with_score['Text'] + ' ' + train_data_with_score['Summary']

test_data['Text'] = test_data['Text'].fillna('').astype(str)
test_data['Summary'] = test_data['Summary'].fillna('').astype(str)
combined_text_test = test_data['Text'] + ' ' + test_data['Summary']

# extract TF-IDF and N-grams
print("Extracting TF-IDF features...")
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
train_tfidf_features = tfidf.fit_transform(combined_text_train)
test_tfidf_features = tfidf.transform(combined_text_test)

print("Extracting N-grams (2-grams) features...")
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=3000, stop_words='english')
train_ngram_features = ngram_vectorizer.fit_transform(combined_text_train)
test_ngram_features = ngram_vectorizer.transform(combined_text_test)

# extract numerical features
print("Extracting numerical features...")
train_data_with_score['HelpfulnessRatio'] = train_data_with_score['HelpfulnessNumerator'] / (train_data_with_score['HelpfulnessDenominator'] + 1)
train_data_with_score['ReviewYear'] = pd.to_datetime(train_data_with_score['Time'], unit='s').dt.year
train_data_with_score['ProductReviewCount'] = train_data_with_score.groupby('ProductId')['ProductId'].transform('count')
train_data_with_score['UserReviewCount'] = train_data_with_score.groupby('UserId')['UserId'].transform('count')

test_data['HelpfulnessRatio'] = test_data['HelpfulnessNumerator'] / (test_data['HelpfulnessDenominator'] + 1)
test_data['ReviewYear'] = pd.to_datetime(test_data['Time'], unit='s').dt.year
test_data['ProductReviewCount'] = test_data.groupby('ProductId')['ProductId'].transform('count')
test_data['UserReviewCount'] = test_data.groupby('UserId')['UserId'].transform('count')

numerical_features_train = train_data_with_score[['HelpfulnessRatio', 'ReviewYear', 'ProductReviewCount', 'UserReviewCount']]
numerical_features_test = test_data[['HelpfulnessRatio', 'ReviewYear', 'ProductReviewCount', 'UserReviewCount']]

scaler = StandardScaler()
numerical_features_train_scaled = scaler.fit_transform(numerical_features_train)
numerical_features_test_scaled = scaler.transform(numerical_features_test)

print("Saving all extracted features...")
sp.save_npz('data/train_tfidf.npz', train_tfidf_features)
sp.save_npz('data/test_tfidf.npz', test_tfidf_features)
sp.save_npz('data/train_ngram.npz', train_ngram_features)
sp.save_npz('data/test_ngram.npz', test_ngram_features)

np.save('data/train_numerical_features.npy', numerical_features_train_scaled)
np.save('data/test_numerical_features.npy', numerical_features_test_scaled)

train_data_with_score.to_csv('data/processed_train_with_features.csv', index=False)
test_data.to_csv('data/processed_test_with_features.csv', index=False)

print("All features and processed data have been saved successfully.")

Loading training data...
Train data shape: (1697533, 9)
Dropping rows with missing scores in training data...
Extracting TF-IDF features from training data...
Extracting N-grams (2-grams) features from training data...
Saving all extracted features...
All features and processed data have been saved successfully.


In [1]:
# Cell 2: Combine Features and Prepare for Training
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import scipy.sparse as sp
import numpy as np
from scipy.sparse import hstack, csr_matrix

print("Loading saved features...")
train_tfidf = sp.load_npz('data/train_tfidf.npz')
test_tfidf = sp.load_npz('data/test_tfidf.npz')
train_ngram = sp.load_npz('data/train_ngram.npz')
test_ngram = sp.load_npz('data/test_ngram.npz')

train_numerical = np.load('data/train_numerical_features.npy')
test_numerical = np.load('data/test_numerical_features.npy')

def combine_features(tfidf, ngram, numerical):
    return hstack([tfidf, ngram, csr_matrix(numerical)])

X_train = combine_features(train_tfidf, train_ngram, train_numerical)
X_test = combine_features(test_tfidf, test_ngram, test_numerical)

train_data_with_score = pd.read_csv('data/processed_train_with_features.csv')
y_train = train_data_with_score['Score'].values

print("Feature split completed.")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Loading saved features...
Feature split completed.
Training set shape: (1485341, 8004)
Test set shape: (212192, 8004)


In [None]:
# Cell 3: Train and Evaluate Models with TruncatedSVD and Feature Saving

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib
import time
import logging
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.decomposition import TruncatedSVD
import numpy as np

logging.basicConfig(filename='training_log.log', level=logging.INFO, format='%(asctime)s - %(message)s')

def evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    start_time = time.time()
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    end_time = time.time()
    elapsed_time = end_time - start_time
    train_accuracy = accuracy_score(y_train, y_train_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred, average='weighted')
    conf_matrix = confusion_matrix(y_val, y_val_pred)
    
    print(f"--- {model_name} ---")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    
    logging.info(f"Model: {model_name}")
    logging.info(f"Training Accuracy: {train_accuracy:.4f}")
    logging.info(f"Validation Accuracy: {val_accuracy:.4f}")
    logging.info(f"F1 Score: {f1:.4f}")
    logging.info(f"Training Time: {elapsed_time:.2f} seconds")
    logging.info(f"Confusion Matrix:\n{conf_matrix}\n")
    
    joblib.dump(model, f'models/{model_name}.joblib')
    print(f"{model_name} saved successfully.\n")

# Apply TruncatedSVD to reduce dimensionality
def apply_truncated_svd(X_train, X_val, n_components, save=True):
    svd = TruncatedSVD(n_components=n_components)
    X_train_svd = svd.fit_transform(X_train)
    X_val_svd = svd.transform(X_val)
    
    print(f"Applied TruncatedSVD with {n_components} components.")
    if save:
        np.save('data/X_train_svd.npy', X_train_svd)
        np.save('data/X_val_svd.npy', X_val_svd)
        print("SVD features saved successfully.")
    
    return X_train_svd, X_val_svd

def load_svd_features():
    X_train_svd = np.load('data/X_train_svd.npy')
    X_val_svd = np.load('data/X_val_svd.npy')
    return X_train_svd, X_val_svd

from sklearn.model_selection import train_test_split
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

try:
    X_train_split_svd, X_val_split_svd = load_svd_features()
    print("Loaded pre-saved SVD features.")
except FileNotFoundError:
    svd_n = 10
    X_train_split_svd, X_val_split_svd = apply_truncated_svd(X_train_split, X_val_split, svd_n)

# Train and evaluate models

# # Naive Bayes
# nb_model = MultinomialNB()
# evaluate_model(nb_model, X_train_split_svd, y_train_split, X_val_split_svd, y_val_split, f'naive_bayes_svd_{svd_n}')

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
evaluate_model(lr_model, X_train_split_svd, y_train_split, X_val_split_svd, y_val_split, f'logistic_regression_svd_{svd_n}')

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100)
evaluate_model(rf_model, X_train_split_svd, y_train_split, X_val_split_svd, y_val_split, f'random_forest_svd_{svd_n}')

Applied TruncatedSVD with 10 components.
SVD features saved successfully.
--- logistic_regression_svd_10 ---
Training Accuracy: 0.5342
Validation Accuracy: 0.5363
F1 Score: 0.4031
Confusion Matrix:
[[     1      1     13    688  17371]
 [     0      1     21   1278  16304]
 [     0      0     46   2816  32317]
 [     0      1     37   5212  61877]
 [     2      5     37   4992 154049]]
logistic_regression_svd_10 saved successfully.

--- random_forest_svd_10 ---
Training Accuracy: 1.0000
Validation Accuracy: 0.5504
F1 Score: 0.4812
Confusion Matrix:
[[  2834    614   1219   2323  11084]
 [  1412    571   1725   3616  10280]
 [  1069    620   3004   8459  22027]
 [   711    431   2984  14719  48282]
 [  1078    415   2255  12959 142378]]
random_forest_svd_10 saved successfully.



Parameters: { "use_label_encoder" } are not used.



--- xgboost_svd_10 ---
Training Accuracy: 0.5521
Validation Accuracy: 0.5516
F1 Score: 0.4487
Confusion Matrix:
[[  1195      4    382   1696  14797]
 [   579      0    512   3075  13438]
 [   417      2    811   7174  26775]
 [   264      2    542  11445  54874]
 [   386      1    355   7944 150399]]
xgboost_svd_10 saved successfully.



In [5]:
# Cell 4: Predict on Test Data and Generate Submission File

import pandas as pd
import joblib

X_test = combine_features(test_tfidf, test_ngram, test_numerical)

best_model = joblib.load('models/linear_regression.joblib')
print("Making predictions on test data...")
test_predictions = best_model.predict(X_test)

test_data = pd.read_csv('data/test.csv')
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'Score': test_predictions
})

submission = submission.sort_values('Id')
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")

Found 0 matching Ids in train_data
