In [11]:
# Cell 1: Data Loading and Feature Extraction Based on Id

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import scipy.sparse as sp
import numpy as np

print("Loading training and test data...")
train_data = pd.read_csv('data/train.csv')
test_ids = pd.read_csv('data/test.csv')['Id']

print(f"Train data shape: {train_data.shape}")
print(f"Test IDs shape: {test_ids.shape}")

test_data = train_data[train_data['Id'].isin(test_ids)].copy()
train_data_with_score = train_data[~train_data['Id'].isin(test_ids)].copy()

print(f"Filtered train data shape: {train_data_with_score.shape}")
print(f"Filtered test data shape: {test_data.shape}")

train_data_with_score['Text'] = train_data_with_score['Text'].fillna('').astype(str)
train_data_with_score['Summary'] = train_data_with_score['Summary'].fillna('').astype(str)
combined_text_train = train_data_with_score['Text'] + ' ' + train_data_with_score['Summary']

test_data['Text'] = test_data['Text'].fillna('').astype(str)
test_data['Summary'] = test_data['Summary'].fillna('').astype(str)
combined_text_test = test_data['Text'] + ' ' + test_data['Summary']

print("Extracting TF-IDF features...")
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
train_tfidf_features = tfidf.fit_transform(combined_text_train)
test_tfidf_features = tfidf.transform(combined_text_test)

print("Extracting N-grams (2-grams) features...")
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=3000, stop_words='english')
train_ngram_features = ngram_vectorizer.fit_transform(combined_text_train)
test_ngram_features = ngram_vectorizer.transform(combined_text_test)

print("Extracting numerical features...")
train_data_with_score['HelpfulnessRatio'] = train_data_with_score['HelpfulnessNumerator'] / (train_data_with_score['HelpfulnessDenominator'] + 1)
train_data_with_score['ReviewYear'] = pd.to_datetime(train_data_with_score['Time'], unit='s').dt.year
train_data_with_score['ProductReviewCount'] = train_data_with_score.groupby('ProductId')['ProductId'].transform('count')
train_data_with_score['UserReviewCount'] = train_data_with_score.groupby('UserId')['UserId'].transform('count')

test_data['HelpfulnessRatio'] = test_data['HelpfulnessNumerator'] / (test_data['HelpfulnessDenominator'] + 1)
test_data['ReviewYear'] = pd.to_datetime(test_data['Time'], unit='s').dt.year
test_data['ProductReviewCount'] = test_data.groupby('ProductId')['ProductId'].transform('count')
test_data['UserReviewCount'] = test_data.groupby('UserId')['UserId'].transform('count')

numerical_features_train = train_data_with_score[['HelpfulnessRatio', 'ReviewYear', 'ProductReviewCount', 'UserReviewCount']]
numerical_features_test = test_data[['HelpfulnessRatio', 'ReviewYear', 'ProductReviewCount', 'UserReviewCount']]

scaler = StandardScaler()
numerical_features_train_scaled = scaler.fit_transform(numerical_features_train)
numerical_features_test_scaled = scaler.transform(numerical_features_test)

print("Saving all extracted features...")
sp.save_npz('data/train_tfidf.npz', train_tfidf_features)
sp.save_npz('data/test_tfidf.npz', test_tfidf_features)
sp.save_npz('data/train_ngram.npz', train_ngram_features)
sp.save_npz('data/test_ngram.npz', test_ngram_features)

np.save('data/train_numerical_features.npy', numerical_features_train_scaled)
np.save('data/test_numerical_features.npy', numerical_features_test_scaled)

train_data_with_score.to_csv('data/processed_train_with_features.csv', index=False)
test_data.to_csv('data/processed_test_with_features.csv', index=False)

print("All features and processed data have been saved successfully.")

Loading training and test data...
Train data shape: (1697533, 9)
Test IDs shape: (212192,)
Filtered train data shape: (1485341, 9)
Filtered test data shape: (212192, 9)
Extracting TF-IDF features...
Extracting N-grams (2-grams) features...
Extracting numerical features...
Saving all extracted features...
All features and processed data have been saved successfully.


In [12]:
# Cell 2: Combine Features and Prepare for Training

from scipy.sparse import hstack, csr_matrix

print("Loading saved features...")
train_tfidf = sp.load_npz('data/train_tfidf.npz')
test_tfidf = sp.load_npz('data/test_tfidf.npz')
train_ngram = sp.load_npz('data/train_ngram.npz')
test_ngram = sp.load_npz('data/test_ngram.npz')

train_numerical = np.load('data/train_numerical_features.npy')
test_numerical = np.load('data/test_numerical_features.npy')

def combine_features(tfidf, ngram, numerical):
    return hstack([tfidf, ngram, csr_matrix(numerical)])

X_train = combine_features(train_tfidf, train_ngram, train_numerical)
X_test = combine_features(test_tfidf, test_ngram, test_numerical)

train_data_with_score = pd.read_csv('data/processed_train_with_features.csv')
y_train = train_data_with_score['Score'].values

print("Feature split completed.")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Loading saved features...
Feature split completed.
Training set shape: (1485341, 8004)
Test set shape: (212192, 8004)


In [16]:
# Cell 3: Evaluate Trained Models on Validation Set

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib

def evaluate_model(model, X_val, y_val, model_name):
    # 预测验证集
    y_val_pred = model.predict(X_val)
    
    print(f"--- {model_name} ---")
    print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred):.4f}")
    print(f"Precision: {precision_score(y_val, y_val_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_val, y_val_pred, average='weighted'):.4f}")
    print(f"F1 Score: {f1_score(y_val, y_val_pred, average='weighted'):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_val_pred))
    print(f"{model_name} evaluation completed.\n")

train_data_with_score = pd.read_csv('data/processed_train_with_features.csv')
y_train = train_data_with_score['Score'].values

from sklearn.model_selection import train_test_split
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


# Naive Bayes
# nb_model = joblib.load('models/naive_bayes.joblib')
# evaluate_model(nb_model, X_val_split, y_val_split, 'naive_bayes')

# Linear Regression
lr_model = joblib.load('models/logistic_regression.joblib')
evaluate_model(lr_model, X_val_split, y_val_split, 'logistic_regression')

# Random Forest
# rf_model = joblib.load('models/random_forest.joblib')
# evaluate_model(rf_model, X_val_split, y_val_split, 'random_forest')

--- logistic_regression ---
Validation Accuracy: 0.6646
Precision: 0.6367
Recall: 0.6646
F1 Score: 0.6413
Confusion Matrix:
[[ 11097   2411   1282    511   2773]
 [  3366   5370   4646   1489   2733]
 [  1330   2662  13526   8688   8973]
 [   702    677   5723  23949  36076]
 [  1119    419   1998  12064 143485]]
logistic_regression evaluation completed.



In [17]:
# Cell 4: Predict on Test Data and Generate Submission File

import pandas as pd
import joblib

X_test = combine_features(test_tfidf, test_ngram, test_numerical)
best_model = joblib.load('models/logistic_regression.joblib')
print("Making predictions on test data...")
test_predictions = best_model.predict(X_test)
test_data = pd.read_csv('data/test.csv')
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'Score': test_predictions
})

submission = submission.sort_values('Id')
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")

Making predictions on test data...
Submission file saved as 'submission.csv'.
