# Libraries

In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import scipy.sparse as sp
from xgboost import XGBRegressor
import pickle


In [17]:
# Feature extraction: read the raw reviews, separate the rows to be predicted
# from the labelled rows, build text and numerical features for both splits,
# and write every artefact under data/ so later cells can load them from disk.

NUMERICAL_COLUMNS = ['HelpfulnessRatio', 'ReviewYear', 'TextLength', 'SummaryLength']


def _prepare_combined_text(frame):
    """Clean the text columns of `frame` in place and return the combined text.

    Missing 'Text' / 'Summary' values become '' and both columns are cast to
    str. Returns a Series holding "<Text> <Summary>" for every row.
    """
    for column in ('Text', 'Summary'):
        frame[column] = frame[column].fillna('').astype(str)
    return frame['Text'] + ' ' + frame['Summary']


def _add_numerical_features(frame):
    """Add the NUMERICAL_COLUMNS to `frame` in place and return just those columns.

    'Text' and 'Summary' must already be strings (see _prepare_combined_text),
    because their lengths are taken with len().
    """
    # The +1 keeps the ratio finite when HelpfulnessDenominator is 0.
    frame['HelpfulnessRatio'] = frame['HelpfulnessNumerator'] / (frame['HelpfulnessDenominator'] + 1)
    # 'Time' is parsed as seconds since the Unix epoch.
    frame['ReviewYear'] = pd.to_datetime(frame['Time'], unit='s').dt.year
    frame['TextLength'] = frame['Text'].apply(len)
    frame['SummaryLength'] = frame['Summary'].apply(len)
    return frame[NUMERICAL_COLUMNS]


print("Loading training and test data...")
train_data = pd.read_csv('data/train.csv')
test_ids = pd.read_csv('data/test.csv')['Id']

print(f"Train data shape: {train_data.shape}")
print(f"Test IDs shape: {test_ids.shape}")

# Step 1: Data split
# Rows whose Id appears in test.csv are the ones to predict; all other rows are
# kept for training. The membership mask is computed once and reused.
is_test_row = train_data['Id'].isin(test_ids)
test_data = train_data[is_test_row].copy()
train_data_with_score = train_data[~is_test_row].copy()

print(f"Filtered train data shape: {train_data_with_score.shape}")
print(f"Filtered test data shape: {test_data.shape}")

# Step 2: TF-IDF and N-grams
combined_text_train = _prepare_combined_text(train_data_with_score)
combined_text_test = _prepare_combined_text(test_data)

# Both vectorizers are fitted on the training text only and then applied to the
# test text, so the vocabulary never depends on the rows being predicted.
print("Extracting TF-IDF features...")
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
train_tfidf_features = tfidf.fit_transform(combined_text_train)
test_tfidf_features = tfidf.transform(combined_text_test)

print("Extracting N-grams (2-grams) features")
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=3000, stop_words='english')
train_ngram_features = ngram_vectorizer.fit_transform(combined_text_train)
test_ngram_features = ngram_vectorizer.transform(combined_text_test)

# Step 3: numerical features
print("Extracting numerical features")
numerical_features_train = _add_numerical_features(train_data_with_score)
numerical_features_test = _add_numerical_features(test_data)

# Normalize: the scaler statistics come from the training rows only.
scaler = StandardScaler()
numerical_features_train_scaled = scaler.fit_transform(numerical_features_train)
numerical_features_test_scaled = scaler.transform(numerical_features_test)

# Step 4: save features
print("Saving all extracted features...")
sp.save_npz('data/processed_train_tfidf.npz', train_tfidf_features)
sp.save_npz('data/processed_test_tfidf.npz', test_tfidf_features)
sp.save_npz('data/processed_train_ngram.npz', train_ngram_features)
sp.save_npz('data/processed_test_ngram.npz', test_ngram_features)

np.save('data/processed_train_numerical_features.npy', numerical_features_train_scaled)
np.save('data/processed_test_numerical_features.npy', numerical_features_test_scaled)
np.save('data/processed_y_train.npy', train_data_with_score['Score'].values)

# Save the processed training and test data (including the new feature columns)
train_data_with_score.to_csv('data/processed_train_data_with_features.csv', index=False)
test_data.to_csv('data/processed_test_data_with_features.csv', index=False)

print("All features and processed data have been saved successfully.")

Loading training and test data...
Train data shape: (1697533, 9)
Test IDs shape: (212192,)
Filtered train data shape: (1485341, 9)
Filtered test data shape: (212192, 9)
Extracting TF-IDF features...
Extracting N-grams (2-grams) features
Extracting numerical features
Saving all extracted features...
All features and processed data have been saved successfully.


In [11]:
from scipy.sparse import hstack, csr_matrix

print("Loading saved features...")

# Sparse TF-IDF matrices (.npz) for the training and test rows.
train_tfidf, test_tfidf = (
    sp.load_npz(npz_path)
    for npz_path in (
        'data/processed_train_tfidf.npz',
        'data/processed_test_tfidf.npz',
    )
)

# Dense arrays (.npy): numerical features for each split, then the training targets.
train_numerical, test_numerical, y_train = (
    np.load(npy_path)
    for npy_path in (
        'data/processed_train_numerical_features.npy',
        'data/processed_test_numerical_features.npy',
        'data/processed_y_train.npy',
    )
)

def combine_features(tfidf, numerical):
    """Append dense numerical columns to the right of a sparse feature matrix.

    Parameters
    ----------
    tfidf : scipy sparse matrix
        Sparse features, one row per sample.
    numerical : array-like
        Dense 2-D features with the same number of rows as `tfidf`.

    Returns
    -------
    scipy sparse matrix in CSR format
        The columns of `tfidf` followed by the columns of `numerical`.

    Raises
    ------
    ValueError
        Propagated from scipy when the two inputs have different row counts.
    """
    # hstack builds a COO matrix by default, and COO cannot be row-indexed.
    # Requesting CSR here gives callers a matrix they can slice by row
    # (train/validation splitting, batching) without converting it themselves.
    return hstack([tfidf, csr_matrix(numerical)], format='csr')

# Build the model inputs for each split: text features first, numerical features after.
X_train, X_test = (
    combine_features(text_part, numeric_part)
    for text_part, numeric_part in (
        (train_tfidf, train_numerical),
        (test_tfidf, test_numerical),
    )
)

# Every training row must have exactly one target value.
assert not (X_train.shape[0] != y_train.shape[0]), "Mismatch in number of samples between X_train and y_train"

print("Feature split completed.")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


Loading saved features...
Feature split completed.
Training set shape: (1485341, 5004)
Test set shape: (212192, 5004)


# Model Training

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split repeatable across runs.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit the Logistic Regression classifier on the remaining rows.
# NOTE(review): the recorded output of this cell ends with
# "TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. the solver stopped at
# max_iter=1000 before converging. Confirm whether a larger iteration budget
# or a different solver is wanted before relying on this model.
print("Training Logistic Regression model")
lr_model = LogisticRegression(max_iter=1000).fit(X_train_split, y_train_split)


# XGBoost alternative (currently disabled)
# print("Training XGBoost model")
# xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')
# xgb_model.fit(X_train_split, y_train_split)


Training Logistic Regression model


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Model Evaluation

In [13]:
def evaluate_model(model, X_val, y_val, model_name):
    """Print validation metrics for a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_val : validation feature matrix passed to ``model.predict``
    y_val : true target values for ``X_val``
    model_name : str
        Label used in the printed report.

    Prints the RMSE of the raw predictions, then the accuracy and confusion
    matrix of the predictions after rounding and clamping them to 1..5.
    Returns None.
    """
    y_val_pred = model.predict(X_val)
    # Round to the nearest integer and clamp into the 1..5 range so that
    # continuous outputs (e.g. from a regressor) can be scored as class labels.
    y_val_pred_rounded = np.clip(np.round(y_val_pred), 1, 5).astype(int)

    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas sqrt(MSE) gives the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

    print(f"--- {model_name} ---")
    print(f"Validation RMSE: {rmse:.4f}")
    print(f"Accuracy: {accuracy_score(y_val, y_val_pred_rounded):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_val_pred_rounded))
    print(f"{model_name} evaluation completed.\n")

# Report validation metrics for the trained model, then persist it to disk.
print("Evaluating models...")
evaluate_model(lr_model, X_val_split, y_val_split, 'Logistic Regression')
# evaluate_model(xgb_model, X_val_split, y_val_split, 'XGBoost')

# Save the model
import os
import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; exist_ok=True makes re-running this cell harmless.
os.makedirs('models', exist_ok=True)

print("Saving models")
joblib.dump(lr_model, 'models/logistic_regression.joblib')
# joblib.dump(xgb_model, 'models/xgboost.joblib')
print("All models have been saved successfully.")

Evaluating models...
--- Logistic Regression ---
Validation RMSE: 0.9341
Accuracy: 0.6472
Confusion Matrix:
[[ 10471   2592   1403    535   3073]
 [  3461   4493   4913   1636   3101]
 [  1385   2619  12163   9131   9881]
 [   791    641   5802  22039  37854]
 [  1216    410   2177  12182 143100]]
Logistic Regression evaluation completed.

Saving models...
All models have been saved successfully.


In [16]:
# Predict a score for every test row, then round and clamp into 1..5 before
# casting to int.
test_predictions = lr_model.predict(X_test)
test_predictions_rounded = np.round(test_predictions).clip(1, 5).astype(int)

# Pair each prediction with its Id. This relies on `test_data` from the
# feature-extraction cell still being in the kernel, in the same row order
# that was used to build X_test.
submission = test_data[['Id']].assign(Score=test_predictions_rounded)

submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")

Submission file saved as 'submission.csv'.
