In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored in Drive can be read through ordinary filesystem paths.
# This only works inside Google Colab (the `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# --- Configuration -----------------------------------------------------------
# Absolute path under the Drive mount point ('/content/drive') so that loading
# does not depend on the notebook's current working directory.
DATA_PATH = "/content/drive/MyDrive/CS506-DS/midterm/data/train.csv"
N_ROWS = 5000  # number of CSV rows to read; raise for a fuller run
SEED = 42      # shared seed for the train/validation split and the model
FEATURE_COLUMNS = ["Id", "Summary", "Text", "HelpfulnessNumerator", "HelpfulnessDenominator", "Time"]
TARGET_COLUMN = "Score"


def combine_text(frame):
    """Return one string per row: ``Summary`` and ``Text`` joined by a space.

    Missing values are replaced with an empty string first; joining raw
    values would raise ``TypeError`` because pandas represents empty CSV
    fields as float NaN.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``Summary`` and ``Text``.

    Returns
    -------
    pandas.Series
        String series aligned with ``frame.index``.
    """
    summary = frame["Summary"].fillna("").astype(str)
    text = frame["Text"].fillna("").astype(str)
    return summary + " " + text


# --- Load data ---------------------------------------------------------------
# Rows without a Score cannot serve as regression targets (NaN labels break
# both model fitting and the RMSE computation), so they are dropped up front.
df = (
    pd.read_csv(DATA_PATH, nrows=N_ROWS, usecols=FEATURE_COLUMNS + [TARGET_COLUMN])
    .dropna(subset=[TARGET_COLUMN])
    .astype({"Id": int})
)

# --- Train / validation split --------------------------------------------------
train_frame, val_frame, y_train, y_val = train_test_split(
    df[FEATURE_COLUMNS], df[TARGET_COLUMN], test_size=0.2, random_state=SEED
)

# --- Text features -------------------------------------------------------------
# Character n-grams (2-5 chars) and word n-grams (uni- and bigrams), each
# restricted to terms seen in at least 5 documents and at most 80% of them.
char_vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2, 5), min_df=5, max_df=0.8)
word_vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=5, max_df=0.8)

# Concatenate both TF-IDF feature spaces side by side.
vectorizer = FeatureUnion([("char", char_vectorizer), ("word", word_vectorizer)])

# Fit the vocabulary on the training split only, then reuse it for validation
# so no information from the validation texts leaks into the features.
X_train = vectorizer.fit_transform(combine_text(train_frame))
X_val = vectorizer.transform(combine_text(val_frame))

# Scale every feature by its maximum absolute value (fitted on training data).
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# --- Model and hyperparameter search -------------------------------------------
# Base LightGBM settings; the grid below overrides num_leaves, learning_rate
# and feature_fraction for each candidate.
params = {
    "objective": "regression",
    "metric": "rmse",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    # feature_fraction < 1 subsamples columns, so fix the seed for
    # reproducible results.
    "random_state": SEED,
}

grid_params = {
    "num_leaves": [15, 31, 63],
    "learning_rate": [0.01, 0.05, 0.1],
    "feature_fraction": [0.5, 0.7, 0.9],
}

# Select candidates by (negated) RMSE so model selection uses the same metric
# that is reported below; without `scoring`, GridSearchCV would fall back to
# the estimator's default score (R^2 for regressors).
grid_search = GridSearchCV(
    lgb.LGBMRegressor(**params),
    grid_params,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# --- Evaluation ------------------------------------------------------------------
# best_estimator_ is refit on the whole training split with the best parameters.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val_scaled)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Validation RMSE:", rmse)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
