In [1]:
# Attach Google Drive to the Colab filesystem; the CSV paths used further down
# point into this mount.
from google.colab import drive

drive.mount(
    '/content/drive',
)

Mounted at /content/drive


In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
import lightgbm as lgb
from scipy.sparse import csr_matrix
import joblib

# Load the training data and continue with a reproducible 20,000-row sample.
df = pd.read_csv("drive/MyDrive/CS506-DS/midterm/data/train.csv")
df = df.sample(n=20000, random_state=42)
print(df.keys())

# Keep Id as an integer and discard the columns this pipeline never reads.
df['Id'] = df['Id'].astype(int)
df = df.drop(columns=["ProductId", "UserId", "Time"])

# Cast both text columns to str so they can be concatenated below.
# NOTE(review): astype(str) turns a missing value into the literal text 'nan';
# the inference code applies the same cast, so change both together if revisited.
df['Summary'] = df['Summary'].astype(str)
df['Text'] = df['Text'].astype(str)

# Fill gaps in the numeric columns with each column's most frequent value.
# The fitted `imputer` is referenced again when the submission data is prepared.
# These columns are not part of the text feature matrix built below.
numeric_cols = ['Id', 'HelpfulnessNumerator', 'HelpfulnessDenominator']
imputer = SimpleImputer(strategy='most_frequent')
XI = df.drop(columns=["Text", "Summary"])  # non-text view; avoids copying the whole frame first
X_imputed = imputer.fit_transform(XI[numeric_cols])
df[numeric_cols] = X_imputed

# TF-IDF over character 2- to 4-grams and over word 1- to 3-grams,
# concatenated side by side by FeatureUnion.
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))
vectorizer = FeatureUnion([
    ('char', char_vectorizer),
    ('word', word_vectorizer)
])

# Each document is "<Summary> <Text>". Vectorised concatenation produces the
# same strings as a row-wise ' '.join without a Python call per row, and it is
# the same expression the inference code uses.
X = vectorizer.fit_transform(df['Summary'] + ' ' + df['Text'])

# Rescale every feature by its maximum absolute value.
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X)


Index(['Id', 'ProductId', 'UserId', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Time', 'Summary', 'Text', 'Score'],
      dtype='object')


In [13]:
# LightGBM settings: squared-error regression on Score, evaluated with RMSE.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,  # tried 41; did not work
    'learning_rate': 0.05,  # tried 0.03; did not work
    'feature_fraction': 0.9
}

# Train only on rows that actually carry a label.
# NOTE(review): the test file ships a Score column that is discarded before
# prediction, which suggests unlabeled rows may also be present in train.csv —
# confirm. A row without Score cannot supervise the regression, so it is
# excluded here; when no Score is missing this mask keeps every row.
has_label = df['Score'].notna().to_numpy()
dtrain = lgb.Dataset(X_scaled[has_label], label=df['Score'][has_label])
model = lgb.train(params, dtrain, num_boost_round=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4488081
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 77000
[LightGBM] [Info] Start training from score 3.713600


In [14]:
# Persist the trained model to the current working directory.
model_path = 'lgb_model.pkl'
joblib.dump(model, model_path)

['lgb_model.pkl']

In [15]:
# Load the rows to score and drop the columns the pipeline does not use
# (including the Score column, which is re-created from predictions below).
X_submission = pd.read_csv("drive/MyDrive/CS506-DS/midterm/data/X_test.csv")
X_submission = X_submission.drop(columns=["ProductId", "UserId", "Time", "Score"])

# Same text preparation as for training: cast to str before concatenating.
X_submission['Summary'] = X_submission['Summary'].astype(str)
X_submission['Text'] = X_submission['Text'].astype(str)

# Reuse the imputer fitted on the training sample. Calling fit_transform here
# would refit it on the submission data and discard the training statistics.
numeric_cols = ['Id', 'HelpfulnessNumerator', 'HelpfulnessDenominator']
X_imputed = imputer.transform(X_submission[numeric_cols])
X_submission[numeric_cols] = X_imputed

# The imputer returns a single array for all three columns, so Id can come back
# as float; restore the integer dtype so ids are written as whole numbers.
X_submission['Id'] = X_submission['Id'].astype(int)

# Apply the already-fitted vectoriser and scaler (transform only, no refit).
new_X = vectorizer.transform(X_submission['Summary'] + ' ' + X_submission['Text'])
new_X_scaled = scaler.transform(new_X)

X_submission['Score'] = model.predict(new_X_scaled)

# Write the two-column submission file.
submission = X_submission[['Id', 'Score']]
submission.to_csv("submission.csv", index=False)
