In [2]:
# Colab only: attach Google Drive so the CSV files under MyDrive can be read.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import FeatureUnion
from imblearn.over_sampling import RandomOverSampler
import lightgbm as lgb

# Load the training data and continue with a reproducible 10,000-row sample
df = pd.read_csv("drive/MyDrive/CS506-DS/midterm/data/train.csv")
df = df.sample(n=10000, random_state=42)
print(df.keys())

# Prepare the sampled frame in one pass:
#   * drop the metadata columns that are not used as features,
#   * store Id as int and force both text columns to str so they can be
#     concatenated (NOTE: astype(str) turns a missing value into the literal
#     string 'nan'; the submission cell applies the same conversion, so the
#     two stay consistent),
#   * keep only the rows that actually carry a Score label.
df = (
    df.drop(columns=["ProductId", "UserId", "Time"])
      .astype({'Id': int, 'Summary': str, 'Text': str})
      .dropna(subset=['Score'])
)

# TF-IDF over character n-grams (2 to 4 characters), capped at 5000 features
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4), max_features=5000)

# TF-IDF over word n-grams (1 to 3 words), capped at 5000 features
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), max_features=5000)

# Concatenate both feature spaces side by side
vectorizer = FeatureUnion([
    ('char', char_vectorizer),
    ('word', word_vectorizer)
])

# Fit on "<Summary> <Text>". The vectorized string concatenation yields the
# same strings as a row-wise ' '.join, without a Python-level call per row,
# and is the exact expression used when scoring the submission rows.
X = vectorizer.fit_transform(df['Summary'] + ' ' + df['Text'])

# Scale every feature by its maximum absolute value (keeps the matrix sparse)
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X)

# Balance the Score classes by randomly duplicating minority-class rows
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_scaled, df['Score'])

# Train a LightGBM regressor on the balanced data, evaluated with RMSE
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}
dtrain = lgb.Dataset(X_resampled, label=y_resampled)
model = lgb.train(params, dtrain, num_boost_round=100)

Index(['Id', 'ProductId', 'UserId', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Time', 'Summary', 'Text', 'Score'],
      dtype='object')
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1684597
[LightGBM] [Info] Number of data points in the train set: 24455, number of used features: 9999
[LightGBM] [Info] Start training from score 3.000000


In [4]:
# Load the rows to score and discard the columns that are not model inputs
# (Score is dropped here and re-created below from the model's predictions).
X_submission = pd.read_csv("drive/MyDrive/CS506-DS/midterm/data/X_test.csv")
X_submission = X_submission.drop(columns=["ProductId", "UserId", "Time", "Score"])

# Same text preprocessing as the training cell: force both columns to str
# (astype(str) maps a missing value to the literal string 'nan').
X_submission = X_submission.astype({'Summary': str, 'Text': str})

# Reuse the vectorizer and scaler fitted during training; transform only, never refit
new_X = vectorizer.transform(X_submission['Summary'] + ' ' + X_submission['Text'])
new_X_scaled = scaler.transform(new_X)

# The regressor's raw output is unbounded, while the target is a bounded rating.
# Clip each prediction to the label range observed in the training data so the
# submission never contains a score outside what was seen during training.
X_submission['Score'] = model.predict(new_X_scaled).clip(
    df['Score'].min(), df['Score'].max()
)

# Write the two-column submission file (Id, Score) without the index column
submission = X_submission[['Id', 'Score']]
submission.to_csv("submission.csv", index=False)