In [1]:
# Attach Google Drive to the Colab filesystem so files stored on Drive can be
# read through ordinary paths below the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import StackingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.pipeline import Pipeline

# ---------------------------------------------------------------------------
# Configuration for this experiment
# ---------------------------------------------------------------------------
TRAIN_CSV = "drive/MyDrive/CS506-DS/midterm/data/train.csv"
N_TRAIN_ROWS = 5000   # only the first rows are used to keep the fit tractable
SEED = 42             # fixed seed so the tree-based learners are reproducible

# Load the CSV file. `nrows` reads just the rows that are needed instead of
# parsing the whole file and slicing afterwards.
df = pd.read_csv(TRAIN_CSV, nrows=N_TRAIN_ROWS)
print(df.keys())

# A row without a Score has no label to learn from. Drop such rows instead of
# letting the imputer below invent a (mean) label for them.
df = df.dropna(subset=['Score']).reset_index(drop=True)

# Convert the Id column to int and drop the columns that are not used as features
df['Id'] = df['Id'].astype(int)
df = df.drop(columns=["ProductId", "UserId", "Time"])

# NOTE: astype(str) turns a missing Summary/Text into the literal string "nan".
df['Summary'] = df['Summary'].astype(str)
df['Text'] = df['Text'].astype(str)

# Impute missing values in numeric columns with mean.
# 'Score' stays in the list so the fitted imputer keeps its three-column layout,
# but it has no missing values left at this point, so it is passed through unchanged.
numeric_cols = ['HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score']
imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Define the TfidfVectorizer for character n-grams
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))

# Define the TfidfVectorizer for word n-grams
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))

# Combine the two vectorizers using FeatureUnion
vectorizer = FeatureUnion([
    ('char', char_vectorizer),
    ('word', word_vectorizer)
])

# Apply the vectorizer to "<Summary> <Text>". Vectorized string concatenation
# produces the same documents as a row-wise ' '.join, without a Python call per row.
X = vectorizer.fit_transform(df['Summary'] + ' ' + df['Text'])

# Define the regressors to be used in the stacking ensemble
maxabs_elasticnet = ('en', ElasticNet())
maxabs_xgb = ('xgb', XGBRegressor(random_state=SEED))
maxabs_lgb = ('lgb', lgb.LGBMRegressor(random_state=SEED))

# Define the stacking ensemble
stacking_regressor = StackingRegressor(
    estimators=[maxabs_elasticnet, maxabs_xgb, maxabs_lgb],
    final_estimator=lgb.LGBMRegressor(random_state=SEED),
    verbose=1
)

# Define the pipeline for applying MaxAbsScaler() before the stacking ensemble
maxabs_pipeline = [('maxabs', MaxAbsScaler()), ('stacking', stacking_regressor)]
maxabs_stacking_regressor = Pipeline(maxabs_pipeline)

# Fit the scaler and the stacking ensemble to the data
maxabs_stacking_regressor.fit(X, df['Score'])

# Replacement `transform` for a fitted stacking ensemble: builds the feature
# matrix made of every base estimator's output for X.
def transform(self, X):
    """Return the horizontally stacked outputs of the fitted base estimators.

    Parameters
    ----------
    self : object
        Any object exposing ``named_estimators_``, a mapping from estimator
        name to fitted estimator.
    X : array-like or sparse matrix
        Input samples, forwarded unchanged to every base estimator.

    Returns
    -------
    numpy.ndarray
        Two-dimensional array with one row per sample. Each base estimator
        contributes the columns of its ``transform(X)`` output when it has a
        ``transform`` method, otherwise a single column (or several, for 2-D
        predictions) holding ``predict(X)``.

    Raises
    ------
    ValueError
        Propagated from ``np.hstack`` when there is no estimator output to stack.
    """
    blocks = []
    # `named_estimators_` maps name -> estimator, so iterate over the values
    # directly; they are single objects, not (name, estimator) pairs.
    for est in self.named_estimators_.values():
        # Skip entries that are not estimator objects (e.g. the string 'drop').
        if isinstance(est, str):
            continue
        # Plain regressors have no `transform`; their contribution is `predict`.
        produce = est.transform if hasattr(est, "transform") else est.predict
        out = produce(X)
        if hasattr(out, "toarray"):
            # Densify sparse results so they can be stacked with dense ones.
            out = out.toarray()
        out = np.asarray(out)
        if out.ndim == 1:
            # 1-D predictions become a single column so hstack adds columns.
            out = out.reshape(-1, 1)
        blocks.append(out)

    return np.hstack(blocks)

# Test the fitted model on new data.
#
# StackingRegressor.predict() internally calls self.transform() to collect the
# base estimators' predictions, so the ensemble's built-in transform is left
# untouched here instead of being rebound on the instance.
#
# Prediction goes through the full pipeline so that the MaxAbsScaler fitted
# during training is applied to the new features before they reach the
# stacking ensemble (calling stacking_regressor.predict directly would feed it
# unscaled features).
new_data = vectorizer.transform(["This is a new review", "Another new review"])
predictions = maxabs_stacking_regressor.predict(new_data)
print(predictions)

Index(['Id', 'ProductId', 'UserId', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Time', 'Summary', 'Text', 'Score'],
      dtype='object')


KeyboardInterrupt: ignored

In [None]:
# Load the rows to score and drop the columns that are not needed; the
# placeholder Score column is removed so it can be replaced by predictions.
X_submission = pd.read_csv("drive/MyDrive/CS506-DS/midterm/data/X_test.csv")
X_submission = X_submission.drop(columns=["ProductId", "UserId", "Score"])

# Same text preparation as for the training data.
X_submission['Summary'] = X_submission['Summary'].astype(str)
X_submission['Text'] = X_submission['Text'].astype(str)

# Vectorize with the already-fitted vectorizer, then scale with the
# MaxAbsScaler that was fitted inside the training pipeline. A freshly
# constructed MaxAbsScaler() has never seen data and cannot transform.
new_X = vectorizer.transform(X_submission['Summary'] + ' ' + X_submission['Text'])
fitted_scaler = maxabs_stacking_regressor.named_steps['maxabs']
new_X_scaled = fitted_scaler.transform(new_X)

X_submission['Score'] = stacking_regressor.predict(new_X_scaled)

# Write the two columns of the submission file.
submission = X_submission[['Id', 'Score']]
submission.to_csv("submission.csv", index=False)