In [2]:
# Colab-only setup: mount Google Drive at /content/drive.
# The data-loading cells below read their CSV files from paths under "drive/MyDrive/...".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import FeatureUnion
import lightgbm as lgb
from sklearn.linear_model import ElasticNet

# Load the CSV file and keep the first 30,000 rows.
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the frame returned by read_csv.
df = pd.read_csv("drive/MyDrive/CS506-DS/midterm/data/train.csv")
df = df.iloc[:30000].copy()
print(df.keys())

# Convert the Id column to int and the two text columns to str
df['Id'] = df['Id'].astype(int)
df['Summary'] = df['Summary'].astype(str)
df['Text'] = df['Text'].astype(str)

# Define the TfidfVectorizer for character n-grams
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4), max_features=5000)

# Define the TfidfVectorizer for word n-grams
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), max_features=5000)

# Combine the two vectorizers using FeatureUnion
vectorizer = FeatureUnion([
    ('char', char_vectorizer),
    ('word', word_vectorizer)
])

# Build one document per row ("<Summary> <Text>") and fit the vectorizer on it.
# Vectorised string concatenation yields the same strings as a row-wise
# ' '.join, without a Python-level call per row.
train_text = df['Summary'] + ' ' + df['Text']
X_text = vectorizer.fit_transform(train_text).toarray()
print(len(X_text))

# Scale the text features with a scaler dedicated to them
text_scaler = MaxAbsScaler()
X_text_scaled = text_scaler.fit_transform(X_text)

# Helpfulness ratio. A zero denominator yields NaN (0/0) or +/-inf (n/0),
# which ElasticNet cannot consume, so those rows are given a ratio of 0.
helpfulness_ratio = df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']
df['HelpfulnessRatio'] = helpfulness_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)
X_other = df[['HelpfulnessRatio']].values

# Scale the ratio with its own scaler, so the text scaler above is not
# overwritten. It keeps the name `scaler` because the submission cell
# applies `scaler.transform` to the test-set ratio column.
scaler = MaxAbsScaler()
X_other_scaled = scaler.fit_transform(X_other)

# Concatenate the two feature matrices
X = np.concatenate((X_text_scaled, X_other_scaled), axis=1)

# Ensure a float matrix; copy=False avoids duplicating an array that is
# already float64.
X = X.astype(float, copy=False)

# Define the ElasticNet model and fit it, so later cells can call predict().
# Rows without a Score (if there are any) cannot be used for supervised
# fitting and are left out.
model = ElasticNet(alpha=0.1, l1_ratio=0.5)
has_score = df['Score'].notna().to_numpy()
X_train = X if has_score.all() else X[has_score]
model.fit(X_train, df.loc[has_score, 'Score'])

Index(['Id', 'ProductId', 'UserId', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Time', 'Summary', 'Text', 'Score'],
      dtype='object')
35000


In [3]:
X_submission = pd.read_csv("drive/MyDrive/CS506-DS/midterm/data/X_test.csv")
# Select the columns needed for prediction; .copy() so the assignments below
# modify an independent frame instead of a slice of the loaded one.
X_submission = X_submission[['Id', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Summary', 'Text']].copy()

# Create the helpfulness ratio. A zero denominator yields NaN or +/-inf,
# which the model cannot consume, so those rows are given a ratio of 0.
submission_ratio = X_submission['HelpfulnessNumerator'] / X_submission['HelpfulnessDenominator']
X_submission['HelpfulnessRatio'] = submission_ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

# Convert the Summary and Text columns to string data type
X_submission['Summary'] = X_submission['Summary'].astype(str)
X_submission['Text'] = X_submission['Text'].astype(str)

# Transform (do NOT re-fit) with the vectorizer fitted on the training data,
# so every column means the same feature it meant during training.
submission_text = X_submission['Summary'] + ' ' + X_submission['Text']
new_X_text = vectorizer.transform(submission_text).toarray()

# Scale the text features with statistics taken from the training matrix
# X_text rather than from the test data.
train_text_scaler = MaxAbsScaler().fit(X_text)
new_X_text_scaled = train_text_scaler.transform(new_X_text)

# `scaler` was last fitted on the training helpfulness ratio; reuse it here
X_submission_ratio_scaled = scaler.transform(X_submission[['HelpfulnessRatio']].values)

# Concatenate the two feature matrices (same column order as in training)
new_X = np.concatenate((new_X_text_scaled, X_submission_ratio_scaled), axis=1)

# predict() needs a fitted estimator. Fit on the training matrix here only
# if that has not been done yet; rows without a Score are left out and any
# non-finite feature value is replaced by 0.
if not hasattr(model, 'coef_'):
    has_score = df['Score'].notna().to_numpy()
    X_fit = np.nan_to_num(X[has_score], copy=False, nan=0.0, posinf=0.0, neginf=0.0)
    model.fit(X_fit, df.loc[has_score, 'Score'])

# Make predictions on the test data. ElasticNet.predict takes only the
# feature matrix; `predict_disable_shape_check` is a LightGBM option.
X_submission['Score'] = model.predict(new_X)

# Select the columns to be included in the final submission file
submission = X_submission[['Id', 'Score']]

# Save the submission file
submission.to_csv("submission.csv", index=False)