# modules

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import re
from textblob import TextBlob

from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

from sklearn.feature_selection import SelectKBest, f_classif

from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import lightgbm as lgb
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.feature_selection import RFE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA

In [321]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PCPRODZ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PCPRODZ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PCPRODZ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# utils

In [345]:
def submit(predictions, filename):
    sub = {
        'ID': [],
        'Label': []
    }
    for i, p in enumerate(predictions):
        sub['ID'].append(i)
        sub['Label'].append('Y' if p==1 else 'N')   

    df = pd.DataFrame(sub)
    df.to_csv(f'./submission/{filename}.csv', index=False)

# data

In [7]:
df_train = pd.read_csv('./data/training_data.csv')
df_test = pd.read_csv('./data/testing_data.csv')

In [8]:
df_train['Label'].replace({'Y': 1, 'N': 0}, inplace=True)

## explore

In [337]:
df_train.head(1)

Unnamed: 0,id,Date,review ID,reviewer ID,product ID,rating_Helpful,rating_Thanks,rating_LoveThis,rating_OhNo,reviews,Label
0,0,5/17/2009,0dFa6egshOwhusL8aSMw-Q,8GC6cFcby0stKarnzL9i2w,dKcO9OQ44RPRlkWe-vToFA,0,0,0,4,Just got back from Shaw's. Great oysters. They...,1


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47176 entries, 0 to 47175
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               47176 non-null  int64 
 1   Date             47176 non-null  object
 2   review ID        47176 non-null  object
 3   reviewer ID      47176 non-null  object
 4   product ID       47176 non-null  object
 5   rating_Helpful   47176 non-null  int64 
 6   rating_Thanks    47176 non-null  int64 
 7   rating_LoveThis  47176 non-null  int64 
 8   rating_OhNo      47176 non-null  int64 
 9   reviews          47176 non-null  object
 10  Label            47176 non-null  object
dtypes: int64(5), object(6)
memory usage: 4.0+ MB


In [6]:
df_train.describe()

Unnamed: 0,id,rating_Helpful,rating_Thanks,rating_LoveThis,rating_OhNo
count,47176.0,47176.0,47176.0,47176.0,47176.0
mean,23587.5,0.45,0.53,0.84,3.93
std,13618.68,1.65,1.57,1.93,1.13
min,0.0,0.0,0.0,0.0,1.0
25%,11793.75,0.0,0.0,0.0,3.0
50%,23587.5,0.0,0.0,0.0,4.0
75%,35381.25,0.0,1.0,1.0,5.0
max,47175.0,78.0,68.0,78.0,5.0


## preprocessing

### general

In [348]:
train_reviewers = df_train['reviewer ID'].value_counts().index
test_reviewers = df_test['reviewer ID'].value_counts().index

not_present_in_train = 0
for t in test_reviewers:
    if t not in train_reviewers:
        not_present_in_train += 1
        
print(not_present_in_train)

8617


In [349]:
all_data = pd.concat([df_train, df_test])
reviewer_counts = all_data['reviewer ID'].value_counts()

# reviewer_counts = df_train['reviewer ID'].value_counts()

df_train['reviewer ID'] = df_train['reviewer ID'].map(reviewer_counts)
df_test['reviewer ID'] = df_test['reviewer ID'].map(reviewer_counts)
# df_test['reviewer ID'] = df_test['reviewer ID'].fillna(0)

In [350]:
percentage_y_per_product_train = df_train.groupby('product ID')['Label'].apply(lambda x: (x == 'Y').sum())

df_train['product ID'] = df_train['product ID'].map(percentage_y_per_product_train)
df_test['product ID'] = df_test['product ID'].map(percentage_y_per_product_train)

### time

In [342]:
df_train['Date'] = pd.to_datetime(df_train['Date'])
df_test['Date'] = pd.to_datetime(df_test['Date'])

def generate_date_features(df, date_column='Date'):

    df['year'] = df[date_column].dt.year
    df['month'] = df[date_column].dt.month
    df['day'] = df[date_column].dt.day
    df['weekday'] = df[date_column].dt.weekday  # Monday is 0 and Sunday is 6
    df['quarter'] = df[date_column].dt.quarter
    df['is_weekend'] = (df[date_column].dt.weekday // 5).astype(int)  # 1 if weekend, 0 if not

    # Cyclical features for circular patterns (e.g., day of week, month)
    df['weekday_sin'] = df[date_column].dt.dayofweek.apply(lambda x: np.sin(2 * np.pi * x / 7))
    df['weekday_cos'] = df[date_column].dt.dayofweek.apply(lambda x: np.cos(2 * np.pi * x / 7))
    df['month_sin'] = df[date_column].dt.month.apply(lambda x: np.sin(2 * np.pi * x / 12))
    df['month_cos'] = df[date_column].dt.month.apply(lambda x: np.cos(2 * np.pi * x / 12))

    return df

generate_date_features(df_train)
generate_date_features(df_test)

Unnamed: 0,id,Date,review ID,reviewer ID,product ID,rating_Helpful,rating_Thanks,rating_LoveThis,rating_OhNo,reviews,year,month,day,weekday,quarter,is_weekend,weekday_sin,weekday_cos,month_sin,month_cos
0,0,2007-11-25,EpUIAOmCal3KLpwfRPwaSw,5,0,0,0,0,4,"Great pizza, good location, and a parking lot....",2007,11,25,6,4,1,-0.78,0.62,-0.50,0.87
1,1,2009-03-16,WP8YNEOrIYkA-JD1pj4SoA,5,0,0,0,0,5,This is my favorite place in Chicago. The food...,2009,3,16,0,1,0,0.00,1.00,1.00,0.00
2,2,2009-11-09,fIklWlw56IGRosS,2,0,0,0,0,4,A few friends and I were visiting our other fr...,2009,11,9,0,4,0,0.00,1.00,-0.50,0.87
3,3,2010-05-12,7wVIW6OChqj4Y4y7OiuLVw,12,0,1,0,1,3,How do I put this politely without offending m...,2010,5,12,2,2,0,0.97,-0.22,0.50,-0.87
4,4,2012-08-26,GdMImdnQta4l3AkQILj2HA,1,0,0,0,0,3,"Traveling through Chicago for business, they b...",2012,8,26,6,3,1,-0.78,0.62,-0.87,-0.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20214,20214,2011-11-03,XjOY6cGX0oT7PI5YI8Nj7A,2,0,0,0,0,4,"First of all, you'll love the concept! Salad w...",2011,11,3,3,4,0,0.43,-0.90,-0.50,0.87
20215,20215,2009-06-12,72dCIymfps2-2Tqylc,8,0,0,1,1,2,I finally went to try Tank after seeing all th...,2009,6,12,4,2,0,-0.43,-0.90,0.00,-1.00
20216,20216,2009-12-07,liHyGWl-RjnWuIqYIGOocw,2,0,0,1,0,3,I would have to say that we went to Simply It ...,2009,12,7,0,4,0,0.00,1.00,-0.00,1.00
20217,20217,2009-08-12,yF3JrC073ch0mSdrbJOz0g,1,0,0,0,0,4,A little pricey but always delicious. If I liv...,2009,8,12,2,3,0,0.97,-0.22,-0.87,-0.50


In [343]:
mean_day_spam = df_train.groupby('day')['Label'].mean()
df_train['mean_daily_spam'] = df_train['day'].map(mean_day_spam)
df_test['mean_daily_spam'] = df_test['day'].map(mean_day_spam)

mean_month_spam = df_train.groupby('month')['Label'].mean()
df_train['mean_monthly_spam'] = df_train['month'].map(mean_month_spam)
df_test['mean_monthly_spam'] = df_test['month'].map(mean_month_spam)

mean_year_spam = df_train.groupby('year')['Label'].mean()
df_train['mean_yearly_spam'] = df_train['year'].map(mean_year_spam)
df_test['mean_yearly_spam'] = df_test['year'].map(mean_year_spam)

### language

In [330]:
def most_repeated_words(text_series, top_n=10):
    all_text = ' '.join(text_series)
    words = word_tokenize(all_text)
    stop_words = set(stopwords.words('english'))
    words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]
    word_counts = Counter(words)
    most_common_words = word_counts.most_common(top_n)
    return most_common_words


In [353]:
def calculate_text_features(review):
    # Basic features
    word_count = len(review.split())
    char_count = len(review)
    avg_word_length = sum(len(word) for word in review.split()) / word_count

    # Case-related features
    uppercase_count = sum(1 for char in review if char.isupper())
    lowercase_count = sum(1 for char in review if char.islower())
    uppercase_lowercase_ratio = uppercase_count / (lowercase_count + 1)  # Adding 1 to avoid division by zero

    # Punctuation-related features
    punctuation_count = sum(1 for char in review if char in '.,;:!?')

    # Word-related features
    capitalized_word_count = sum(1 for word in review.split() if word.isupper())
    stop_words = set(stopwords.words('english'))
    stopword_count = sum(1 for word in review.split() if word.lower() in stop_words)
    unique_word_count = len(set(review.split()))
    repetition_ratio = (word_count - unique_word_count) / (word_count + 1)  # Adding 1 to avoid division by zero

    # Sentiment-related features
    sentiment = TextBlob(review).sentiment.polarity
    subjectivity = TextBlob(review).sentiment.subjectivity

    # Miscellaneous features
    exclamation_mark_count = review.count('!')
    question_mark_count = review.count('?')
    numeric_count = sum(1 for char in review if char.isnumeric())
    url_count = len(re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', review))
    
    features = {
        'word_count': word_count,
        'char_count': char_count,
        'avg_word_length': avg_word_length,
        'uppercase_lowercase_ratio': uppercase_lowercase_ratio,
        'punctuation_count': punctuation_count,
        'capitalized_word_count': capitalized_word_count,
        'stopword_count': stopword_count,
        'unique_word_count': unique_word_count,
        'repetition_ratio': repetition_ratio,
        'sentiment': sentiment,
        'subjectivity': subjectivity,
        'exclamation_mark_count': exclamation_mark_count,
        'question_mark_count': question_mark_count,
        'numeric_count': numeric_count,
        'url_count': url_count,
    }

    return features

In [354]:
df_train['text_features'] = df_train['reviews'].apply(calculate_text_features)
df_train = pd.concat([df_train, pd.DataFrame(df_train['text_features'].to_list())], axis=1)
df_train = df_train.drop('text_features', axis=1)

df_test['text_features'] = df_test['reviews'].apply(calculate_text_features)
df_test = pd.concat([df_test, pd.DataFrame(df_test['text_features'].to_list())], axis=1)
df_test = df_test.drop('text_features', axis=1)

# X, y

## Feature selection

In [355]:
text_features = [
    'word_count',
    'char_count',
    'avg_word_length',
    'uppercase_lowercase_ratio',
    'punctuation_count',
    'capitalized_word_count',
    'stopword_count',
    'unique_word_count',
    'repetition_ratio',
    'sentiment',
    'subjectivity',
    'exclamation_mark_count',
    'question_mark_count',
    'numeric_count',
    'url_count',
]

# time_features = [
#     # 'year', 'month', 'day', 'weekday', 'quarter', 'is_weekend', 'weekday_sin', 'weekday_cos', 'month_sin', 'month_cos',
#     'mean_daily_spam', 'mean_monthly_spam', 'mean_yearly_spam'             
# ]

features = ['product ID', 'reviewer ID', 'rating_Helpful', 'rating_Thanks', 'rating_LoveThis', 'rating_OhNo']
# features = ['reviewer ID', 'rating_Helpful', 'rating_Thanks', 'rating_LoveThis', 'rating_OhNo']
            
# X = df_train[features + time_features + text_features]
X = df_train[features + text_features]
# X_test = df_test[features + time_features + text_features]
X_test = df_test[features + text_features]

y = df_train['Label']

### mutual info

In [293]:
mutual_info = mutual_info_classif(X, y)

feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Mutual Information': mutual_info})
feature_importance_df = feature_importance_df.sort_values(by='Mutual Information', ascending=False)

print(feature_importance_df)

           Feature  Mutual Information
2  rating_LoveThis                0.03
3      rating_OhNo                0.02
0   rating_Helpful                0.02
1    rating_Thanks                0.01


In [153]:
selected_features = feature_importance_df[feature_importance_df['Mutual Information'] > 0.01]['Feature'].values
selected_features

array(['rating_LoveThis', 'reviewer ID', 'rating_Thanks',
       'rating_Helpful', 'unique_word_count', 'rating_OhNo', 'char_count',
       'sentiment', 'punctuation_count'], dtype=object)

In [154]:
X = X[selected_features]
X_test = X_test[selected_features]

### feature importance

In [356]:
random_forest_model = RandomForestClassifier(random_state=42)

random_forest_model.fit(X, y)

feature_importances = random_forest_model.feature_importances_

print("Feature Importances:")
for feature, importance in zip(X.columns, feature_importances):
    print(f"{feature}: {importance:.4f}")

Feature Importances:
product ID: 0.0000
reviewer ID: 0.0882
rating_Helpful: 0.0186
rating_Thanks: 0.0248
rating_LoveThis: 0.0660
rating_OhNo: 0.0412
word_count: 0.0568
char_count: 0.0707
avg_word_length: 0.0776
uppercase_lowercase_ratio: 0.0752
punctuation_count: 0.0504
capitalized_word_count: 0.0338
stopword_count: 0.0556
unique_word_count: 0.0577
repetition_ratio: 0.0655
sentiment: 0.0790
subjectivity: 0.0748
exclamation_mark_count: 0.0291
question_mark_count: 0.0094
numeric_count: 0.0247
url_count: 0.0009


### scaling

In [49]:
scaler = RobustScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

### pca

In [57]:
pca = PCA(n_components=0.95)

X = pca.fit_transform(X)
X_test = pca.transform(X_test)

### Kbest features

In [62]:
selector = SelectKBest(score_func=f_classif, k=2)
X_pd = pd.DataFrame(X)
X = selector.fit_transform(X_pd, y)

selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = X_pd.columns[selected_feature_indices]

print("Selected Feature Names:", selected_feature_names)

Selected Feature Names: Index(['reviewer ID', 'rating_Helpful', 'rating_Thanks', 'rating_LoveThis',
       'word_count', 'char_count', 'punctuation_count', 'stopword_count',
       'unique_word_count', 'repetition_ratio'],
      dtype='object')


### RFE

In [294]:
model = RandomForestClassifier()

rfe = RFE(model, n_features_to_select=10)
X_selected = rfe.fit_transform(X, y)

# Training

In [315]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

## Model selection

In [195]:
clf = LazyClassifier(verbose=0, ignore_warnings=True)

# models, predictions = clf.fit(X, y)
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

accuracy = models.Accuracy
best_models = accuracy[accuracy > 0.85]
best_models.sort_values(ascending=False)

100%|██████████| 29/29 [04:40<00:00,  9.66s/it]


Model
RidgeClassifier              0.87
RidgeClassifierCV            0.87
LinearSVC                    0.87
SGDClassifier                0.87
DummyClassifier              0.87
SVC                          0.87
AdaBoostClassifier           0.87
LGBMClassifier               0.87
RandomForestClassifier       0.87
LogisticRegression           0.87
LinearDiscriminantAnalysis   0.87
CalibratedClassifierCV       0.87
ExtraTreesClassifier         0.87
XGBClassifier                0.86
BaggingClassifier            0.86
KNeighborsClassifier         0.85
Name: Accuracy, dtype: float64

## Individual Tuning

### XGB

In [316]:
xgb_model = XGBClassifier()

scores = cross_val_score(xgb_model, X, y, cv=5, scoring='accuracy')

print("Cross-Validation Scores:")
for fold, score in enumerate(scores, 1):
    print(f"Fold {fold}: {score:.4f}")

mean_accuracy = scores.mean()
print(f"Mean Accuracy: {mean_accuracy:.4f}")

Cross-Validation Scores:
Fold 1: 0.8764
Fold 2: 0.8789
Fold 3: 0.8751
Fold 4: 0.8746
Fold 5: 0.8733
Mean Accuracy: 0.8757


In [319]:
xgb_model = XGBClassifier()

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 10]
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best Hyperparameters: ", grid_search.best_params_)

best_model = grid_search.best_estimator_
val_accuracy = best_model.score(X_val, y_val)
print("Val Accuracy: {:.4f}%".format(val_accuracy * 100))

Best Hyperparameters:  {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 300}
Val Accuracy: 87.5689%


In [226]:
predictions = best_model.predict(X_test)

In [227]:
pd.Series(predictions).value_counts()

0    19774
1      445
dtype: int64

In [241]:
thresholds = [0.67, 0.68, 0.69, 0.7]

for threshold in thresholds:
    probas = best_model.predict_proba(X_test)[:, 1]  # Assuming you want probabilities for the positive class
    predictions = [1 if p > threshold else 0 for p in probas]
    
    print("Threshold:", threshold)
    print("Predictions mean:", sum(predictions) / len(predictions))

Threshold: 0.67
Predictions mean: 0.021761709283347346
Threshold: 0.68
Predictions mean: 0.021761709283347346
Threshold: 0.69
Predictions mean: 0.0
Threshold: 0.7
Predictions mean: 0.0


In [235]:
predictions = [1 if p > 0.38 else 0 for p in best_model.predict_proba(X_test)[:, 1]]

In [237]:
submit(predictions, 'xgb_0.38')

### LGBM

In [196]:
lgb_model = lgb.LGBMClassifier()

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [None, 5, 10]
}

grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

best_model = grid_search.best_estimator_
val_accuracy = best_model.score(X, y)
print("Val Accuracy: {:.2f}%".format(val_accuracy * 100))

Best Hyperparameters:  {'learning_rate': 0.01, 'max_depth': None, 'n_estimators': 100}
Best Accuracy: 86.77%
Val Accuracy: 86.77%


### RF

In [132]:
rf_model = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [5, 10],
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X, y)

print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

best_model = grid_search.best_estimator_
val_accuracy = best_model.score(X, y)
print("Val Accuracy: {:.2f}%".format(val_accuracy * 100))

Best Hyperparameters:  {'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 100}
Best Accuracy: 87.57%
Val Accuracy: 87.57%


### Adaboost

In [133]:
ada_model = AdaBoostClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1]
}

grid_search = GridSearchCV(estimator=ada_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X, y)

print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

best_model = grid_search.best_estimator_
val_accuracy = best_model.score(X, y)
print("Val Accuracy: {:.2f}%".format(val_accuracy * 100))

Best Hyperparameters:  {'learning_rate': 0.05, 'n_estimators': 100}
Best Accuracy: 87.55%
Val Accuracy: 87.55%


In [136]:
y.value_counts() / y.shape[0]

0   0.87
1   0.13
Name: Label, dtype: float64

## Ensemble

### Voting

In [54]:
lgbm_classifier = LGBMClassifier(learning_rate=0.05, max_depth= 5, n_estimators= 200, random_state=42)
rf_classifier = RandomForestClassifier(max_depth=None, min_samples_leaf=5, min_samples_split=5, n_estimators=200, random_state=42)
ada_classifier = AdaBoostClassifier(learning_rate=0.1, n_estimators=200, random_state=42)
xgb_classifier = XGBClassifier(random_state=42)
cat_classifier = CatBoostClassifier(random_state=42, verbose=0)
svc_classifier = SVC(probability=True, random_state=42)
logreg_classifier = LogisticRegression(random_state=42)

voting_classifier = VotingClassifier(
    estimators=[
        ('lgbm', lgbm_classifier),
        ('rf', rf_classifier),
        ('ada', ada_classifier),
        # ('xgb', xgb_classifier),
        # ('cat', cat_classifier),
        # ('svc', svc_classifier),
        # ('logreg', logreg_classifier)
    ],
    voting='hard'
)

In [55]:
cross_val_results = cross_val_score(voting_classifier, X, y, cv=5, scoring='accuracy')
print("Cross Validation Accuracies: ", cross_val_results)
print("Cross Validation Accuracy: {:.2f}%".format(cross_val_results.mean() * 100))

Cross Validation Accuracies:  [0.87749046 0.8808691  0.87673556 0.87705352 0.87525172]
Cross Validation Accuracy: 87.75%


In [43]:
voting_classifier.fit(X_train, y_train)

predictions = voting_classifier.predict(X_val)
accuracy = accuracy_score(y_val, predictions)
print("Val Accuracy: {:.2f}%".format(accuracy * 100))

Val Accuracy: 87.59%


In [46]:
voting_classifier.fit(X, y)

In [48]:
predictions = voting_classifier.predict(X_test)

sub = {
    'ID': [],
    'Label': []
}

for i, p in enumerate(predictions):
  sub['ID'].append(i)
  sub['Label'].append('Y' if p==1 else 'N')

df = pd.DataFrame(sub)
df.to_csv('./submission/voting.csv', index=False)

### stacking

In [137]:
lgbm_classifier = LGBMClassifier(learning_rate=0.05, max_depth=5, n_estimators=200, random_state=42)
rf_classifier = RandomForestClassifier(max_depth=None, min_samples_leaf=5, min_samples_split=5, n_estimators=200, random_state=42)
ada_classifier = AdaBoostClassifier(learning_rate=0.1, n_estimators=200, random_state=42)

meta_classifier = LogisticRegression(random_state=42)

stacking_classifier = StackingClassifier(
    estimators=[
        ('lgbm', lgbm_classifier),
        ('rf', rf_classifier),
        ('ada', ada_classifier)
    ],
    final_estimator=meta_classifier,
    cv=5
)

stacking_classifier.fit(X, y)

stacking_predictions = stacking_classifier.predict(X)
accuracy = accuracy_score(y, stacking_predictions)
print(f"Stacking Model Accuracy: {accuracy:.3f}")

Stacking Model Accuracy: 0.876


In [74]:
predictions = stacking_classifier.predict(X_test)

## NNs

In [101]:
model = Sequential([
    Dense(128, input_shape=(X.shape[1],), activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])


In [102]:
model.compile(optimizer=Adam(learning_rate=1e-2), loss='binary_crossentropy', metrics=['accuracy'])

In [103]:
model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ec711b2940>

## LSTMs

In [9]:
X_text = df_train[['reviews']]
X_text_test = df_test[['reviews']]

In [None]:
y_text = df_train['Label']
y_text.replace({'Y': 1, 'N':0})

In [21]:
df_train.shape, y_text.shape

((47176, 11), (47176,))

### preprocessing

In [10]:
X_text['reviews'] = X_text['reviews'].str.lower()
X_text_test['reviews'] = X_text_test['reviews'].str.lower()

In [11]:
X_text['reviews'] = X_text['reviews'].str.replace('[^\w\s]', '')
X_text_test['reviews'] = X_text_test['reviews'].str.replace('[^\w\s]', '')

In [12]:
X_text['reviews'] = X_text['reviews'].apply(word_tokenize)
X_text_test['reviews'] = X_text_test['reviews'].apply(word_tokenize)

In [13]:
stop_words = set(stopwords.words('english'))

X_text['reviews'] = X_text['reviews'].apply(lambda tokens: [word for word in tokens if word not in stop_words])
X_text_test['reviews'] = X_text_test['reviews'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

In [None]:
stemmer = PorterStemmer()

X_text['reviews'] = X_text['reviews'].apply(lambda tokens: [stemmer.stem(word) for word in tokens])
X_text_test['reviews'] = X_text_test['reviews'].apply(lambda tokens: [stemmer.stem(word) for word in tokens])

In [84]:
lemmatizer = WordNetLemmatizer()

X_text['reviews'] = X_text['reviews'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])
X_text_test['reviews'] = X_text_test['reviews'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])

In [14]:
X_text['reviews'] = X_text['reviews'].apply(lambda tokens: ' '.join(tokens))
X_text_test['reviews'] = X_text_test['reviews'].apply(lambda tokens: ' '.join(tokens))

In [24]:
df_clean = pd.concat([X_text, y_text], axis = 1)
df_clean.to_csv('clean.csv', index=False)

In [25]:
X_text_test.to_csv('clean_test.csv', index=False)

### training

In [61]:
all_text = pd.concat([X_text['reviews'], X_text_test['reviews']])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_text)

num_unique_words = len(tokenizer.word_index)
print("Number of Unique Words:", num_unique_words)

Number of Unique Words: 101201


In [62]:
max_words = num_unique_words

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(all_text)

X_train_sequences = tokenizer.texts_to_sequences(X_text['reviews'])
X_test_sequences = tokenizer.texts_to_sequences(X_text_test['reviews'])

In [63]:
max_len = max(len(seq) for seq in X_train_sequences)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len, padding='post')

In [64]:
embedding_dim = 64

model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    Bidirectional(LSTM(32, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(1, activation='sigmoid')
])

In [65]:
model.compile(optimizer=Adam(learning_rate=5e-2), loss='binary_crossentropy', metrics=['accuracy'])

In [66]:
model.fit(X_train_padded, y_text, epochs=2, batch_size=128)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7f9f84356260>

In [67]:
preds = model.predict(X_test_padded)



In [68]:
predictions = ['Y' if x[0]>0.5 else 'N' for x in preds]

In [69]:
pd.Series(predictions).value_counts()

N    20219
dtype: int64

In [60]:
sub = {
    'ID': [],
    'Label': []
}

for i, p in enumerate(predictions):
  sub['ID'].append(i)
  sub['Label'].append(p)

df = pd.DataFrame(sub)
df.to_csv('./lstm.csv', index=False)