In [10]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import warnings
warnings.filterwarnings('ignore')

# Pipeline for project 1

In [11]:
import pandas as pd
tr = pd.read_csv('./data/train.csv')
va = pd.read_csv('./data/valid.csv')
ts = pd.read_csv('./data/test.csv')
len(tr), len(va), len(ts)

(20000, 2000, 2000)

In [12]:
# add flag to identify where data is from
tr['category'] = 'train'
va['category'] = 'valid'
ts['category'] = 'test'
df = pd.concat([tr, va, ts])
df['category'].value_counts()

train    20000
test      2000
valid     2000
Name: category, dtype: int64

In [13]:
df.columns

Index(['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text',
       'useful', 'user_id', 'category'],
      dtype='object')

## Feature engineering

### Polarity and subjectivity from TextBlob

In [14]:
from textblob import TextBlob

sub = []
pol = []
for txt in df['text']:
    blob = TextBlob(txt).sentiment
    sub.append(blob.subjectivity)
    pol.append(blob.polarity)

df['subjectivity'] = sub
df['polarity'] = pol

df.columns

Index(['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text',
       'useful', 'user_id', 'category', 'subjectivity', 'polarity'],
      dtype='object')

### Postive, negative and objective score from SentiWorldNet

In [15]:
import re
import nltk
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import sentiwordnet as swn

def get_senti_word(s):
    lst = list(swn.senti_synsets(s))
    if len(lst) == 0:
        return np.array([None] * 3)
    else:
        pos = np.array([i.pos_score() for i in lst])
        neg = np.array([i.neg_score() for i in lst])
        obj = np.array([i.obj_score() for i in lst])
        return np.array([pos.mean(), neg.mean(), obj.mean()])

print(get_senti_word('life'))
print(get_senti_word('safsdfsdg'))

[0.01785714 0.         0.98214286]
[None None None]


In [16]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

def get_senti_txt(txt):
    try:
        txt = nltk.word_tokenize(txt)
#         ps = PorterStemmer()
#         txt = [ps.stem(token) for token in txt]
        txt = [re.sub(r'\W+', '', s) for s in txt]
        txt = [token for token in txt if token not in stopwords and not token.isnumeric()]

        txt_score = [get_senti_word(s) for s in txt]
        txt_pos = list(filter(None.__ne__, [s[0] for s in txt_score]))
        txt_neg = list(filter(None.__ne__, [s[1] for s in txt_score]))
        txt_obj = list(filter(None.__ne__, [s[2] for s in txt_score]))

        return np.mean(txt_pos), np.median(txt_pos), np.max(txt_pos), np.min(txt_pos), \
                np.mean(txt_neg), np.median(txt_neg), np.max(txt_neg), np.min(txt_neg), \
                np.mean(txt_obj), np.median(txt_obj), np.max(txt_obj), np.min(txt_obj)
    except:
        return np.array([None] * 12)

print(get_senti_txt('This is exciting!!!'))

(0.1875, 0.1875, 0.1875, 0.1875, 0.175, 0.175, 0.175, 0.175, 0.6375, 0.6375, 0.6375, 0.6375)


In [17]:
df['pos_mean'], df['pos_median'], df['pos_high'], df['pos_low'], \
df['neg_mean'], df['neg_median'], df['neg_high'], df['neg_low'], \
df['obj_mean'], df['obj_median'], df['obj_high'], df['obj_low'], \
= zip(*df['text'].map(get_senti_txt))

df.columns

Index(['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text',
       'useful', 'user_id', 'category', 'subjectivity', 'polarity', 'pos_mean',
       'pos_median', 'pos_high', 'pos_low', 'neg_mean', 'neg_median',
       'neg_high', 'neg_low', 'obj_mean', 'obj_median', 'obj_high', 'obj_low'],
      dtype='object')

In [18]:
# Save the data, as run all above would take much time
# df.to_csv('./data_basic.csv')

## Modelling

In [19]:
# Gather feature together
features = ['cool', 'funny', 'useful'] + list(df.columns[10:])
print(features)

['cool', 'funny', 'useful', 'subjectivity', 'polarity', 'pos_mean', 'pos_median', 'pos_high', 'pos_low', 'neg_mean', 'neg_median', 'neg_high', 'neg_low', 'obj_mean', 'obj_median', 'obj_high', 'obj_low']


In [20]:
df.fillna(0)

X = df[df.category == 'train'][features].astype(float)
y = df[df.category == 'train']['stars']
X_val = df[df.category == 'valid'][features].astype(float)
y_val = df[df.category == 'valid']['stars']
X_test = df[df.category == 'test'][features].astype(float)

len(X), len(y), len(X_val), len(y_val), len(X_test)

(20000, 20000, 2000, 2000, 2000)

### LightGBM

In [21]:
# import lightgbm as lgb
# from tqdm import tqdm
# from sklearn.model_selection import ParameterGrid

# train = lgb.Dataset(data=X, label=y)

# # tune the parameter with grid search (with cross validation)
# param_grid = {
#     'max_depth': [5, 7, None],
#     'min_data_in_leaf': [1, 4, 8, 16],
#     'feature_fraction': [0.3, 0.5, 0.7]
# }

# cv_results = []

# for hyperparams in tqdm(list(ParameterGrid(param_grid))):
#     fixed = {
#         "objective": 'regression',
#         "learning_rate": 0.01
#     }
#     hyperparams.update(fixed)

#     validation_summary = lgb.cv(
#         hyperparams,
#         train,
#         num_boost_round=6000,
#         nfold=5,
#         metrics=["rmse"],
#         early_stopping_rounds=500,
#         verbose_eval=None)

#     optimal_num_trees = len(validation_summary["rmse-mean"])
#     hyperparams["num_boost_round"] = optimal_num_trees
#     cv_results.append((hyperparams, validation_summary["rmse-mean"][-1]))

# # save the tuning results
# tmp = pd.DataFrame(cv_results)
# tmp.to_csv('./lgb_cv_result.csv', index=False)

In [22]:
import ast
import lightgbm as lgb
tmp = pd.read_csv('./lgb_cv_result.csv')
param = ast.literal_eval(list(tmp.sort_values(by=['1'])['0'])[0])

In [24]:
# extract best parameter set with least rmse
# param = list(tmp.sort_values(by=[1])[0])[0]
train = lgb.Dataset(data=X, label=y)
gbm = lgb.train(
    param, 
    train_set=train, 
    num_boost_round=param['num_boost_round'])

# save feature importance to csv
# importance = pd.DataFrame([X.columns, gbm.feature_importance()]).T
# importance.columns = ['feature', 'value']
# importance = importance.sort_values(by=['value'], ascending=False)
# importance.to_csv('./lgb_importance.csv', index=False)

In [27]:
# if predicted stars are out of bound, cast to 1 or 5; round to nearest integer otherwise
def round_star(star):
    if star < 1:
        return 1
    elif star > 5:
        return 5
    else:
        return round(star)

pre = df[df.category == 'test'][['review_id']]
pre['predicted_star'] = [round_star(s) for s in gbm.predict(X_test)]
pre.to_csv('./lgb_predict.csv', index=False)

## Evaluation

In [28]:
# LightGBM
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

y_val_pre = [round_star(s) for s in gbm.predict(X_val)]
acc = accuracy_score(y_val, y_val_pre)
p, r, f1, _ = precision_recall_fscore_support(y_val, y_val_pre, average="macro")
print('LightGBM: ')
print("accuracy:", acc, "\tprecision:", p, "\trecall:", r, "\tf1:", f1)

LightGBM: 
accuracy: 0.4485 	precision: 0.4687899120153495 	recall: 0.40073731519981315 	f1: 0.39445434093244286


In [30]:
df.category.value_counts()

train    20000
test      2000
valid     2000
Name: category, dtype: int64

In [32]:
pre = df[df.category == 'test'][['review_id']]
pre['predicted_star'] = gbm.predict(X_test)
pre.to_csv('./data/lgb_pred.csv', index=False)

val = df[df.category == 'valid'][['review_id']]
val['predicted_star'] = gbm.predict(X_val)
val.to_csv('./data/lgb_val.csv', index=False)