In [2]:
%pip install -r requirements.txt

Collecting pandas==1.4.3
  Downloading pandas-1.4.3-cp39-cp39-macosx_11_0_arm64.whl (10.5 MB)
[K     |████████████████████████████████| 10.5 MB 3.9 MB/s eta 0:00:01
[?25hCollecting numpy==1.22.4
  Using cached numpy-1.22.4-cp39-cp39-macosx_11_0_arm64.whl (12.8 MB)
Collecting scipy
  Downloading scipy-1.9.3-cp39-cp39-macosx_12_0_arm64.whl (28.6 MB)
[K     |████████████████████████████████| 28.6 MB 3.3 MB/s eta 0:00:01
[?25hCollecting nltk==3.7
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting scikit-learn
  Downloading scikit_learn-1.1.3-cp39-cp39-macosx_12_0_arm64.whl (7.7 MB)
[K     |████████████████████████████████| 7.7 MB 3.8 MB/s eta 0:00:01
[?25hCollecting gensim==4.2.0
  Using cached gensim-4.2.0-cp39-cp39-macosx_11_0_arm64.whl
Collecting pytz>=2020.1
  Downloading pytz-2022.6-py2.py3-none-any.whl (498 kB)
[K     |████████████████████████████████| 498 kB 3.0 MB/s eta 0:00:01
Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[K     |████████

In [48]:
import pandas as pd
import numpy as np

import os
import time
import pickle

from scipy.sparse import vstack
from nltk import word_tokenize

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit

from gensim.models import FastText

## Data Overview

In [4]:
df_train = pd.read_csv('dataset/train_preprocess.csv')

In [5]:
df_train.head()

Unnamed: 0,review,ac,air_panas,bau,general,kebersihan,linen,service,sunrise_meal,tv,wifi
0,kebersihan kurang...,neut,neut,neut,neut,neg,neut,neut,neut,neut,neut
1,"sangat mengecewakan... hotel bad image, kebers...",neut,neut,neut,neut,neg,neut,neut,neut,neut,neut
2,Tempat nyaman bersih tapi tv terlalu tinggi ti...,neut,neut,neut,neut,pos,neut,neut,neut,neg,neut
3,"semuanya bagus sesuai profile,dan harga promo ...",neut,neg,neut,pos,neut,neut,neut,neut,neut,neut
4,"Tempat tidur sangat keras, bantal besar dan ke...",neg,neg,neut,neut,neut,neg,neut,neut,neut,neut


In [6]:
colums = df_train.columns.to_list()
colums.remove('review')

In [7]:
colums

['ac',
 'air_panas',
 'bau',
 'general',
 'kebersihan',
 'linen',
 'service',
 'sunrise_meal',
 'tv',
 'wifi']

In [8]:
for col in colums:
    count = df_train[col].value_counts()

    if not "neg_pos" in count:
        count['neg_pos'] = 0
        
    print("Columns: {}, neutral: {}, positive: {}, negative: {}, negative_positive: {}, total: {}".format(col, count['neut'], count['pos'], count['neg'], count['neg_pos'], count.sum()))

Columns: ac, neutral: 1814, positive: 51, negative: 417, negative_positive: 1, total: 2283
Columns: air_panas, neutral: 1922, positive: 26, negative: 335, negative_positive: 0, total: 2283
Columns: bau, neutral: 1911, positive: 12, negative: 360, negative_positive: 0, total: 2283
Columns: general, neutral: 2023, positive: 230, negative: 30, negative_positive: 0, total: 2283
Columns: kebersihan, neutral: 1350, positive: 205, negative: 722, negative_positive: 6, total: 2283
Columns: linen, neutral: 1613, positive: 63, negative: 606, negative_positive: 1, total: 2283
Columns: service, neutral: 1649, positive: 247, negative: 386, negative_positive: 1, total: 2283
Columns: sunrise_meal, neutral: 2108, positive: 75, negative: 100, negative_positive: 0, total: 2283
Columns: tv, neutral: 2075, positive: 13, negative: 195, negative_positive: 0, total: 2283
Columns: wifi, neutral: 1928, positive: 25, negative: 330, negative_positive: 0, total: 2283


## Def Function

In [9]:
def train_fasttext(xtrain, save_path='fasttext.ft'):
    sentences = [word_tokenize(content.lower()) for content in xtrain]
    vectorizer = FastText(sentences, vector_size=300, window=3, min_count=1, workers=4, epochs=1000, sg=0, hs=0)
    vectorizer.save(save_path)
    print('fasttext model saved at '+save_path)

def norm_sent_vector(sentence, wv):
    vecs = [wv[word.lower()] for word in word_tokenize(sentence)] 
    norm_vecs = [vec / np.linalg.norm(vecs) for vec in vecs if np.linalg.norm(vecs) > 0]
    sent_vec = np.mean(norm_vecs, axis=0)
    return sent_vec

def hyperparam_tuning(xtrain, ytrain, xvalid, yvalid, classifier, param_grid):
    # combine train and valid
    x = vstack([xtrain, xvalid])
    y = ytrain + yvalid
    
    # create predefined split
    # -1 for all training and 0 for all validation
    ps = PredefinedSplit([-1] * len(ytrain) + [0] * len(yvalid))
    clf = GridSearchCV(classifier, param_grid, cv = ps)
    clf = clf.fit(x, y)
    return clf

def train_and_test(data_train, data_valid, data_test, feature="bow", classifier="nb", save_path=None, ft_path="fasttext.ft"):
    xtrain = data_train['review']
    xvalid = data_valid['review']
    xtest = data_test['review']

    colums = data_train.columns.to_list()
    colums.remove('review')

    if feature == "bow":
        vectorizer = CountVectorizer()
    elif feature == "tfidf":
        vectorizer = TfidfVectorizer()
    elif feature == "fasttext":
        vectorizer = FastText.load(ft_path).wv
    else:
        raise Exception('Feature unknown. Use "bow" or "tfidf" or "fasttext"')

    # transform
    if feature == "bow" or feature == "tfidf":
        vectorizer.fit(xtrain)
        xtrain = vectorizer.transform(xtrain)
        xvalid = vectorizer.transform(xvalid)
        xtest = vectorizer.transform(xtest)
    elif feature == "fasttext":
        scaler = MinMaxScaler()
        xtrain = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in xtrain])
        xvalid = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in xvalid])
        xtest = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in xtest])

    # all classifiers
    classifier_model = {"nb" : MultinomialNB(),
                        "svm": SVC(),
                        "lr" : LogisticRegression(),
                    }
    # all params for grid-search
    param_grids = {"nb" : {"alpha": np.linspace(0.001,1,50)},
                "svm": {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['rbf', 'linear']},
                "lr" : {'C': np.linspace(0.001,10,100)},
                }

    categorical = {}
    average_acc = 0
    for col in colums:
        ytrain = list(data_train[col])
        yvalid = list(data_valid[col])
        ytest = list(data_test[col])

        clf = hyperparam_tuning(xtrain, ytrain, xvalid, yvalid,
                                classifier=classifier_model[classifier],
                                param_grid=param_grids[classifier])

        if feature == "bow" or feature == "tfidf":
            pred = clf.predict(xtest.toarray())
        else:
            pred = clf.predict(xtest)

        f1 = f1_score(ytest, pred, average='macro')
        acc = accuracy_score(ytest, pred) 
        average_acc += acc

        categorical[col] = {'f1': f1, 'acc': acc}
        
        if save_path is not None:
            filename = save_path+'/'+feature+'/'+col
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, 'wb') as fout:
                pickle.dump((vectorizer, clf), fout)

    average_acc = average_acc / len(colums)
    return average_acc, categorical

def predict(text, model_path, feature='bow'):

    colums = ['ac', 'air_panas', 'bau', 'general', 'kebersihan', 'linen', 'service', 'sunrise_meal', 'tv', 'wifi']
    pred = {}
    for col in colums:
        with open(model_path+'/'+feature+'/'+col, 'rb') as f:
            vectorizer, clf = pickle.load(f)

            if feature == "bow" or feature == "tfidf":
                x = vectorizer.transform([text])
                pred[col] = clf.predict(x.toarray())[0]
            elif feature == "fasttext":
                scaler = MinMaxScaler()
                x = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in [text]])
                pred[col] = clf.predict(x)[0]

    return pred

In [10]:
data_train = pd.read_csv("dataset/train_preprocess.csv")
data_valid = pd.read_csv("dataset/valid_preprocess.csv")
data_test = pd.read_csv("dataset/test_preprocess.csv")

### Train FastText Model for Vectorizer

In [11]:
train_fasttext(data_train['review'], save_path='fasttext.ft')

fasttext model saved at fasttext.ft


### Train, Test and Save Classifier for FastText Vectorizer

In [49]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="fasttext", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

Average Accuracy: 0.8076923076923077
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.3560398916848293             0.8286713286713286            
air_panas       0.3075957313245449             0.8566433566433567            
bau             0.3069182389937107             0.8531468531468531            
general         0.3062381852551985             0.8496503496503497            
kebersihan      0.31546001153117537            0.5804195804195804            
linen           0.27191166321601107            0.6888111888111889            
service         0.27991886409736305            0.7237762237762237            
sunrise_meal    0.32                           0.9230769230769231            
tv              0.31553100061387357            0.8986013986013986            
wifi            0.3109452736318408             0.8741258741258742            


Predict 

In [54]:
text = "lumayan nyaman,tp kebersihan kmr mandi perlu ditingkatkan lg biar gk ada kuning2 di sudutnya lbh bgs"

start_time = time.time()

pred = predict(text, model_path="model/train1", feature="fasttext")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 24.70976185798645 seconds ---
ac              0.9545454545454546            
air_panas       0.9545454545454546            
bau             0.9545454545454546            
general         0.9545454545454546            
kebersihan      0.9545454545454546            
linen           0.9545454545454546            
service         0.9545454545454546            
sunrise_meal    0.9545454545454546            
tv              0.9545454545454546            
wifi            0.9545454545454546            


### Train, Test and Save Classifier for TFIDF Vectorizer

In [50]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="tfidf", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

Average Accuracy: 0.8692307692307694
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.5065623023402909             0.8776223776223776            
air_panas       0.47470777135517017            0.8986013986013986            
bau             0.45163492388558946            0.8811188811188811            
general         0.35138188771075957            0.8496503496503497            
kebersihan      0.5238118995748172             0.7552447552447552            
linen           0.5363641662295273             0.8426573426573427            
service         0.6226459819873824             0.8181818181818182            
sunrise_meal    0.3650273224043716             0.9265734265734266            
tv              0.48686679174484054            0.9265734265734266            
wifi            0.49487841113456876            0.916083916083916             


Predict

In [51]:
start_time = time.time()

pred = predict(text, model_path="model/train1", feature="tfidf")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 0.023123979568481445 seconds ---
ac              0.916083916083916             
air_panas       0.916083916083916             
bau             0.916083916083916             
general         0.916083916083916             
kebersihan      0.916083916083916             
linen           0.916083916083916             
service         0.916083916083916             
sunrise_meal    0.916083916083916             
tv              0.916083916083916             
wifi            0.916083916083916             


### Train, Test and Save Classifier for BoW Vectorizer

In [52]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="bow", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

Average Accuracy: 0.9031468531468532
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.6163453577764636             0.9440559440559441            
air_panas       0.5462501384734685             0.9265734265734266            
bau             0.5509052351157614             0.916083916083916             
general         0.47007575757575754            0.8846153846153846            
kebersihan      0.7713700755650752             0.8321678321678322            
linen           0.6464597075510863             0.8776223776223776            
service         0.7155265840442085             0.8496503496503497            
sunrise_meal    0.3887486243283485             0.9125874125874126            
tv              0.5333910533910534             0.9335664335664335            
wifi            0.5976338028169015             0.9545454545454546            


Predict

In [53]:
start_time = time.time()

pred = predict(text, model_path="model/train1", feature="bow")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 0.013634920120239258 seconds ---
ac              0.9545454545454546            
air_panas       0.9545454545454546            
bau             0.9545454545454546            
general         0.9545454545454546            
kebersihan      0.9545454545454546            
linen           0.9545454545454546            
service         0.9545454545454546            
sunrise_meal    0.9545454545454546            
tv              0.9545454545454546            
wifi            0.9545454545454546            
