In [1]:
%pip install -r requirements.txt

You should consider upgrading via the '/Users/fadhil/Desktop/Project/absa/.absa-airy/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

import os
import time
import pickle

from scipy.sparse import vstack
from nltk import word_tokenize

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit

from gensim.models import FastText

## Data Overview

In [3]:
df_train = pd.read_csv('dataset/train_preprocess.csv')

In [4]:
df_train.head()

Unnamed: 0,review,ac,air_panas,bau,general,kebersihan,linen,service,sunrise_meal,tv,wifi
0,kebersihan kurang...,neut,neut,neut,neut,neg,neut,neut,neut,neut,neut
1,"sangat mengecewakan... hotel bad image, kebers...",neut,neut,neut,neut,neg,neut,neut,neut,neut,neut
2,Tempat nyaman bersih tapi tv terlalu tinggi ti...,neut,neut,neut,neut,pos,neut,neut,neut,neg,neut
3,"semuanya bagus sesuai profile,dan harga promo ...",neut,neg,neut,pos,neut,neut,neut,neut,neut,neut
4,"Tempat tidur sangat keras, bantal besar dan ke...",neg,neg,neut,neut,neut,neg,neut,neut,neut,neut


In [5]:
colums = df_train.columns.to_list()
colums.remove('review')

In [6]:
colums

['ac',
 'air_panas',
 'bau',
 'general',
 'kebersihan',
 'linen',
 'service',
 'sunrise_meal',
 'tv',
 'wifi']

In [7]:
for col in colums:
    count = df_train[col].value_counts()

    if not "neg_pos" in count:
        count['neg_pos'] = 0
        
    print("Columns: {}, neutral: {}, positive: {}, negative: {}, negative_positive: {}, total: {}".format(col, count['neut'], count['pos'], count['neg'], count['neg_pos'], count.sum()))

Columns: ac, neutral: 1814, positive: 51, negative: 417, negative_positive: 1, total: 2283
Columns: air_panas, neutral: 1922, positive: 26, negative: 335, negative_positive: 0, total: 2283
Columns: bau, neutral: 1911, positive: 12, negative: 360, negative_positive: 0, total: 2283
Columns: general, neutral: 2023, positive: 230, negative: 30, negative_positive: 0, total: 2283
Columns: kebersihan, neutral: 1350, positive: 205, negative: 722, negative_positive: 6, total: 2283
Columns: linen, neutral: 1613, positive: 63, negative: 606, negative_positive: 1, total: 2283
Columns: service, neutral: 1649, positive: 247, negative: 386, negative_positive: 1, total: 2283
Columns: sunrise_meal, neutral: 2108, positive: 75, negative: 100, negative_positive: 0, total: 2283
Columns: tv, neutral: 2075, positive: 13, negative: 195, negative_positive: 0, total: 2283
Columns: wifi, neutral: 1928, positive: 25, negative: 330, negative_positive: 0, total: 2283


### Def Function

In [8]:
def train_fasttext(xtrain, save_path='fasttext.ft'):
    sentences = [word_tokenize(content.lower()) for content in xtrain]
    vectorizer = FastText(sentences, vector_size=300, window=3, min_count=1, workers=4, epochs=1000, sg=0, hs=0)
    vectorizer.save(save_path)
    print('fasttext model saved at '+save_path)

def norm_sent_vector(sentence, wv):
    vecs = [wv[word.lower()] for word in word_tokenize(sentence)] 
    norm_vecs = [vec / np.linalg.norm(vecs) for vec in vecs if np.linalg.norm(vecs) > 0]
    sent_vec = np.mean(norm_vecs, axis=0)
    return sent_vec

def hyperparam_tuning(xtrain, ytrain, xvalid, yvalid, classifier, param_grid):
    # combine train and valid
    x = vstack([xtrain, xvalid])
    y = ytrain + yvalid
    
    # create predefined split
    # -1 for all training and 0 for all validation
    ps = PredefinedSplit([-1] * len(ytrain) + [0] * len(yvalid))
    clf = GridSearchCV(classifier, param_grid, cv = ps)
    clf = clf.fit(x, y)
    return clf

def train_and_test(data_train, data_valid, data_test, feature="bow", classifier="nb", save_path=None, ft_path="fasttext.ft"):
    xtrain = data_train['review']
    xvalid = data_valid['review']
    xtest = data_test['review']

    colums = data_train.columns.to_list()
    colums.remove('review')

    if feature == "bow":
        vectorizer = CountVectorizer()
    elif feature == "tfidf":
        vectorizer = TfidfVectorizer()
    elif feature == "fasttext":
        vectorizer = FastText.load(ft_path).wv
    else:
        raise Exception('Feature unknown. Use "bow" or "tfidf" or "fasttext"')

    # transform
    if feature == "bow" or feature == "tfidf":
        vectorizer.fit(xtrain)
        xtrain = vectorizer.transform(xtrain)
        xvalid = vectorizer.transform(xvalid)
        xtest = vectorizer.transform(xtest)
    elif feature == "fasttext":
        scaler = MinMaxScaler()
        xtrain = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in xtrain])
        xvalid = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in xvalid])
        xtest = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in xtest])

    # all classifiers
    classifier_model = {"nb" : MultinomialNB(),
                        "svm": SVC(),
                        "lr" : LogisticRegression(),
                    }
    # all params for grid-search
    param_grids = {"nb" : {"alpha": np.linspace(0.001,1,50)},
                "svm": {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['rbf', 'linear']},
                "lr" : {'C': np.linspace(0.001,10,100)},
                }

    categorical = {}
    average_acc = 0
    for col in colums:
        ytrain = list(data_train[col])
        yvalid = list(data_valid[col])
        ytest = list(data_test[col])

        clf = hyperparam_tuning(xtrain, ytrain, xvalid, yvalid,
                                classifier=classifier_model[classifier],
                                param_grid=param_grids[classifier])

        if feature == "bow" or feature == "tfidf":
            pred = clf.predict(xtest.toarray())
        else:
            pred = clf.predict(xtest)

        f1 = f1_score(ytest, pred, average='macro')
        acc = accuracy_score(ytest, pred) 
        average_acc += acc

        categorical[col] = {'f1': f1, 'acc': acc}
        
        if save_path is not None:
            filename = save_path+'/'+feature+'/'+classifier+'/'+col
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, 'wb') as fout:
                pickle.dump((vectorizer, clf), fout)

    average_acc = average_acc / len(colums)
    return average_acc, categorical

def predict(text, model_path, feature='bow', classifier='nb'):

    colums = ['ac', 'air_panas', 'bau', 'general', 'kebersihan', 'linen', 'service', 'sunrise_meal', 'tv', 'wifi']
    pred = {}
    for col in colums:
        with open(model_path+'/'+feature+'/'+classifier+'/'+col, 'rb') as f:
            vectorizer, clf = pickle.load(f)

            if feature == "bow" or feature == "tfidf":
                x = vectorizer.transform([text])
                pred[col] = clf.predict(x.toarray())[0]
            elif feature == "fasttext":
                scaler = MinMaxScaler()
                x = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in [text]])
                pred[col] = clf.predict(x)[0]

    return pred

In [9]:
data_train = pd.read_csv("dataset/train_preprocess.csv")
data_valid = pd.read_csv("dataset/valid_preprocess.csv")
data_test = pd.read_csv("dataset/test_preprocess.csv")

### Train FastText Model for Vectorizer

In [10]:
train_fasttext(data_train['review'], save_path='fasttext.ft')

fasttext model saved at fasttext.ft


### Train, Test and Save Classifier for FastText Vectorizer

#### Naive Bayes Classifier

In [11]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="fasttext", classifier="nb", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

Average Accuracy: 0.8073426573426573
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.3560398916848293             0.8286713286713286            
air_panas       0.3075957313245449             0.8566433566433567            
bau             0.3069182389937107             0.8531468531468531            
general         0.3062381852551985             0.8496503496503497            
kebersihan      0.3009145930451194             0.5769230769230769            
linen           0.27191166321601107            0.6888111888111889            
service         0.27991886409736305            0.7237762237762237            
sunrise_meal    0.32                           0.9230769230769231            
tv              0.31553100061387357            0.8986013986013986            
wifi            0.3109452736318408             0.8741258741258742            


Predict 

In [12]:
text = "lumayan nyaman,tp kebersihan kmr mandi perlu ditingkatkan lg biar gk ada kuning2 di sudutnya lbh bgs"

start_time = time.time()

pred = predict(text, model_path="model/train1", feature="fasttext", classifier="nb")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 100.7978527545929 seconds ---
ac              0.8741258741258742            
air_panas       0.8741258741258742            
bau             0.8741258741258742            
general         0.8741258741258742            
kebersihan      0.8741258741258742            
linen           0.8741258741258742            
service         0.8741258741258742            
sunrise_meal    0.8741258741258742            
tv              0.8741258741258742            
wifi            0.8741258741258742            


#### Support Vector Machine Classifier

In [13]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="fasttext", classifier="svm", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

Average Accuracy: 0.818881118881119
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.6040628209302908             0.9370629370629371            
air_panas       0.5306657347544994             0.8811188811188811            
bau             0.5030345471521942             0.8426573426573427            
general         0.324067046173308              0.8531468531468531            
kebersihan      0.2184834368530021             0.36363636363636365           
linen           0.5045463901299142             0.8111888111888111            
service         0.4139716026321894             0.7552447552447552            
sunrise_meal    0.3650273224043716             0.9265734265734266            
tv              0.5271407837445574             0.9370629370629371            
wifi            0.34814589870769647            0.8811188811188811            


Predict

In [14]:
start_time = time.time()

pred = predict(text, model_path="model/train1", feature="fasttext", classifier="svm")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 27.57681369781494 seconds ---
ac              0.8811188811188811            
air_panas       0.8811188811188811            
bau             0.8811188811188811            
general         0.8811188811188811            
kebersihan      0.8811188811188811            
linen           0.8811188811188811            
service         0.8811188811188811            
sunrise_meal    0.8811188811188811            
tv              0.8811188811188811            
wifi            0.8811188811188811            


#### Logistic Regression Classifier

In [16]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="fasttext", classifier="lr", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average Accuracy: 0.8251748251748253
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.6                            0.9335664335664335            
air_panas       0.4825918762088975             0.9020979020979021            
bau             0.5061106110611061             0.8986013986013986            
general         0.3407337128399747             0.8566433566433567            
kebersihan      0.32840722495894914            0.4825174825174825            
linen           0.4114790286975718             0.7377622377622378            
service         0.27991886409736305            0.7237762237762237            
sunrise_meal    0.32                           0.9230769230769231            
tv              0.444056486654252              0.9195804195804196            
wifi            0.3109452736318408             0.8741258741258742            


Predict

In [17]:
start_time = time.time()

pred = predict(text, model_path="model/train1", feature="fasttext", classifier="lr")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 22.777360200881958 seconds ---
ac              0.8741258741258742            
air_panas       0.8741258741258742            
bau             0.8741258741258742            
general         0.8741258741258742            
kebersihan      0.8741258741258742            
linen           0.8741258741258742            
service         0.8741258741258742            
sunrise_meal    0.8741258741258742            
tv              0.8741258741258742            
wifi            0.8741258741258742            


### Train, Test and Save Classifier for TFIDF Vectorizer

#### Naive Bayes Classifier

In [24]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="tfidf", classifier="nb", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

Average Accuracy: 0.8692307692307694
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.5065623023402909             0.8776223776223776            
air_panas       0.47470777135517017            0.8986013986013986            
bau             0.45163492388558946            0.8811188811188811            
general         0.35138188771075957            0.8496503496503497            
kebersihan      0.5238118995748172             0.7552447552447552            
linen           0.5363641662295273             0.8426573426573427            
service         0.6226459819873824             0.8181818181818182            
sunrise_meal    0.3650273224043716             0.9265734265734266            
tv              0.48686679174484054            0.9265734265734266            
wifi            0.49487841113456876            0.916083916083916             


Predict

In [25]:
start_time = time.time()

pred = predict(text, model_path="model/train1", classifier="nb", feature="tfidf")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 0.0409541130065918 seconds ---
ac              0.916083916083916             
air_panas       0.916083916083916             
bau             0.916083916083916             
general         0.916083916083916             
kebersihan      0.916083916083916             
linen           0.916083916083916             
service         0.916083916083916             
sunrise_meal    0.916083916083916             
tv              0.916083916083916             
wifi            0.916083916083916             


#### Support Vector Machine Classifier

In [26]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="tfidf", classifier="svm", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

Average Accuracy: 0.9356643356643358
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.6957498807820697             0.9615384615384616            
air_panas       0.5817921557400515             0.9440559440559441            
bau             0.5768970718260779             0.9335664335664335            
general         0.5277777777777778             0.9055944055944056            
kebersihan      0.857421940716505              0.8881118881118881            
linen           0.7470994291782822             0.8881118881118881            
service         0.8162228075309725             0.9055944055944056            
sunrise_meal    0.7315720105439731             0.9615384615384616            
tv              0.6276878820568141             0.9790209790209791            
wifi            0.6512488262910798             0.9895104895104895            


Predict

In [27]:
start_time = time.time()

pred = predict(text, model_path="model/train1", classifier="svm", feature="tfidf")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 0.03307509422302246 seconds ---
ac              0.9895104895104895            
air_panas       0.9895104895104895            
bau             0.9895104895104895            
general         0.9895104895104895            
kebersihan      0.9895104895104895            
linen           0.9895104895104895            
service         0.9895104895104895            
sunrise_meal    0.9895104895104895            
tv              0.9895104895104895            
wifi            0.9895104895104895            


#### Logistic Regression Classifier

In [28]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="tfidf", classifier="lr", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average Accuracy: 0.934965034965035
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.6252395032525132             0.951048951048951             
air_panas       0.5782552541974875             0.9405594405594405            
bau             0.5648059542796385             0.9265734265734266            
general         0.54263813478122               0.9125874125874126            
kebersihan      0.8744763061339712             0.8986013986013986            
linen           0.5937593375165408             0.8986013986013986            
service         0.8375841833879653             0.916083916083916             
sunrise_meal    0.5151994377086627             0.9440559440559441            
tv              0.6431978196684079             0.9790209790209791            
wifi            0.639856034798006              0.9825174825174825            


Predict

In [29]:
start_time = time.time()

pred = predict(text, model_path="model/train1", classifier="lr", feature="tfidf")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 0.028407812118530273 seconds ---
ac              0.9825174825174825            
air_panas       0.9825174825174825            
bau             0.9825174825174825            
general         0.9825174825174825            
kebersihan      0.9825174825174825            
linen           0.9825174825174825            
service         0.9825174825174825            
sunrise_meal    0.9825174825174825            
tv              0.9825174825174825            
wifi            0.9825174825174825            


### Train, Test and Save Classifier for BoW Vectorizer

#### Naive Bayes Classifier

In [18]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="bow", classifier="nb", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

Average Accuracy: 0.9031468531468532
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.6163453577764636             0.9440559440559441            
air_panas       0.5462501384734685             0.9265734265734266            
bau             0.5509052351157614             0.916083916083916             
general         0.47007575757575754            0.8846153846153846            
kebersihan      0.7713700755650752             0.8321678321678322            
linen           0.6464597075510863             0.8776223776223776            
service         0.7155265840442085             0.8496503496503497            
sunrise_meal    0.3887486243283485             0.9125874125874126            
tv              0.5333910533910534             0.9335664335664335            
wifi            0.5976338028169015             0.9545454545454546            


Predict

In [19]:
start_time = time.time()

pred = predict(text, model_path="model/train1", classifier="nb", feature="bow")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 0.01363825798034668 seconds ---
ac              0.9545454545454546            
air_panas       0.9545454545454546            
bau             0.9545454545454546            
general         0.9545454545454546            
kebersihan      0.9545454545454546            
linen           0.9545454545454546            
service         0.9545454545454546            
sunrise_meal    0.9545454545454546            
tv              0.9545454545454546            
wifi            0.9545454545454546            


### Support Vector Machine Classifier

In [20]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="bow", classifier="svm", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

Average Accuracy: 0.9374125874125874
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.8080550839464252             0.9755244755244755            
air_panas       0.6092746357512141             0.958041958041958             
bau             0.5724327918685286             0.9265734265734266            
general         0.5321584846850086             0.8951048951048951            
kebersihan      0.8555216802168021             0.8811188811188811            
linen           0.7802514719610327             0.8986013986013986            
service         0.8307967237527923             0.9090909090909091            
sunrise_meal    0.7283582089552239             0.9615384615384616            
tv              0.7999450448800146             0.9825174825174825            
wifi            0.6456230396350157             0.986013986013986             


Predict

In [21]:
start_time = time.time()

pred = predict(text, model_path="model/train1", classifier="svm", feature="bow")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 0.020662784576416016 seconds ---
ac              0.986013986013986             
air_panas       0.986013986013986             
bau             0.986013986013986             
general         0.986013986013986             
kebersihan      0.986013986013986             
linen           0.986013986013986             
service         0.986013986013986             
sunrise_meal    0.986013986013986             
tv              0.986013986013986             
wifi            0.986013986013986             


#### Logistic Regression Classifier

In [22]:
acc, categorical = train_and_test(data_train, data_valid, data_test, feature="bow", classifier="lr", save_path="model/train1")

print("Average Accuracy: {}".format(acc))
print("Categorical Score:")
print("{:<15} {:<30} {:<30}".format('Aspect', 'F1-score', 'Accuracy'))
for k, v in categorical.items():
    f1, acc = v.values()
    print ("{:<15} {:<30} {:<30}".format(k, f1, acc))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average Accuracy: 0.9367132867132867
Categorical Score:
Aspect          F1-score                       Accuracy                      
ac              0.8035339238425715             0.972027972027972             
air_panas       0.6036145426389329             0.9545454545454546            
bau             0.5692245622528848             0.9230769230769231            
general         0.5384277558190601             0.9090909090909091            
kebersihan      0.8420432840035211             0.8776223776223776            
linen           0.7550739594083247             0.8986013986013986            
service         0.8395714669726164             0.9125874125874126            
sunrise_meal    0.7178608515057112             0.9615384615384616            
tv              0.6200575301057969             0.9755244755244755            
wifi            0.639856034798006              0.9825174825174825            


Predict

In [23]:
start_time = time.time()

pred = predict(text, model_path="model/train1", classifier="lr", feature="bow")

print("--- Time Execute: %s seconds ---" % (time.time() - start_time))
for k, v in pred.items():
    print ("{:<15} {:<30}".format(k, acc))

--- Time Execute: 0.03075695037841797 seconds ---
ac              0.9825174825174825            
air_panas       0.9825174825174825            
bau             0.9825174825174825            
general         0.9825174825174825            
kebersihan      0.9825174825174825            
linen           0.9825174825174825            
service         0.9825174825174825            
sunrise_meal    0.9825174825174825            
tv              0.9825174825174825            
wifi            0.9825174825174825            
