In [98]:
import pandas as pd
import numpy as np

import os
import pickle

from scipy.sparse import vstack
from nltk import word_tokenize

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit

from gensim.models import FastText

In [50]:
# Load the training CSV; the cells below use it to list the label columns
# and to print the per-column label distribution.
df_train = pd.read_csv('dataset/train_preprocess.csv')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [51]:
# Every column of the training frame except the raw 'review' text.
# list.index raises ValueError when 'review' is missing, like list.remove does.
colums = list(df_train.columns)
del colums[colums.index('review')]

In [101]:
# Show the column names that remain once 'review' has been removed.
colums

['ac',
 'air_panas',
 'bau',
 'general',
 'kebersihan',
 'linen',
 'service',
 'sunrise_meal',
 'tv',
 'wifi']

In [11]:
# Print, for every label column, how many training rows carry each label.
# Reindexing over the full label set fills absent labels with 0, so a column
# that lacks *any* of the labels (not only 'neg_pos') no longer raises KeyError.
for col in colums:
    raw_count = df_train[col].value_counts()
    count = raw_count.reindex(['neut', 'pos', 'neg', 'neg_pos'], fill_value=0)

    # The total is taken from the un-reindexed counts so that it still covers
    # every value present in the column.
    print("Columns: {}, neutral: {}, positive: {}, negative: {}, negative_positive: {}, total: {}".format(col, count['neut'], count['pos'], count['neg'], count['neg_pos'], raw_count.sum()))

Columns: ac, neutral: 1814, positive: 51, negative: 417, negative_positive: 1, total: 2283
Columns: air_panas, neutral: 1922, positive: 26, negative: 335, negative_positive: 0, total: 2283
Columns: bau, neutral: 1911, positive: 12, negative: 360, negative_positive: 0, total: 2283
Columns: general, neutral: 2023, positive: 230, negative: 30, negative_positive: 0, total: 2283
Columns: kebersihan, neutral: 1350, positive: 205, negative: 722, negative_positive: 6, total: 2283
Columns: linen, neutral: 1613, positive: 63, negative: 606, negative_positive: 1, total: 2283
Columns: service, neutral: 1649, positive: 247, negative: 386, negative_positive: 1, total: 2283
Columns: sunrise_meal, neutral: 2108, positive: 75, negative: 100, negative_positive: 0, total: 2283
Columns: tv, neutral: 2075, positive: 13, negative: 195, negative_positive: 0, total: 2283
Columns: wifi, neutral: 1928, positive: 25, negative: 330, negative_positive: 0, total: 2283


In [99]:
def train_fasttext(xtrain, save_path='model/test.ft'):
    """Train a FastText model on the given texts and save it to disk.

    Args:
        xtrain: iterable of raw text strings; each one is lower-cased and
            tokenised with ``word_tokenize`` before training.
        save_path: file the trained model is written to. Defaults to
            ``'model/test.ft'``, the path that was previously hard-coded.

    Returns:
        The trained ``FastText`` model (previously nothing was returned).
    """
    sentences = [word_tokenize(content.lower()) for content in xtrain]
    vectorizer = FastText(sentences, vector_size=300, window=3, min_count=1, workers=4, epochs=1000, sg=0, hs=0)

    # Saving used to fail with FileNotFoundError on a fresh checkout because
    # the target directory was never created.
    target_dir = os.path.dirname(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    vectorizer.save(save_path)
    print('fasttext model saved at {}'.format(save_path))
    return vectorizer

def norm_sent_vector(sentence, wv):
    """Embed a sentence as the mean of its L2-normalised word vectors.

    Args:
        sentence: raw text; tokenised with ``word_tokenize`` and each token is
            lower-cased before lookup.
        wv: word-vector lookup supporting ``wv[token]`` and exposing
            ``vector_size`` (e.g. the ``.wv`` attribute of a gensim model).

    Returns:
        A 1-D array of length ``wv.vector_size``. An all-zero vector is
        returned when the sentence has no tokens or only zero-length vectors,
        instead of the NaN the mean of an empty list would produce.
    """
    unit_vecs = []
    for word in word_tokenize(sentence):
        vec = wv[word.lower()]
        # Normalise each word vector by its OWN length. The previous code used
        # the norm of the whole word matrix, so every vector was divided by
        # the same scalar and none of them was actually unit-normalised.
        length = np.linalg.norm(vec)
        if length > 0:
            unit_vecs.append(vec / length)

    if not unit_vecs:
        return np.zeros(wv.vector_size)
    return np.mean(unit_vecs, axis=0)

def hyperparam_tuning(xtrain, ytrain, xvalid, yvalid, classifier, param_grid, scoring=None):
    """Grid-search ``classifier`` using the validation set as the single CV fold.

    Args:
        xtrain, xvalid: feature matrices (sparse or dense) with the same
            number of columns; they are stacked row-wise.
        ytrain, yvalid: label sequences matching the rows of ``xtrain`` and
            ``xvalid``. Any sequence works (list, ndarray, Series).
        classifier: estimator passed to ``GridSearchCV``.
        param_grid: parameter grid passed to ``GridSearchCV``.
        scoring: optional ``GridSearchCV`` scoring. ``None`` keeps the
            previous behaviour (the estimator's default score).

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # combine train and valid
    x = vstack([xtrain, xvalid])
    # Convert to lists before concatenating: ``+`` on ndarrays/Series would add
    # element-wise (or raise for string labels) instead of appending.
    y = list(ytrain) + list(yvalid)

    # create predefined split
    # -1 keeps a row in the training part, 0 puts it in the single validation fold
    test_fold = [-1] * len(ytrain) + [0] * len(yvalid)
    search = GridSearchCV(classifier, param_grid, cv=PredefinedSplit(test_fold), scoring=scoring)
    return search.fit(x, y)

def train_and_test(data_train, data_valid, data_test, feature="bow", classifier="nb", save_path=None, ft_path="model/test.ft"):
    """Train one grid-searched classifier per label column and score it on the test split.

    Args:
        data_train, data_valid, data_test: DataFrames with a 'review' text
            column; every other column of ``data_train`` is treated as a
            label column and must also exist in the other two frames.
        feature: "bow" (CountVectorizer), "tfidf" (TfidfVectorizer) or
            "fasttext" (sentence vectors from the model stored at ``ft_path``).
        classifier: "nb", "svm" or "lr".
        save_path: when given, ``(vectorizer, clf)`` is pickled for each label
            column to ``<save_path>/<feature>/<column>``.
        ft_path: path of the saved FastText model (only read for "fasttext").

    Returns:
        ``(average_acc, categorical)``: the mean test accuracy over all label
        columns and a dict ``{column: {'f1': macro_f1, 'acc': accuracy}}``.

    Raises:
        ValueError: if ``feature`` is not one of the supported names.
        KeyError: if ``classifier`` is not one of the supported names.
    """
    xtrain = data_train['review']
    xvalid = data_valid['review']
    xtest = data_test['review']

    colums = data_train.columns.to_list()
    colums.remove('review')

    if feature == "bow":
        vectorizer = CountVectorizer()
    elif feature == "tfidf":
        vectorizer = TfidfVectorizer()
    elif feature == "fasttext":
        vectorizer = FastText.load(ft_path).wv
    else:
        raise ValueError('Feature unknown. Use "bow" or "tfidf" or "fasttext"')

    # all classifiers
    classifier_model = {"nb" : MultinomialNB(),
                        "svm": SVC(),
                        "lr" : LogisticRegression(),
                    }
    # all params for grid-search
    param_grids = {"nb" : {"alpha": np.linspace(0.001,1,50)},
                "svm": {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['rbf', 'linear']},
                "lr" : {'C': np.linspace(0.001,10,100)},
                }
    # Resolve the classifier before any expensive feature extraction so that a
    # wrong name fails immediately.
    base_classifier = classifier_model[classifier]
    param_grid = param_grids[classifier]

    # transform
    if feature == "fasttext":
        # Fit the scaler on the training split ONLY and reuse it for the other
        # splits; re-fitting it per split (as before) leaks validation/test
        # statistics and puts each split on a different scale.
        scaler = MinMaxScaler()
        xtrain = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in xtrain])
        # Held-out values can fall outside the training range; clip them back
        # into [0, 1] because MultinomialNB rejects negative features.
        xvalid = np.clip(scaler.transform([norm_sent_vector(s, vectorizer) for s in xvalid]), 0.0, 1.0)
        xtest = np.clip(scaler.transform([norm_sent_vector(s, vectorizer) for s in xtest]), 0.0, 1.0)
        # NOTE(review): the fitted scaler is not part of the pickled
        # (vectorizer, clf) tuple below, so it is unavailable at inference time.
    else:
        xtrain = vectorizer.fit_transform(xtrain)
        xvalid = vectorizer.transform(xvalid)
        xtest = vectorizer.transform(xtest)

    categorical = {}
    accuracies = []
    for col in colums:
        ytrain = list(data_train[col])
        yvalid = list(data_valid[col])
        ytest = list(data_test[col])

        clf = hyperparam_tuning(xtrain, ytrain, xvalid, yvalid,
                                classifier=base_classifier,
                                param_grid=param_grid)

        # The search was fitted on a sparse stack, so the test matrix can be
        # passed as-is; densifying it only costs memory.
        pred = clf.predict(xtest)

        f1 = f1_score(ytest, pred, average='macro')
        acc = accuracy_score(ytest, pred)
        accuracies.append(acc)

        categorical[col] = {'f1': f1, 'acc': acc}

        if save_path is not None:
            filename = os.path.join(save_path, feature, col)
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, 'wb') as fout:
                pickle.dump((vectorizer, clf), fout)

    # Guard against a frame that has no label columns at all.
    average_acc = sum(accuracies) / len(accuracies) if accuracies else 0
    return average_acc, categorical

In [107]:
def predict(text, model_path, feature='bow', columns=None):
    """Predict the label of every aspect column for a single review text.

    Args:
        text: raw review string.
        model_path: root directory of the saved models; one pickle holding a
            ``(vectorizer, clf)`` tuple is read from
            ``<model_path>/<feature>/<column>`` for each column.
        feature: "bow", "tfidf" or "fasttext".
        columns: label columns to predict. Defaults to the ten aspect columns
            that were previously hard-coded.

    Returns:
        dict mapping each column name to its predicted label.

    Raises:
        ValueError: if ``feature`` is not supported (previously an unknown
            feature could silently yield an empty dict).
    """
    import warnings

    if feature not in ("bow", "tfidf", "fasttext"):
        raise ValueError('Feature unknown. Use "bow" or "tfidf" or "fasttext"')

    if columns is None:
        columns = ['ac', 'air_panas', 'bau', 'general', 'kebersihan', 'linen', 'service', 'sunrise_meal', 'tv', 'wifi']

    pred = {}
    for col in columns:
        # SECURITY: pickle.load can execute arbitrary code; only point
        # model_path at files you produced yourself.
        with open(os.path.join(model_path, feature, col), 'rb') as f:
            vectorizer, clf = pickle.load(f)

        if feature == "fasttext":
            # NOTE(review): the scaler fitted at training time is not stored in
            # the pickle, so a fresh MinMaxScaler is fitted on this one sample.
            # With a single row min == max for every feature, so the scaled
            # vector is all zeros and the prediction ignores the text.
            warnings.warn(
                "fasttext prediction rescales a single sample with a freshly "
                "fitted MinMaxScaler; the classifier input is all zeros",
                RuntimeWarning,
            )
            scaler = MinMaxScaler()
            x = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in [text]])
            pred[col] = clf.predict(x)[0]
        else:
            x = vectorizer.transform([text])
            pred[col] = clf.predict(x.toarray())[0]

    return pred

In [113]:
# Read the three splits, then train and evaluate the TF-IDF models,
# pickling one model per label column under model/train1.
data_train, data_valid, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/train_preprocess.csv",
        "dataset/valid_preprocess.csv",
        "dataset/test_preprocess.csv",
    )
)
acc, categorical = train_and_test(
    data_train,
    data_valid,
    data_test,
    feature="tfidf",
    save_path="model/train1",
)

In [111]:
# Per-column macro-F1 and accuracy returned by train_and_test.
categorical

{'ac': {'f1': 0.36887159533073927, 'acc': 0.8321678321678322},
 'air_panas': {'f1': 0.3075957313245449, 'acc': 0.8566433566433567},
 'bau': {'f1': 0.3069182389937107, 'acc': 0.8531468531468531},
 'general': {'f1': 0.3062381852551985, 'acc': 0.8496503496503497},
 'kebersihan': {'f1': 0.27294255621461916, 'acc': 0.5699300699300699},
 'linen': {'f1': 0.27191166321601107, 'acc': 0.6888111888111889},
 'service': {'f1': 0.27991886409736305, 'acc': 0.7237762237762237},
 'sunrise_meal': {'f1': 0.32, 'acc': 0.9230769230769231},
 'tv': {'f1': 0.31553100061387357, 'acc': 0.8986013986013986},
 'wifi': {'f1': 0.3109452736318408, 'acc': 0.8741258741258742}}

In [114]:
# Run the saved TF-IDF models from model/train1 on a single raw review.
predict("lumayan nyaman,tp kebersihan kmr mandi perlu ditingkatkan lg biar gk ada kuning2 di sudutnya lbh bgs", model_path="model/train1", feature="tfidf")

{'ac': 'neut',
 'air_panas': 'neut',
 'bau': 'neut',
 'general': 'neut',
 'kebersihan': 'neg',
 'linen': 'neut',
 'service': 'neut',
 'sunrise_meal': 'neut',
 'tv': 'neut',
 'wifi': 'neut'}