In [27]:
import pandas as pd
import numpy as np

from scipy.sparse import vstack
from nltk import word_tokenize

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit

from gensim.models import FastText

In [50]:
df_train = pd.read_csv('dataset/train_preprocess.csv')

In [None]:
df_train.head()

In [51]:
colums = df_train.columns.to_list()
colums.remove('review')

In [11]:
for col in colums:
    count = df_train[col].value_counts()

    if not "neg_pos" in count:
        count['neg_pos'] = 0
        
    print("Columns: {}, neutral: {}, positive: {}, negative: {}, negative_positive: {}, total: {}".format(col, count['neut'], count['pos'], count['neg'], count['neg_pos'], count.sum()))

Columns: ac, neutral: 1814, positive: 51, negative: 417, negative_positive: 1, total: 2283
Columns: air_panas, neutral: 1922, positive: 26, negative: 335, negative_positive: 0, total: 2283
Columns: bau, neutral: 1911, positive: 12, negative: 360, negative_positive: 0, total: 2283
Columns: general, neutral: 2023, positive: 230, negative: 30, negative_positive: 0, total: 2283
Columns: kebersihan, neutral: 1350, positive: 205, negative: 722, negative_positive: 6, total: 2283
Columns: linen, neutral: 1613, positive: 63, negative: 606, negative_positive: 1, total: 2283
Columns: service, neutral: 1649, positive: 247, negative: 386, negative_positive: 1, total: 2283
Columns: sunrise_meal, neutral: 2108, positive: 75, negative: 100, negative_positive: 0, total: 2283
Columns: tv, neutral: 2075, positive: 13, negative: 195, negative_positive: 0, total: 2283
Columns: wifi, neutral: 1928, positive: 25, negative: 330, negative_positive: 0, total: 2283


In [38]:
def train_fasttext(xtrain):
    sentences = [word_tokenize(content.lower()) for content in xtrain]
    vectorizer = FastText(sentences, vector_size=300, window=3, min_count=1, workers=4, epochs=1000, sg=0, hs=0)
    vectorizer.save('model/test.ft')
    print('fasttext model saved at model/test.ft')

def norm_sent_vector(sentence, wv):
    vecs = [wv[word.lower()] for word in word_tokenize(sentence)] 
    norm_vecs = [vec / np.linalg.norm(vecs) for vec in vecs if np.linalg.norm(vecs) > 0]
    sent_vec = np.mean(norm_vecs, axis=0)
    return sent_vec

def hyperparam_tuning(xtrain, ytrain, xvalid, yvalid, classifier, param_grid):
    # combine train and valid
    x = vstack([xtrain, xvalid])
    y = ytrain + yvalid
    
    # create predefined split
    # -1 for all training and 0 for all validation
    ps = PredefinedSplit([-1] * len(ytrain) + [0] * len(yvalid))
    clf = GridSearchCV(classifier, param_grid, cv = ps)
    clf = clf.fit(x, y)
    return clf

def train_and_test(feature="bow", classifier="nb", aspect="ac", ft_path='model/test.ft'):
    xtrain = pd.read_csv('dataset/train_preprocess.csv')['review']
    xvalid = pd.read_csv('dataset/valid_preprocess.csv')['review']
    xtest = pd.read_csv('dataset/test_preprocess.csv')['review']

    # label_encoder = LabelEncoder()
    ytrain = list(pd.read_csv('dataset/train_preprocess.csv')[aspect])
    yvalid = list(pd.read_csv('dataset/valid_preprocess.csv')[aspect])
    ytest = list(pd.read_csv('dataset/test_preprocess.csv')[aspect])
    # ytest = label_encoder.fit_transform(pd.read_csv('dataset/test_preprocess.csv')[aspect])
    # yvalid = pd.read_csv('dataset/valid_preprocess.csv')[aspect]
    # ytest = pd.read_csv('dataset/test_preprocess.csv')[aspect]

    if feature == "bow":
        vectorizer = CountVectorizer()
    elif feature == "tfidf":
        vectorizer = TfidfVectorizer()
    elif feature == "fasttext":
        vectorizer = FastText.load(ft_path).wv
    else:
        raise Exception('Feature unknown. Use "bow" or "tfidf" or "fasttext"')

    # transform
    if feature == "bow" or feature == "tfidf":
        vectorizer.fit(xtrain)
        xtrain = vectorizer.transform(xtrain)
        xvalid = vectorizer.transform(xvalid)
        xtest = vectorizer.transform(xtest)
    elif feature == "fasttext":
        scaler = MinMaxScaler()
        xtrain = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in xtrain])
        xvalid = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in xvalid])
        xtest = scaler.fit_transform([norm_sent_vector(s, vectorizer) for s in xtest])

     # all classifiers
    classifier_model = {"nb" : MultinomialNB(),
                        "svm": SVC(),
                        "lr" : LogisticRegression(),
                       }
    # all params for grid-search
    param_grids = {"nb" : {"alpha": np.linspace(0.001,1,50)},
                   "svm": {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['rbf', 'linear']},
                   "lr" : {'C': np.linspace(0.001,10,100)},
                  }

    clf = hyperparam_tuning(xtrain, ytrain, xvalid, yvalid,
                            classifier=classifier_model[classifier],
                            param_grid=param_grids[classifier])

    if feature == "bow" or feature == "tfidf":
        pred = clf.predict(xtest.toarray())
    else:
        pred = clf.predict(xtest)
        
    f1score = f1_score(ytest, pred, average='macro')
    accuracy = accuracy_score(ytest, pred)
    
    return f1score, accuracy, clf, vectorizer

In [45]:
f1, acc, _, _ = train_and_test(feature="fasttext")

In [53]:
for col in colums:
    f1, acc, _, _ = train_and_test(feature="bow")
    print("col: {}, f1: {}, acc: {}".format(col, f1, acc))

col: ac, f1: 0.6163453577764636, acc: 0.9440559440559441
col: air_panas, f1: 0.6163453577764636, acc: 0.9440559440559441
col: bau, f1: 0.6163453577764636, acc: 0.9440559440559441
col: general, f1: 0.6163453577764636, acc: 0.9440559440559441
col: kebersihan, f1: 0.6163453577764636, acc: 0.9440559440559441
col: linen, f1: 0.6163453577764636, acc: 0.9440559440559441
col: service, f1: 0.6163453577764636, acc: 0.9440559440559441
col: sunrise_meal, f1: 0.6163453577764636, acc: 0.9440559440559441
col: tv, f1: 0.6163453577764636, acc: 0.9440559440559441
col: wifi, f1: 0.6163453577764636, acc: 0.9440559440559441
