# SemEval 2016 Slot1 Task: GloVe Average


This notebook builds a baseline for slot 1 (aspect category detection). Each sentence is represented by the average of the GloVe vectors of its words.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# SemEval-2016 train/test CSVs, resolved relative to the parent of the working directory.
train_path = Path.cwd().parent / 'datasets/semeval-2016/train.csv'
test_path = Path.cwd().parent / 'datasets/semeval-2016/test.csv'

In [3]:
# Read the train and test CSVs into DataFrames (one read per split, so a
# failure on the second file still leaves `data_train` available).
data_train = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)

In [4]:
def df2data(df):
    """Group a one-row-per-token dataframe into per-sentence lists.

    Args:
        df (pandas.DataFrame): must contain the columns 'Sentence #',
            'Word' and 'Tag'. Rows sharing a 'Sentence #' value belong to
            the same sentence.

    Returns:
        tuple[list[list], list[list]]: ``(data, label)`` where ``data[i]`` is
        the list of 'Word' values and ``label[i]`` the list of 'Tag' values of
        the i-th group. Groups come out in sorted 'Sentence #' order (the
        pandas ``groupby`` default); rows keep their original order inside a
        group. An empty dataframe yields ``([], [])``.
    """
    data, label = [], []
    # Iterating the groups directly (instead of groupby(...).apply) avoids the
    # pandas deprecation around applying over grouping columns and behaves
    # sensibly when there are no groups at all.
    for _, sentence_rows in df.groupby("Sentence #"):
        data.append(sentence_rows["Word"].values.tolist())
        label.append(sentence_rows["Tag"].values.tolist())

    return data, label

In [5]:
# Keep only the token lists; the second return value (tags) is discarded here.
# Indexing with [0] rather than unpacking into `_` avoids rebinding IPython's
# `_` (last-output) variable for the rest of the session.
x_train_words = df2data(data_train)[0]
x_test_words = df2data(data_test)[0]

In [6]:
# Preview the first ten tokens of the first training sentence.
x_train_words[0][:10]

['judging',
 'from',
 'previous',
 'posts',
 'this',
 'used',
 'to',
 'be',
 'a',
 'good']

### embedding 

In [7]:
def load_glove(file, dim=None):
    """Load GloVe vectors from a text file into a dict of numpy arrays.

    Each non-blank line is expected to be ``<token> <v1> ... <v_dim>`` with
    single-space separators. The line is split from the right, so a token
    that itself contains spaces is kept intact as the key instead of breaking
    the float conversion.

    Args:
        file (str | os.PathLike): path to a GloVe text file (UTF-8).
        dim (int | None): number of vector components per line. When None it
            is inferred from the first non-blank line as ``fields - 1``,
            which assumes that line's token contains no spaces; pass ``dim``
            explicitly if that may not hold.

    Returns:
        dict: maps each token (str) to a 1-D float numpy array of length
        ``dim``. If a token appears more than once, the last line wins.

    Raises:
        ValueError: if a line has fewer than ``dim`` components, or a
            component cannot be parsed as a float.
    """
    model = {}
    with open(file, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            text = raw_line.rstrip("\n")
            if not text.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            if dim is None:
                dim = len(text.split(' ')) - 1

            # Split from the right: everything left of the last `dim` fields
            # is the token, even when the token contains spaces.
            fields = text.rsplit(' ', dim)
            if len(fields) != dim + 1:
                raise ValueError(
                    "line {}: expected a token followed by {} values".format(line_no, dim))

            word = fields[0]
            model[word] = np.array([float(val) for val in fields[1:]])

    return model

In [10]:
# Path to the pretrained GloVe text file, relative to the working directory.
EMBEDDING_PATH = '../embedding_weights/glove.840B.300d.txt'

In [None]:
# Parse the GloVe file into a {word: vector} dict used by the cells below.
embeddings = load_glove(EMBEDDING_PATH)

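No padding is needed: every sentence is reduced to the average of its word vectors, so all sentences end up with the same vector size regardless of length.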

In [None]:
def words2vector(x_words, vector_size=200, embedding_lookup=None):
    """Represent each tokenised sentence by the mean of its word vectors.

    Args:
        x_words (list[list[str]]): sentences, each given as a list of tokens.
        vector_size (int): length of the all-zero fallback vector, used only
            when the size cannot be inferred because no token of any sentence
            is present in the lookup.
        embedding_lookup (dict | None): mapping token -> 1-D numpy vector.
            When None, the notebook-level ``embeddings`` dict is used.

    Returns:
        numpy.ndarray: one row per sentence, in input order. A sentence with
        no known token gets a zero vector whose length matches the other rows
        (previously it was always ``vector_size`` long, which produced a
        ragged array whenever ``vector_size`` differed from the real
        embedding dimension).

    Side effects:
        Prints the number of sentences without any matching token, if any.
    """
    if embedding_lookup is None:
        embedding_lookup = embeddings

    sentence_vectors = []
    unmatched_rows = []
    inferred_size = None

    for row, sent in enumerate(x_words):
        matched = [vec for vec in map(embedding_lookup.get, sent) if vec is not None]
        if matched:
            mean_vector = np.mean(matched, axis=0)
            if inferred_size is None:
                inferred_size = mean_vector.shape[0]
            sentence_vectors.append(mean_vector)
        else:
            # Placeholder; filled in below once the true vector size is known.
            sentence_vectors.append(None)
            unmatched_rows.append(row)

    if unmatched_rows:
        fallback_size = vector_size if inferred_size is None else inferred_size
        for row in unmatched_rows:
            sentence_vectors[row] = np.zeros(fallback_size)
        print("{} sentences does not match any pretrained vector.".format(len(unmatched_rows)))

    return np.array(sentence_vectors)

In [None]:
# Use the real embedding dimension for the zero-vector fallback. The function's
# default (200) does not match the 300-d GloVe file configured in
# EMBEDDING_PATH and would yield rows of different lengths.
embedding_dim = len(next(iter(embeddings.values())))
x_train = words2vector(x_train_words, vector_size=embedding_dim)
x_test = words2vector(x_test_words, vector_size=embedding_dim)

In [None]:
# Sanity check: shapes of the train and test feature matrices, one per line.
print(x_train.shape, x_test.shape, sep="\n")

### label 

Read the labels from the CSV files. This is a multiclass classification task.

In [None]:
# Slot1 label CSVs for the train and test splits.
# NOTE(review): this rebinds `train_path` / `test_path`, which earlier pointed
# at the sentence CSVs; re-running the data-loading cell after this one would
# read the label files instead.
train_path = Path.cwd().parent / 'datasets/semeval-2016/slot1/train_label_df.csv'
test_path = Path.cwd().parent / 'datasets/semeval-2016/slot1/test_label_df.csv'

In [None]:
# Load the label tables; at this point `train_path` / `test_path` refer to the
# slot1 label CSVs assigned in the previous cell.
y_train_df = pd.read_csv(train_path)
y_test_df = pd.read_csv(test_path)

In [None]:
# Preview the first three rows of the training labels.
y_train_df.head(3)

# Model construction

In [None]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

%matplotlib inline

# learning curve function
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

def plot_learning_curve(
        estimator, title, X, y, ylim=None, cv=None,
        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5), verbose=0):
    """Plot training and cross-validation scores against training-set size.

    Args:
        estimator: estimator handed to ``learning_curve``.
        title (str): figure title.
        X: feature data handed to ``learning_curve``.
        y: targets handed to ``learning_curve``.
        ylim (tuple | None): if given, unpacked into ``plt.ylim``.
        cv, n_jobs, train_sizes, verbose: forwarded unchanged to
            ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure as the current
        figure. Each curve is the mean over axis 1 of the score array
        returned by ``learning_curve``; the shaded band is that mean plus or
        minus one standard deviation.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)

    # One entry per curve: (mean, std, colour, legend label).
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    plt.grid()

    # Draw both bands first, then both lines, keeping the original call order.
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, legend_label in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=legend_label)

    plt.legend(loc="best")
    return plt

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit


# Five shuffled splits holding out 20% each, seeded for repeatability.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
ovr = OneVsRestClassifier(LogisticRegression())

plot_learning_curve(
    estimator=ovr,
    title="Slot1 baseline learning curve ",
    X=x_train,
    y=y_train_df,
    ylim=(0.0, 1.01),
    cv=cv,
    n_jobs=1,
    verbose=4,
)

plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


# Fit a fresh one-vs-rest logistic regression on the full training set.
ovr = OneVsRestClassifier(LogisticRegression()).fit(x_train, y_train_df)

# Micro-averaged F1 of the test predictions (shown as the cell output).
predicted = ovr.predict(x_test)
f1_score(y_test_df, predicted, average="micro")