In [None]:
import pandas as pd
import os

# Directory holding one JSON-lines file per locale.
DATA_DIR = "dataset/jsonl/"
# Only the English file is loaded here. To load every locale instead, set
# JSONL_FILES = sorted(os.listdir(DATA_DIR)).
JSONL_FILES = ["en-US.jsonl"]

# Read each file once and concatenate in a single call; growing a frame with
# pd.concat inside a loop re-copies all previously loaded rows every iteration.
# ignore_index=True keeps row labels unique when several files are combined.
df = pd.concat(
    [pd.read_json(DATA_DIR + file_name, lines=True) for file_name in JSONL_FILES],
    ignore_index=True,
)
# Keep only the part of the locale code before the first "-" ("en-US" -> "en").
df["locale"] = df["locale"].str.split("-").str[0]

In [None]:
# Execute the helper notebooks with `-i`, i.e. inside this kernel's namespace,
# so the names they define become available to the cells that follow.
# NOTE(review): remove_punctuation, lowercase and tokenize (used in a later cell)
# are presumably defined by the preprocess notebooks — confirm.
%run -i "preprocess_lang.ipynb"
%run -i "preprocess_nolang.ipynb"
%run -i "evaluation.ipynb"

In [None]:
from toolz.functoolz import pipe

# Preprocessing stages; pipe() feeds the frame through them from left to right,
# each stage receiving the previous stage's return value.
params = [remove_punctuation, lowercase, tokenize]

df = pipe(df, *params)


In [None]:
import numpy as np

def tokens_to_mean_vector(embeddings, tokens):
    """Average the embedding vectors of the in-vocabulary tokens.

    Parameters
    ----------
    embeddings
        Object exposing ``get_vector(token)`` that raises ``KeyError`` for
        unknown tokens. If it also exposes ``vector_size``, that length is
        used for the fallback vector described below.
    tokens
        Iterable of tokens to look up.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors that were found. When no token is in
        the vocabulary (including an empty ``tokens``), a zero vector of
        length ``embeddings.vector_size`` is returned so every result has the
        same shape; if ``vector_size`` is unavailable, NaN is returned.
    """
    vectors = []
    for token in tokens:
        try:
            vectors.append(embeddings.get_vector(token))
        except KeyError:
            # Out-of-vocabulary tokens do not contribute to the mean.
            continue

    if not vectors:
        # np.mean([]) would emit a RuntimeWarning and yield a scalar NaN,
        # which cannot be stacked with the real vectors later on.
        size = getattr(embeddings, "vector_size", None)
        return np.zeros(size) if size is not None else np.nan

    return np.mean(vectors, axis=0)


In [None]:
from gensim.models import Word2Vec

# Raw string: the Windows path contains backslashes, and "\m" / "\e" are not
# valid escape sequences in a normal string literal (Python warns about them).
# The resulting path text is unchanged.
MODEL_PATH = r'D:\models\en.model'

model = Word2Vec.load(MODEL_PATH)
print("Loaded en model.")

In [None]:
# Replace each utterance's token list with the mean embedding vector of its tokens.
# NOTE(review): the same `model` is used for every locale value; that is only
# appropriate while the frame holds a single language — confirm.
for lang in df['locale'].unique():
    print(f"Adding {lang} embeddings to dataframe.")
    in_lang = df['locale'] == lang
    # Transform all rows of this locale in one pass instead of writing cell by cell.
    df.loc[in_lang, 'utt'] = df.loc[in_lang, 'utt'].apply(
        lambda tokens: tokens_to_mean_vector(model.wv, tokens)
    )
    # The model is not deleted here, so the message no longer claims it is.
    print(f"Added {lang} embeddings to dataframe.")


In [None]:
from sklearn.manifold import TSNE 

# Number of dimensions the utterance vectors are reduced to.
NUM_DIM = 50

# Names of the per-dimension feature columns fed to the classifiers.
input_cols = [f'utt_{i}' for i in range(NUM_DIM)]
# method="exact" is required here: scikit-learn's default "barnes_hut" solver
# only supports n_components < 4 and raises a ValueError for larger values.
# The exact solver scales quadratically with the number of rows.
# NOTE(review): perplexity=2 is unusually low — confirm it is intentional.
tsne = TSNE(n_components=NUM_DIM, random_state=0, perplexity=2, method="exact")


In [None]:
# df['utt'] holds one vector per row, so `.values` is a 1-D object array;
# fit_transform needs a 2-D numeric matrix, hence the explicit stack.
# Assumes every cell holds a vector of the same length — TODO confirm.
utt_matrix = np.stack(df['utt'].to_numpy())
utt_reduced = tsne.fit_transform(utt_matrix)
# Store one reduced vector per row; a 2-D array cannot be assigned to a single column.
df['utt'] = list(utt_reduced)

In [None]:

# Spread each utterance vector over one scalar column per dimension.
# The column count is taken from the vectors themselves rather than a fixed
# 300, so it stays correct whatever length the vectors currently have.
utt_matrix = np.stack(df['utt'].to_numpy())
for i in range(utt_matrix.shape[1]):
    df[f'utt_{i}'] = utt_matrix[:, i]

In [None]:
# Preview the first rows of the frame to check the newly added utt_* columns.
df.head()

In [None]:
# Column(s) holding the label the classifiers predict.
output_cols = ["intent"]

# Averaging modes; presumably the `average` options of sklearn's f1_score.
# Not referenced by the cells visible here.
averages = [
    None,
    "macro",
    "weighted",
    "micro",
    "samples",
]

In [None]:
# Rows whose partition is 'test' are held out; every other partition is used for fitting.
is_test = df['partition'] == 'test'

training_inputs = df.loc[~is_test, input_cols].values
testing_inputs = df.loc[is_test, input_cols].values
# output_cols names a single label column, so flatten the (n, 1) selection to
# the 1-D shape scikit-learn expects for y (a column vector triggers a
# DataConversionWarning on fit).
training_classes = df.loc[~is_test, output_cols].values.ravel()
testing_classes = df.loc[is_test, output_cols].values.ravel()

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

nb = MultinomialNB(alpha=0.1)

# NOTE(review): MultinomialNB rejects negative feature values, and embedding /
# t-SNE coordinates are normally signed — confirm the inputs are non-negative,
# or rescale them / use a Gaussian model instead.
nb.fit(training_inputs, training_classes)
score = nb.score(testing_inputs, testing_classes)
# Keep the result under its own name: assigning it to `f1_score` would replace
# the imported metric function and break every later call to it.
macro_f1 = f1_score(testing_classes, nb.predict(testing_inputs), average="macro")

print(f"Score: {score}")
print(f"F1 Score: {macro_f1}")

In [None]:
from xgboost import XGBClassifier

# Re-import so `f1_score` is the metric function even if an earlier cell
# rebound the name to a float result.
from sklearn.metrics import f1_score

# NOTE(review): per the XGBoost parameter docs, subsample, min_child_weight,
# max_depth, max_delta_step and gamma are tree-booster parameters; with
# booster='gblinear' they are presumably ignored — confirm the intended booster.
# NOTE(review): recent XGBoost versions expect integer-encoded class labels;
# verify the dtype of the 'intent' column.
xgboost = XGBClassifier(
    subsample=0.7,
    n_estimators=200,
    min_child_weight=3,
    max_depth=3,
    max_delta_step=1,
    learning_rate=0.05,
    gamma=0.5,
    booster='gblinear',
)

xgboost.fit(training_inputs, training_classes)
score = xgboost.score(testing_inputs, testing_classes)
# Keep the result under its own name so the `f1_score` function stays callable.
macro_f1 = f1_score(testing_classes, xgboost.predict(testing_inputs), average="macro")

print(f"Score: {score}")
print(f"F1 Score: {macro_f1}")

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Re-import so `f1_score` is the metric function even if an earlier cell
# rebound the name to a float result.
from sklearn.metrics import f1_score

# Distance-weighted 5-nearest-neighbours with the Euclidean (Minkowski p=2) metric.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    algorithm='auto',
    leaf_size=30,
    p=2,
    metric='minkowski',
    metric_params=None,
)

knn.fit(training_inputs, training_classes)
score = knn.score(testing_inputs, testing_classes)
# Keep the result under its own name so the `f1_score` function stays callable.
macro_f1 = f1_score(testing_classes, knn.predict(testing_inputs), average="macro")

print(f"Score: {score}")
print(f"F1 Score: {macro_f1}")