In [None]:
import pandas as pd
import os

from toolz.functoolz import pipe
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier


# Load every JSON-lines file under dataset/jsonl/ into a single dataframe.
#
# The per-file frames are collected first and concatenated once at the end:
# calling pd.concat inside the loop copies all previously loaded rows again on
# every iteration.
# File names are sorted because os.listdir returns entries in arbitrary order,
# which would otherwise make the row order differ between runs/machines.
frames = []
for json_file in sorted(os.listdir("dataset/jsonl/")):
    frames.append(
        pd.read_json(os.path.join("dataset/jsonl/", json_file), lines=True)
    )
    print(f"Added {json_file} to dataframe.")

# pd.concat raises ValueError on an empty list, so fall back to an empty
# dataframe when the directory contains no files.
# Row labels are kept exactly as read from each file (no ignore_index).
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Execute the helper notebooks inside this kernel's own namespace (-i), so the
# names they define are available to the cells below and they can see the
# variables already defined here.
# NOTE(review): the preprocessing steps, encode_labels and train_and_use_model
# used later are not defined in this notebook -- presumably they come from
# these three notebooks; confirm.
%run -i "preprocess_nolang.ipynb"
%run -i "preprocess_lang.ipynb"
%run -i "evaluation.ipynb"

In [None]:
# Keep only the part of each locale code that precedes the first "-".
df["locale"] = df["locale"].map(lambda code: code.split("-")[0])

# Preprocessing steps; pipe feeds the dataframe through them left to right,
# i.e. apply_stemming(tokenize(lowercase(remove_punctuation(df)))).
# NOTE(review): the step functions are defined outside this notebook.
params = [remove_punctuation, lowercase, tokenize, apply_stemming]
df = pipe(df, *params)

# Join every "utt" entry back into one space-separated string
# (presumably each entry is a list of tokens at this point -- confirm).
df["utt"] = df["utt"].map(" ".join)

# encode_labels hands back the updated dataframe together with the encoder.
df, encoder = encode_labels(df)

print(f"Finished preprocessing dataset.\n\n")

In [None]:
# Fit the TF-IDF vectorizer on the "utt" text of every row that is not in the
# "test" partition, so the held-out rows do not take part in the fit.
vectorizer = TfidfVectorizer()
vectorizer.fit(df.loc[df["partition"] != "test", "utt"].to_numpy())

In [None]:
# One name per term in the fitted TF-IDF vocabulary, prefixed with "__"
# (presumably to keep them apart from the dataframe's own columns -- confirm).
input_cols = [f"__{term}" for term in vectorizer.get_feature_names_out()]

# Name of the target column.
output_cols = ["intent"]

# Averaging modes; presumably consumed by the metric code in the evaluation
# notebook -- confirm.
# NOTE(review): scikit-learn accepts "samples" averaging only for multilabel
# targets; check that the evaluation code copes with it.
averages = [None, "macro", "weighted", "micro", "samples"]

In [None]:
# Split on the "partition" column: rows labelled "test" are held out, every
# other row is used for training.
training_df = df.loc[df["partition"].ne("test")]
testing_df = df.loc[df["partition"].eq("test")]

In [None]:
# Renumber both partitions from 0 and discard the old row labels.
# Reassigning (instead of inplace=True) gives each partition a fresh dataframe
# rather than mutating the object that was obtained by filtering df, and makes
# the cell safe to re-run.
training_df = training_df.reset_index(drop=True)
testing_df = testing_df.reset_index(drop=True)

In [None]:
# Map every locale present in the test split to the index labels of its rows
# (presumably used for per-language evaluation -- confirm).
indices = {}
for lang in testing_df["locale"].unique():
    indices[lang] = testing_df.index[testing_df["locale"].eq(lang)].to_numpy()

In [None]:
 # Multinomial naive Bayes with alpha (its additive smoothing parameter) at 0.5.
 # The "model__" prefix presumably targets a pipeline step named "model" that
 # train_and_use_model builds -- confirm in the evaluation notebook.
 # NOTE(review): this cell carries a stray one-space indent; IPython strips it,
 # but a plain-script export would not compile -- dedent when convenient.
 nb = train_and_use_model(MultinomialNB(), {"model__alpha": 0.5}, n_iter=1)

In [None]:
# XGBoost classifier with depth-3 trees and 100 estimators.
# The "model__" prefix presumably targets a pipeline step named "model" that
# train_and_use_model builds -- confirm in the evaluation notebook.
xgboost = train_and_use_model(
    XGBClassifier(),
    {"model__max_depth": 3, "model__n_estimators": 100},
    n_iter=1,
)

In [None]:
# Linear classifier trained with stochastic gradient descent.
# SGDClassifier shuffles the training data (shuffle=True by default), so with
# the default random_state=None its results change from run to run; the seed
# is pinned to make this cell reproducible.
# The alpha / max_iter / tol values below equal scikit-learn's documented
# defaults and are spelled out to make the configuration explicit.
# The "model__" prefix presumably targets a pipeline step named "model" that
# train_and_use_model builds -- confirm in the evaluation notebook.
sgd = train_and_use_model(
    SGDClassifier(random_state=42),
    {
        "model__alpha": 0.0001,
        "model__max_iter": 1000,
        "model__tol": 1e-3,
    },
    n_iter=1,
)