In [1]:
import numpy as np
import string
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import datetime

In [2]:
import seaborn as sns 
import matplotlib.pyplot as plt

In [3]:
# Built once: maps newlines to spaces and deletes ASCII punctuation and digits.
_PREPROCESS_TABLE = str.maketrans('\n', ' ', string.punctuation + string.digits)


def preprocess(text):
    """Normalise a question string before vectorisation.

    Steps, applied in order:
      1. lower-case the text;
      2. replace hyphens with spaces so hyphenated words split into tokens;
      3. turn newlines into spaces and drop ASCII punctuation and digits.

    Parameters
    ----------
    text : str
        Raw question text.

    Returns
    -------
    str
        The normalised text.

    Raises
    ------
    AttributeError
        If ``text`` is not a string (e.g. a NaN from a missing CSV cell).
    """
    # Each step must operate on the result of the previous one; translating
    # the raw `text` would silently discard the lower-casing and the
    # hyphen-to-space replacement (hyphens would be deleted as punctuation).
    return text.lower().replace('-', ' ').translate(_PREPROCESS_TABLE)

In [4]:
# df_train["lang"].unique()

In [5]:
# Load the training split; later cells read its 'questions' and 'lang' columns.
df_train = pd.read_csv('../data/qald-7-train.csv')

In [6]:
# Token-count feature extractor with default settings; it is fitted on the
# training questions and reused unchanged to transform every test split.
vectorizer = CountVectorizer()

In [7]:
# Normalise every training question in place (overwrites the raw text column).
df_train['questions'] = df_train['questions'].map(preprocess)

In [8]:
# Targets: the language label attached to each training question.
y_train = df_train["lang"]
# Features: learn the vocabulary from the training questions and encode them
# as a document-term count matrix in one pass.
X_train = vectorizer.fit_transform(df_train["questions"])

In [9]:
# Multinomial naive Bayes with light additive smoothing (alpha) and a uniform
# class prior (fit_prior=False) instead of one estimated from the label counts.
naive_classifier = MultinomialNB(alpha=0.01, fit_prior=False)
# Kept as the cell's last expression so the fitted estimator is displayed.
naive_classifier.fit(X_train, y_train)

MultinomialNB(alpha=0.01, fit_prior=False)

In [10]:
# Evaluate the trained classifier on the English questions of each QALD test split.
target_lang = 'en'
l = [
    "../data/qald-3-test.csv",
    "../data/qald-4-test.csv",
    "../data/qald-5-test.csv",
    "../data/qald-6-test.csv",
    "../data/qald-7-test.csv",
    "../data/qald-8-test.csv",
]

scores = []  # accuracy per test file, in the same order as `l`
times = []   # mean prediction time per question (timedelta), same order as `l`

for d in l:
    # Build the evaluation frame as a single chain so every step returns a new
    # frame. Assigning a column on a boolean-filtered frame is chained
    # assignment and raises SettingWithCopyWarning on pandas versions without
    # copy-on-write.
    df_test = (
        pd.read_csv(d)
        .dropna(subset=['questions'])
        .loc[lambda frame: frame["lang"] == target_lang]
        .assign(questions=lambda frame: frame['questions'].map(preprocess))
    )
    # Fail with a clear message instead of an opaque downstream error when a
    # split has no rows for this language.
    if df_test.empty:
        raise ValueError(f"{d} has no '{target_lang}' questions to evaluate")

    X_test = vectorizer.transform(df_test["questions"])
    y_test = df_test["lang"]

    # Time only the predict call; loading and vectorising are excluded.
    started = datetime.datetime.now()
    predictions = naive_classifier.predict(X_test)
    elapsed = datetime.datetime.now() - started

    times.append(elapsed / len(df_test))
    scores.append(accuracy_score(y_test, predictions))

In [11]:
scores

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

In [12]:
times

[datetime.timedelta(microseconds=3),
 datetime.timedelta(microseconds=5),
 datetime.timedelta(microseconds=3),
 datetime.timedelta(microseconds=3),
 datetime.timedelta(microseconds=5),
 datetime.timedelta(microseconds=5)]

In [13]:
# Evaluate the trained classifier on the German questions of each QALD test split.
target_lang = 'de'
l = [
    "../data/qald-3-test.csv",
    "../data/qald-4-test.csv",
    "../data/qald-5-test.csv",
    "../data/qald-6-test.csv",
    "../data/qald-7-test.csv",
]

scores = []  # accuracy per test file, in the same order as `l`
times = []   # mean prediction time per question (timedelta), same order as `l`

for d in l:
    # Build the evaluation frame as a single chain so every step returns a new
    # frame. Assigning a column on a boolean-filtered frame is chained
    # assignment and raises SettingWithCopyWarning on pandas versions without
    # copy-on-write.
    df_test = (
        pd.read_csv(d)
        .dropna(subset=['questions'])
        .loc[lambda frame: frame["lang"] == target_lang]
        .assign(questions=lambda frame: frame['questions'].map(preprocess))
    )
    # Fail with a clear message instead of an opaque downstream error when a
    # split has no rows for this language.
    if df_test.empty:
        raise ValueError(f"{d} has no '{target_lang}' questions to evaluate")

    X_test = vectorizer.transform(df_test["questions"])
    y_test = df_test["lang"]

    # Time only the predict call; loading and vectorising are excluded.
    started = datetime.datetime.now()
    predictions = naive_classifier.predict(X_test)
    elapsed = datetime.datetime.now() - started

    times.append(elapsed / len(df_test))
    scores.append(accuracy_score(y_test, predictions))

In [14]:
scores

[1.0, 1.0, 1.0, 0.99, 1.0]

In [15]:
times

[datetime.timedelta(microseconds=5),
 datetime.timedelta(microseconds=6),
 datetime.timedelta(microseconds=4),
 datetime.timedelta(microseconds=2),
 datetime.timedelta(microseconds=4)]

In [16]:
# Evaluate the trained classifier on the French questions of each QALD test split.
target_lang = 'fr'
l = [
    "../data/qald-3-test.csv",
    "../data/qald-4-test.csv",
    "../data/qald-5-test.csv",
    "../data/qald-6-test.csv",
    "../data/qald-7-test.csv",
]

scores = []  # accuracy per test file, in the same order as `l`
times = []   # mean prediction time per question (timedelta), same order as `l`

for d in l:
    # Build the evaluation frame as a single chain so every step returns a new
    # frame. Assigning a column on a boolean-filtered frame is chained
    # assignment and raises SettingWithCopyWarning on pandas versions without
    # copy-on-write.
    df_test = (
        pd.read_csv(d)
        .dropna(subset=['questions'])
        .loc[lambda frame: frame["lang"] == target_lang]
        .assign(questions=lambda frame: frame['questions'].map(preprocess))
    )
    # Fail with a clear message instead of an opaque downstream error when a
    # split has no rows for this language.
    if df_test.empty:
        raise ValueError(f"{d} has no '{target_lang}' questions to evaluate")

    X_test = vectorizer.transform(df_test["questions"])
    y_test = df_test["lang"]

    # Time only the predict call; loading and vectorising are excluded.
    started = datetime.datetime.now()
    predictions = naive_classifier.predict(X_test)
    elapsed = datetime.datetime.now() - started

    times.append(elapsed / len(df_test))
    scores.append(accuracy_score(y_test, predictions))

In [17]:
scores

[0.98989898989899, 1.0, 1.0, 0.99, 0.9767441860465116]

In [18]:
times

[datetime.timedelta(microseconds=3),
 datetime.timedelta(microseconds=4),
 datetime.timedelta(microseconds=9),
 datetime.timedelta(microseconds=2),
 datetime.timedelta(microseconds=4)]