In [1]:
import numpy as np
import pandas as pd

# Regex
import re

from sklearn.preprocessing import LabelEncoder
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Models to use
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Scoring metrics
from sklearn.metrics import accuracy_score, f1_score

import matplotlib.pyplot as plt
plt.style.use('seaborn')

## Import and Explore

In [2]:
df = pd.read_csv("train_set.csv")

In [3]:
df.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [4]:
df.shape

(33000, 2)

In [5]:
unique_lang = df['lang_id'].unique()
print(list(unique_lang))
len(list(unique_lang))

['xho', 'eng', 'nso', 'ven', 'tsn', 'nbl', 'zul', 'ssw', 'tso', 'sot', 'afr']


11

In [6]:
# Categories are balanced
df['lang_id'].value_counts()

nso    3000
ven    3000
eng    3000
nbl    3000
xho    3000
tsn    3000
sot    3000
tso    3000
zul    3000
ssw    3000
afr    3000
Name: lang_id, dtype: int64

In [7]:
# Split the data between features and the label
X = df['text']
y = df['lang_id']

In [8]:
# Encode the labels to numbers
le = LabelEncoder()
y = le.fit_transform(y)

In [9]:
print(y)

[9 9 1 ... 1 9 4]


## Clean the Text

In [10]:
def preprocess(X):
    # Remove numbers and symbols from text
    X.replace(to_replace=r'[!@#$\(),"%^\*\.\+\*?\[\]\$\^\(\)\{\}\|\\/?:;~`0-9]*', value='', regex=True)
    # Change text to lowercase
    X.str.lower()
    return X

In [11]:
X = preprocess(X)

## Fit / Train on train/ test split data

In [12]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Implement a Bag of Words approach.
# Tfidf works better than count vectorizer due to weights being assigned to words. 
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

X_train = tfidf_vectorizer.fit_transform(X_train)
# Do NOT fit to the test set. ONLY TRANSFORM!
X_test = tfidf_vectorizer.transform(X_test)

In [14]:
# Check different models
names = [
    'Naive Bayes MN',
    'Naive Bayes Bernoulli',
    'Logistic Regression',  
    'Linear SVM',
    'Stochastic Gradient Descent'
]

In [15]:
classifiers = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(max_iter=1000), 
    SVC(),
    SGDClassifier()
]

In [16]:
scores = {}
for name, clf in zip(names, classifiers):
    print(f'Fitting {name} on training set...')
    clf.fit(X_train, y_train)
    print(f'Predicting {name} on training set...')
    y_pred = clf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("F1_score is :", f1)
    print()
    scores[name] = f1

Fitting Naive Bayes MN on training set...
Predicting Naive Bayes MN on training set...
F1_score is : 0.9980299054262277

Fitting Naive Bayes Bernoulli on training set...
Predicting Naive Bayes Bernoulli on training set...
F1_score is : 0.9989392771541917

Fitting Logistic Regression on training set...
Predicting Logistic Regression on training set...
F1_score is : 0.9946996764597766

Fitting Linear SVM on training set...
Predicting Linear SVM on training set...
F1_score is : 0.9951627746828265

Fitting Stochastic Gradient Descent on training set...
Predicting Stochastic Gradient Descent on training set...
F1_score is : 0.9965130683137439



In [17]:
score_df = pd.DataFrame([scores])
score_df

Unnamed: 0,Naive Bayes MN,Naive Bayes Bernoulli,Logistic Regression,Linear SVM,Stochastic Gradient Descent
0,0.99803,0.998939,0.9947,0.995163,0.996513


## Fit / Train on all the training data

In [18]:
# Fit and transform all the availble data in the training csv file
tfidf_vectorizer_all = TfidfVectorizer(use_idf=True)

X = tfidf_vectorizer_all.fit_transform(X) 

In [20]:
# Fit Multinomial Naive Bayes to all the training data
naive_bayes_MN_all = MultinomialNB()
naive_bayes_MN_all.fit(X, y)

MultinomialNB()

## Submission

In [21]:
# Create a function to perform the necesary steps for submission
def submission(vectorizer, model, encoder):
    df_test = pd.read_csv("test_set.csv")
    # Split the data
    X_all_test = df_test['text']
    index = df_test['index']
    # Preprocess the data
    X_all_test = preprocess(X_all_test)
    # Transform the test set
    X_all_test = vectorizer.transform(X_all_test).toarray()
    # Predict with the model
    y_prediction_all_test = model.predict(X_all_test)
    # Reverse the encoder transform
    y_prediction_all_test = encoder.inverse_transform(y_prediction_all_test)
    # Convert to a dataframe
    df_predictions = pd.DataFrame(y_prediction_all_test, columns=['lang_id'])
    # Join the index and the predictions
    output = pd.DataFrame({"index":df_test['index']})
    submission = output.join(df_predictions)
    # Create the CSV to Submit
    submission.to_csv("lang_submission.csv", index=False)
    # Return the submission df
    return submission

In [22]:
submission(tfidf_vectorizer_all, naive_bayes_MN_all, le)

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot
