## Importing Packages

In [102]:
# Standard libraries
import string
import numpy as np
import pandas as pd 

# Preprocessing
from sklearn.feature_extraction.text import CountVectorizer   
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Building classification models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Model evaluation
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score

## Loading Datasets

In [103]:
# Read the labelled training frame and the unlabelled test frame from CSV
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("train_set.csv", "test_set.csv")
)

In [104]:
# Preview the first five rows of each frame:
# the training set is rendered explicitly, the test set is the cell's value
display(df_train.head(n=5))

df_test.head(n=5)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


## Data Preprocessing

In [105]:
# Deletion table for str.translate: every ASCII punctuation mark and every
# ASCII digit is mapped to "remove".
_CLEAN_TABLE = str.maketrans('', '', string.punctuation + string.digits)


def clean_text(text):
    """Normalise a piece of text before vectorisation.

    The text is lower-cased, then all ASCII punctuation characters
    (``string.punctuation``) and ASCII digits (``0-9``) are removed.
    Whitespace and non-ASCII letters are left untouched.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The lower-cased text with punctuation and digits stripped.
    """
    # A single translate pass replaces the former character filter plus
    # regex substitution, so the function no longer depends on the `re`
    # module (which this notebook never imports).
    return text.lower().translate(_CLEAN_TABLE)

# Store the normalised text next to the raw text in both frames
df_train['cleaned_text'] = df_train['text'].apply(clean_text)
df_test['cleaned_text'] = df_test['text'].apply(clean_text)

In [106]:
# Check the new cleaned_text column on the first five training rows
df_train.head(n=5)

Unnamed: 0,lang_id,text,cleaned_text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,the province of kwazulu-natal department of tr...,the province of kwazulunatal department of tra...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


## Modeling

In [107]:
# Features are the cleaned sentences, targets are the language codes
X, y = df_train['cleaned_text'], df_train['lang_id']

# Hold out 20% of the labelled rows for evaluation; the seed is fixed so the
# split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [108]:
# Logistic Regression on TF-IDF features, chained in a single pipeline
lr = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('cfl', LogisticRegression(max_iter=1000)),
    ]
)

# fit() returns the fitted pipeline, so predictions on the held-out split
# can be taken straight from the same expression
y_predict = lr.fit(X_train, y_train).predict(X_test)

In [109]:
# Count matrix of held-out labels against the pipeline's predictions
matrix = confusion_matrix(y_true=y_test, y_pred=y_predict)

In [110]:
# Plain-text dump of the confusion matrix; with scikit-learn's convention the
# rows are the true labels and the columns are the predicted labels
print(matrix)

[[577   1   0   0   0   5   0   0   0   0   0]
 [  0 614   0   0   0   1   0   0   0   0   0]
 [  0   0 576   0   0   0   0   0   0   1   6]
 [  0   0   0 621   0   0   1   0   0   3   0]
 [  0   0   1   0 617   0   0   0   0   0   0]
 [  0   0   0   0   0 582   0   0   0   0   2]
 [  0   0   0   0   0   1 597   0   0   0   0]
 [  0   0   0   0   0   0   0 561   0   0   0]
 [  0   0   0   0   0   0   0   0 634   0   0]
 [  0   0   1   0   0   0   0   0   0 607   1]
 [  0   1   5   0   0   1   0   0   0   3 580]]


In [111]:
# Classification report: text summary of the predictions on the held-out split
c_report = classification_report(y_true=y_test, y_pred=y_predict)

In [112]:
# The report is a preformatted string, so print() keeps its column alignment
print(c_report)

              precision    recall  f1-score   support

         afr       1.00      0.99      0.99       583
         eng       1.00      1.00      1.00       615
         nbl       0.99      0.99      0.99       583
         nso       1.00      0.99      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       0.99      1.00      0.99       584
         tsn       1.00      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       0.99      1.00      0.99       609
         zul       0.98      0.98      0.98       590

    accuracy                           0.99      6600
   macro avg       0.99      0.99      0.99      6600
weighted avg       0.99      0.99      0.99      6600



In [113]:
# Predict a language for every row of the unlabelled test file.
# The predictions get their own name: reusing `y_test` would overwrite the
# held-out labels from the train/validation split, so re-running the
# evaluation cells afterwards would compare against the wrong data.
test_predictions = lr.predict(df_test['cleaned_text'])

# Pair each prediction with the row identifier from the test file and write
# the result to disk without the DataFrame's own index column
kaggle_results = pd.DataFrame({'index': df_test['index'], 'lang_id': test_predictions})
kaggle_results.to_csv('submission.csv', index=False)

In [114]:
# Inspect the frame (index / lang_id columns) that was written to submission.csv
kaggle_results

Unnamed: 0,index,lang_id
0,1,ssw
1,2,nbl
2,3,ven
3,4,ssw
4,5,ssw
...,...,...
5677,5678,ssw
5678,5679,nso
5679,5680,sot
5680,5681,sot
