In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter("ignore")
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

## Data 

#### 1) Loading Data

In [2]:
train = pd.read_csv('../input/data-sa-language/train_set.csv') 
test = pd.read_csv('../input/data-sa-language/test_set.csv') 
sample_submission = pd.read_csv('../input/data-sa-language/sample_submission (1).csv') 

In [3]:
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [4]:
train['lang_id'].value_counts()

xho    3000
eng    3000
nso    3000
ven    3000
tsn    3000
nbl    3000
zul    3000
ssw    3000
tso    3000
sot    3000
afr    3000
Name: lang_id, dtype: int64

#### 2) Data Seperation (Predictors, Target) for the Training Set

In [5]:
X = train['text']
y = train['lang_id']

#### 3) Label Encode the Target 

In [6]:
la = LabelEncoder()
y = la.fit_transform(y)

## Preprocessing of the Text 

#### 1) Cleaning

In [7]:
data_list = []

for text in X:
       # symbols + number removal
        text = re.sub(r'[!@#$(),n"%^*?:;~`0-9]', ' ', text)
        text = re.sub(r'[[]]', ' ', text)
        # lowercase
        text = text.lower()
        # appending to data_list
        data_list.append(text)

In [8]:
data_list[0:2]

['umgaqo-siseko we za amalu giselelo kumaziko axhasa ulawulo lwesi i zi ku ye  okuthath i xaxheba kwabafazi ezi ziquka phakathi kwezi ye zazo ikomisho i yokuli ga a  gokwesi i ikomisho i yamalu gelo olu tu lomza tsi afrika',
 'i-dha iya kuba  obulumko bokubeka umsebe zi  aphi  a kwisebe  gokusekwe kwiimfu o zokusebe za zalo emva kokubo a a  omsebe zi ku ye oka ye ima ya o yakhe ukuba ula dulo lom tu o jalo alufa eleka ga i-dha mayibize u cedo olufa elekileyo elu gelwe i layo']

#### 2) Create Bag of Words (Count Vectorize)

In [9]:
vec = CountVectorizer(max_features = 3128) #Important, use a fixed max_features in order to align with the test data.
X = vec.fit_transform(data_list).toarray()
X.shape 

(33000, 3128)

## Model Training and Evaluation

#### 1) Train Test Split

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [11]:
x_train.shape

(26400, 3128)

In [12]:
y_train.shape

(26400,)

In [13]:
x_test.shape

(6600, 3128)

In [14]:
y_test.shape

(6600,)

#### 2) Model Fit and Predict

In [15]:
model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB()

In [16]:
y_pred = model.predict(x_test)

#### 3) Model Evaluation

In [17]:
evalute = metrics.classification_report(y_test, y_pred)

In [18]:
print(evalute)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       609
           1       1.00      1.00      1.00       597
           2       0.98      0.99      0.99       559
           3       1.00      1.00      1.00       638
           4       1.00      1.00      1.00       571
           5       1.00      1.00      1.00       582
           6       1.00      1.00      1.00       603
           7       1.00      1.00      1.00       609
           8       1.00      1.00      1.00       604
           9       0.99      0.99      0.99       607
          10       0.99      0.98      0.99       621

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



## Prepare for Kaggle

## Data 

#### 1) Data Seperation (Predictors, Target) for the Training Set

In [19]:
Xtest = test['text']

## Preprocessing of the Text 

#### 1) Cleaning

In [20]:
data_list2 = []

for text in Xtest:
       # symbols + number removal
        text = re.sub(r'[!@#$(),n"%^*?:;~`0-9]', ' ', text)
        text = re.sub(r'[[]]', ' ', text)
        # lowercase
        text = text.lower()
        # appending to data_list
        data_list2.append(text)

In [21]:
data_list2[0:2]

['mmasepala  fa maemo a a kgethegile g a letlelela kgato eo.',
 'uzakwaziswa  gokufa eleko  aku gafu eka emi ye imitlolo e gezelelako ukuqedelela ukutloliswa kwesibawo sakho.']

#### 2) Create Bag of Words (Count Vectorize)

In [22]:
vec2 = vec.transform(data_list2)
X_Test = vec2.toarray()
X_Test.shape

(5682, 3128)

In [23]:
fin = model.predict(X_Test)

In [24]:
fin

array([6, 2, 8, ..., 4, 4, 2])

In [25]:
results = test.copy()

In [26]:
results.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [27]:
results['lang_id']=pd.Series(fin)

In [28]:
results.head()

Unnamed: 0,index,text,lang_id
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...",6
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,2
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,8
3,4,Kube inja nelikati betingevakala kutsi titsini...,5
4,5,Winste op buitelandse valuta.,0


In [29]:
results['lang_id'] = la.inverse_transform(results['lang_id'])

In [30]:
results.head()

Unnamed: 0,index,text,lang_id
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...",tsn
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,nbl
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,ven
3,4,Kube inja nelikati betingevakala kutsi titsini...,ssw
4,5,Winste op buitelandse valuta.,afr


In [31]:
results = results.drop(['text'], axis=1)

In [32]:
results

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot


In [33]:
sample_submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


In [34]:
results.to_csv("solution_fin4.csv", index=False)