# South African Language Identification EDSA 2021 Classification

## Importing Packages

In [1]:
# Standard libraries
import re
import numpy as np
import pandas as pd

# Preprocessing  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# Building classification model
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Model evaluation
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score

## Loading Datasets

In [2]:
# Read the labelled training set and the unlabelled test set from the working directory
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("train_set.csv", "test_set.csv")
)

In [3]:
# Preview the first five rows of the training data
display(df_train.head(n=5))

# Summarise the training data: column names, non-null counts and dtypes
df_train.info()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


- The training dataset has 33,000 entries and 2 variables, neither of which contains null values. Both variables are of type object.

In [4]:
# Preview the first five rows of the test data
display(df_test.head(n=5))

# Summarise the test data: column names, non-null counts and dtypes
df_test.info()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5682 entries, 0 to 5681
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   5682 non-null   int64 
 1   text    5682 non-null   object
dtypes: int64(1), object(1)
memory usage: 88.9+ KB


- The test dataset has 5,682 entries and 2 variables, neither of which contains null values. There is one integer variable and one object variable.

## Data Preprocessing

#### Data Cleaning

The data cleaning process will include the following:
- Converting text to lowercase
- Removing noise:
    - symbols,
    - numbers,
    - punctuation,
    - extra whitespace

In [5]:
# ASCII punctuation plus the em dash. The characters are passed through
# re.escape so that '-' is a literal hyphen inside the character class rather
# than forming an accidental range (the previous pattern's "$-_" matched every
# code point from '$' to '_', while still missing '"', '`', '{', '|', '}', '~').
_PUNCTUATION = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~—'
_PUNCTUATION_RE = re.compile('[' + re.escape(_PUNCTUATION) + ']+')


def clean_text(text):
    """Normalise a raw sentence before vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. delete '#' characters without leaving a gap ("#tag" -> "tag");
      3. delete digit runs;
      4. replace each run of punctuation with a single space;
      5. collapse every whitespace run (spaces, tabs, newlines) to one space;
      6. strip leading and trailing whitespace.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercase text with single-space separators.
    """
    text = text.lower()

    # '#' is removed outright (not replaced by a space) so hashtags stay one token.
    text = text.replace('#', '')

    text = re.sub(r'\d+', '', text)  # Remove numbers

    text = _PUNCTUATION_RE.sub(' ', text)  # Punctuation becomes a word separator

    text = re.sub(r'\s+', ' ', text)  # Collapse any whitespace run to one space

    # Punctuation at either end of the sentence leaves a space behind; trim both sides.
    return text.strip()
# Store a cleaned copy of the raw text column on both the training and test frames
df_train['clean_text'] = df_train['text'].map(clean_text)
df_test['clean_text'] = df_test['text'].map(clean_text)

In [6]:
# Compare the raw and cleaned text side by side for the first five test rows
df_test.head(n=5)

Unnamed: 0,index,text,clean_text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...",mmasepala fa maemo a a kgethegileng a letlelel...
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,tshivhumbeo tshi fana na ngano dza vhathu
3,4,Kube inja nelikati betingevakala kutsi titsini...,kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.,winste op buitelandse valuta


## Modeling

### Preparing for modeling

In [7]:
# Separate the feature (cleaned text) and target (language label) variables
X, y = df_train['clean_text'], df_train['lang_id']

# Hold out 20% of the training data as a validation set (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
# Linear SVC pipeline: TF-IDF features feeding a class-balanced linear SVM.
# random_state is pinned because LinearSVC's solver uses a random number
# generator while fitting; without it, repeated runs can give slightly
# different models. 42 matches the seed used for the train/validation split.
lsvc = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(class_weight='balanced', random_state=42)),
])

### Base Models

In [9]:
# Train the pipeline on the training split, then label the held-out validation split
y_pred_lsvc = lsvc.fit(X_train, y_train).predict(X_test)

In [10]:
# Per-language precision, recall and F1 of the Linear SVC model on the validation split
print(classification_report(y_test, y_pred_lsvc))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       1.00      1.00      1.00       615
         nbl       0.99      0.99      0.99       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       1.00      1.00      1.00       584
         tsn       1.00      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       1.00      1.00      1.00       609
         zul       0.99      0.99      0.99       590

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



### Kaggle Submission

In [11]:
# Predict a language label for every row of the unlabelled Kaggle test set.
# NOTE(review): this rebinds y_test, which until now held the validation labels
# produced by train_test_split; the classification-report cell cannot be
# re-run correctly after this cell without re-running the split first.
# Consider a distinct name for these predictions.
y_test = lsvc.predict(df_test.loc[:, 'clean_text'])

In [12]:
# Build the submission table: each test row's identifier next to its predicted language
results = df_test[['index']].assign(lang_id=y_test)

In [13]:
# Write the submission file; the DataFrame's own row index is left out of the CSV
results.to_csv(path_or_buf='submission.csv', index=False)

In [14]:
# Display the submission table (the rendered output elides the middle rows)
results

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot
