# South African Language Identification
   EDSA 2022 Classification Hackathon

### BY ETENG UKET EFFIOM

## Challenge description 

We are required to use NLP's Language Identification to classify a given text into one of the 11 Official South African Languages, determining the natural language that a piece of text is written in.

## Notebook Outline

1. Package Installation
2. Importing Libraries
3. Loading the Data
4. Data Cleaning and Formatting
5. Exploratory Data Analysis
6. Modelling
7. Submission

## 1. Package Installation

All the required packages have already been installed.

## 2. Importing Libraries 

In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import matplotlib.pyplot as plt
#import nltk
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
import re

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## 3. Loading the Data

In [None]:
# Read the training and test sets from CSV files in the working directory.
train_path, test_path = "train_set.csv", "test_set.csv"
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

### 3.1 Viewing the Data

In [None]:
# Preview the first rows of the training set.
df_train.head()

In [None]:
# Preview the first rows of the test set.
df_test.head()

## 4. Data Cleaning and Formatting

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
# `info` is a method: without the call parentheses the cell only echoes the
# bound method instead of producing the summary.
df_train.info()

In [None]:
# Print a summary of the test frame (columns, dtypes, non-null counts).
# `info` is a method: without the call parentheses the cell only echoes the
# bound method instead of producing the summary.
df_test.info()

### Number of samples per language category

In [None]:
# Count how many rows carry each language label.
df_train['lang_id'].value_counts()

In [None]:
# Add a lower-cased copy of the raw text as a new 'text_' column on both frames.
for frame in (df_train, df_test):
    frame['text_'] = frame['text'].str.lower()

In [None]:
def removing_punctuations(data):
    """Return ``data`` with every ASCII punctuation character removed.

    Args:
        data: The string to clean.

    Returns:
        A copy of ``data`` without any character from ``string.punctuation``.
    """
    import string

    # The third argument of ``str.maketrans`` lists the characters to delete;
    # an empty translation table would leave the text unchanged.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return data.translate(punctuation_table)
# Run the punctuation-removal helper over the lower-cased text of both frames.
df_train['text_'] = df_train['text_'].apply(removing_punctuations)
df_test['text_'] = df_test['text_'].apply(removing_punctuations)

In [None]:
def text(data):
    """Return ``data`` with all ASCII digits removed.

    Args:
        data: The string to clean.

    Returns:
        A copy of ``data`` without the characters ``0`` through ``9``.
    """
    # The character class needs the range ``0-9``; ``[09]`` would match only
    # the two characters 0 and 9.
    return re.sub(r'[0-9]', '', data)

## 5. Exploratory Data Analysis

In [None]:
# Print the number of training rows, then how many of those texts consist
# solely of whitespace characters.
text_column = df_train['text']
print(len(text_column))
print(sum(text_column.apply(lambda value: value.isspace())))

In [None]:
# Histogram of the target column: how many samples each language label has.
plt.hist(df_train['lang_id'], label='text')
plt.legend()
plt.title('Distribution')

## 6. Modelling

In [None]:
# Features are the raw text column; the target is the language label.
X, y = df_train['text'], df_train['lang_id']

In [None]:
# Hold out 20% of the training data for validation and train on the other 80%;
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def _tfidf_pipeline(step_name, estimator):
    """Chain a default TfidfVectorizer with ``estimator`` under ``step_name``."""
    return Pipeline([('tfidf', TfidfVectorizer()), (step_name, estimator)])


# One TF-IDF + classifier pipeline per candidate model.
tree = _tfidf_pipeline('tree', DecisionTreeClassifier())
rfc = _tfidf_pipeline('rfc', RandomForestClassifier())
ridge = _tfidf_pipeline('Ridge', RidgeClassifier())
Lsvc = _tfidf_pipeline('scv', LinearSVC())
logreg = _tfidf_pipeline('logistic', LogisticRegression())
SGD = _tfidf_pipeline('SGD', SGDClassifier())
svc = _tfidf_pipeline('SVC', SVC())
MNB = _tfidf_pipeline('MNB', MultinomialNB())

In [None]:
# Fit the decision-tree pipeline on the training split.
tree.fit(X_train, y_train)

In [None]:
# Fit the random-forest pipeline on the training split.
rfc.fit(X_train, y_train)

In [None]:
# Fit the ridge-classifier pipeline on the training split.
ridge.fit(X_train, y_train)

In [None]:
# Fit the SVC pipeline on the training split.
svc.fit(X_train, y_train)

In [None]:
# Fit the logistic-regression pipeline on the training split.
logreg.fit(X_train, y_train)

In [None]:
# Fit the LinearSVC pipeline on the training split.
Lsvc.fit(X_train, y_train)

In [None]:
# Fit the SGDClassifier pipeline on the training split.
SGD.fit(X_train, y_train)

In [None]:
# Fit the MultinomialNB pipeline on the training split.
# The pipeline is bound to the name `MNB`; `nbm` is not defined anywhere.
MNB.fit(X_train, y_train)

### model prediction

### Decision Tree Classifier

In [None]:
# Predict on the validation split with the decision-tree pipeline.
predictions = tree.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predicted labels. Swapping the arguments would transpose the matrix.
confusion_matrix(y_test, predictions)

### model performance

In [None]:
# Per-class precision, recall and F1-score for the decision-tree predictions.
print(classification_report(y_test, predictions))

### Random Forest Classifier 

In [None]:
# Predict on the validation split with the random-forest pipeline.
predictions = rfc.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predicted labels. Swapping the arguments would transpose the matrix.
confusion_matrix(y_test, predictions)

### model performance

In [None]:
# Per-class precision, recall and F1-score for the random-forest predictions.
print(classification_report(y_test, predictions))

### Ridge Classifier

In [None]:
# Predict on the validation split with the ridge-classifier pipeline.
predictions = ridge.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predicted labels. Swapping the arguments would transpose the matrix.
confusion_matrix(y_test, predictions)

### model performance

In [None]:
# Per-class precision, recall and F1-score for the ridge-classifier predictions.
print(classification_report(y_test, predictions))

### SVC

In [None]:
# Predict on the validation split with the SVC pipeline.
predictions = svc.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predicted labels. Swapping the arguments would transpose the matrix.
confusion_matrix(y_test, predictions)

### model performance

In [None]:
# Per-class precision, recall and F1-score for the SVC predictions.
print(classification_report(y_test, predictions))

### LogisticRegression

In [None]:
# Predict on the validation split with the logistic-regression pipeline.
predictions = logreg.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predicted labels. Swapping the arguments would transpose the matrix.
confusion_matrix(y_test, predictions)

### model performance

In [None]:
# Per-class precision, recall and F1-score for the logistic-regression predictions.
print(classification_report(y_test, predictions))

### LinearSVC

In [None]:
# Predict on the validation split with the LinearSVC pipeline.
predictions = Lsvc.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predicted labels. Swapping the arguments would transpose the matrix.
confusion_matrix(y_test, predictions)

### model performance

In [None]:
# Per-class precision, recall and F1-score for the LinearSVC predictions.
print(classification_report(y_test, predictions))

### SGDClassifier

In [None]:
# Predict on the validation split with the SGDClassifier pipeline.
predictions = SGD.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predicted labels. Swapping the arguments would transpose the matrix.
confusion_matrix(y_test, predictions)

### model performance

In [None]:
# Per-class precision, recall and F1-score for the SGDClassifier predictions.
print(classification_report(y_test, predictions))

### MultinomialNB

In [None]:
# Predict on the validation split with the MultinomialNB pipeline.
predictions = MNB.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are true labels, columns are
# predicted labels. Swapping the arguments would transpose the matrix.
confusion_matrix(y_test, predictions)

In [None]:
# Per-class precision, recall and F1-score for the MultinomialNB predictions.
print(classification_report(y_test, predictions))

## 7. Submission

In [None]:
# Predict the language of every test-set text with the MultinomialNB pipeline.
test  = df_test['text']
pred = MNB.predict(test)

In [None]:
# Kaggle submission: pair each test-set index with its predicted language and
# write the result to CSV without the DataFrame's own row index.
submission = pd.DataFrame({'index': df_test['index'], 'lang_id': pred})
submission.to_csv('Eteng_Uket_Main.csv', index=False)

In [None]:
# Display the submission frame for a final visual check.
submission