# Dineo Kheswa Classification Hackathon
## Importing Libraries

In [1]:
# Packages for data analysis
import pandas as pd
import numpy as np
import time

# Packages for visualizations
import seaborn as sns
import matplotlib.style as style

# Packages for preprocessing
import nltk
import string
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

# Packages for training models
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import metrics

# Model Evaluation Packages
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt
%matplotlib inline


## Reading in the Dataset

In [2]:
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
train.head(2)

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...


In [4]:
test.head(2)

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...


In [5]:
sample_submission.head(2)

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


## Defining a text cleaning function

 This function uses regular expressions to remove html characters,
    punctuation, numbers and any extra white space from each text
    and then converts them to lowercase.

In [6]:
def clean_text(text):

    text=re.sub('<.*?>', ' ', text)
    text = re.sub("\n"," ",text)
    text = text.lower()
    text=' '.join(text.split())
    return text

In [7]:
train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

In [8]:
train["text"] = train["text"].str.replace(".txt", " text file")
test["text"] = test["text"].str.replace(".txt", " text file")

  train["text"] = train["text"].str.replace(".txt", " text file")
  test["text"] = test["text"].str.replace(".txt", " text file")


## Separating the features from the target column.

In [9]:
X = train['text']
y = train['lang_id']

## split the tarin data and the test data using train_test_split

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.01)

## Fitting the multinomial NB model using cross validation and grid search to obtain optimal parameters.

In [12]:
grid = {'alpha': [0.1, 1, 5, 10]} 

mnb = Pipeline([('tfidf', TfidfVectorizer(min_df=2,
                                                max_df=0.9,
                                                ngram_range=(1, 2))),
                      ('mnb', GridSearchCV(MultinomialNB(),
                                           param_grid=grid,
                                           cv=5,
                                           n_jobs=-1,
                                           scoring='f1_weighted'))
                      ])

mnb.fit(X_train, y_train)  # Fitting the model

y_pred = mnb.predict(X_val)  # predicting the fit on validation set

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00        44
         eng       1.00      1.00      1.00        32
         nbl       1.00      1.00      1.00        23
         nso       1.00      1.00      1.00        30
         sot       1.00      1.00      1.00        26
         ssw       1.00      1.00      1.00        35
         tsn       1.00      1.00      1.00        32
         tso       1.00      1.00      1.00        24
         ven       1.00      1.00      1.00        30
         xho       1.00      1.00      1.00        29
         zul       1.00      1.00      1.00        25

    accuracy                           1.00       330
   macro avg       1.00      1.00      1.00       330
weighted avg       1.00      1.00      1.00       330



## Creating a submission file.

In [14]:
submission_df = pd.DataFrame(test['index'])
submission_df['lang_id'] = mnb.predict(test['text'])
submission_df.to_csv('Multinomial_NB4.csv', index=False)