In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
from sklearn.metrics import accuracy_score, log_loss, precision_score, recall_score, precision_recall_curve, f1_score, classification_report
from langdetect import detect_langs
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from sklearn.model_selection import GridSearchCV
import re
import string
from sklearn.naive_bayes import GaussianNB ,CategoricalNB 
import emoji

In [2]:
df=pd.read_csv('train_set.csv')
df_test =pd.read_csv('test_set.csv')

In [3]:
display(df.head())
display(df_test.head())

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [4]:
replace_numbers=re.compile(r'\d+',re.IGNORECASE)

def clean_tex2t(input_text):
    text = input_text.lower()
    text = replace_numbers.sub('', text)

    text = text.replace('ã…â¡', 'š')
    text = text.replace('ï¿½', '')
    text = text.replace('ª', '')
    text = text.rstrip('"')
    text = text.lstrip(' "')

    text = text.replace('\n', '') 
    text = re.sub(r"\bhttps://t.co/\w+", 'url', text) 
    text = re.sub('\w*\d\w*', ' ', text) 
    text = re.sub(r'\s\s+', ' ', text)
    return text

In [5]:
def clean_text(text):
    text=re.sub('<.*?>', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub("\n"," ",text)
    text = text.lower()
    text=' '.join(text.split())
    
    return text

In [6]:
df['text'] = df['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

In [7]:
X = df.text
y=df.lang_id

In [8]:
X_train , X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [9]:
maxdf=0.9
mindf=1

In [10]:
vectorizer_dict = {'TF_1': TfidfVectorizer(max_df = maxdf, min_df = mindf,analyzer='char'),
                   'TF_2': TfidfVectorizer(ngram_range=(1,3), max_df = maxdf, min_df = mindf,analyzer='char'),
                   'TF_3': TfidfVectorizer(ngram_range=(2,3), max_df = maxdf, min_df = mindf,analyzer='char'),
                   'TF_4': TfidfVectorizer(ngram_range=(1,5),max_df = maxdf, min_df = mindf,analyzer='char'),
                   'TF_5' : TfidfVectorizer(ngram_range=(1, 6),analyzer='char',max_df=maxdf, min_df=mindf),
                  'TF_6' : TfidfVectorizer(ngram_range=(3, 6),analyzer='char',max_df=maxdf, min_df=mindf)}

In [11]:
model_dict = {  'Naive Bayes NB': MultinomialNB(), 
              'LinearSVC2' : LinearSVC(),}          


In [12]:
class_results_dict=defaultdict(list)

In [13]:
for vec_name, vectorizer in vectorizer_dict.items():
    
    X_train_cv = vectorizer.fit_transform(X_train)
    X_test_cv  = vectorizer.transform(X_test)
    print(vec_name) # keep track of progress
    
    for mod_name, model in model_dict.items():
        model.fit(X_train_cv, y_train);
        y_pred_cv = model.predict(X_test_cv)
        
        precision_cv = precision_score(y_test, y_pred_cv,average='macro')
        recall_cv = recall_score(y_test, y_pred_cv,average='macro')
        f1_cv = f1_score(y_test, y_pred_cv,average='macro')
        
        
        class_results_dict['Vectorizer Type'].append(vec_name)
        class_results_dict['Model Name'].append(mod_name)
        class_results_dict[('Precision')].append(precision_cv)
        class_results_dict[('Recall')].append(recall_cv)
        class_results_dict[('F1-score')].append(f1_cv)

print('Completed')
        

class_results_df = pd.DataFrame(class_results_dict)

TF_1
TF_2
TF_3
TF_4
TF_5
TF_6
Completed


In [14]:
class_results_df.sort_values(by='F1-score', ascending=False)

Unnamed: 0,Vectorizer Type,Model Name,Precision,Recall,F1-score
10,TF_6,Naive Bayes NB,0.999855,0.99984,0.999847
6,TF_4,Naive Bayes NB,0.999851,0.99984,0.999845
7,TF_4,LinearSVC2,0.999851,0.99984,0.999845
8,TF_5,Naive Bayes NB,0.999851,0.99984,0.999845
9,TF_5,LinearSVC2,0.999851,0.99984,0.999845
11,TF_6,LinearSVC2,0.999703,0.999686,0.999694
3,TF_2,LinearSVC2,0.999086,0.999073,0.999078
5,TF_3,LinearSVC2,0.999086,0.999073,0.999078
2,TF_2,Naive Bayes NB,0.998167,0.998142,0.998154
4,TF_3,Naive Bayes NB,0.998167,0.998142,0.998154


In [29]:
cv_final = vectorizer_dict['TF_6']

In [30]:
X = df.text
y= df.lang_id
X_test_final = df_test.text

In [31]:
cv_train_final = cv_final.fit_transform(X)
cv_test_final = cv_final.transform(X_test_final)

In [32]:
model = MultinomialNB()

In [33]:
model.fit(cv_train_final,y)

MultinomialNB()

In [34]:
pred_final = model.predict(cv_test_final)

In [35]:
submission = pd.DataFrame({'index': df_test.index +1, 'lang_id': pred_final})
submission.to_csv('rob_lang_ident_NB_submission.csv', index=False)