### 1. Importing all the essential libraries

In [1]:
import string 
import re
import codecs
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
import itertools

### 2. Loading the data

In [2]:
# Loading english raw data

english_df = pd.read_csv("english.txt", "utf-8", header=None, names=["english"])

  exec(code_obj, self.user_global_ns, self.user_ns)
  english_df = pd.read_csv("english.txt", "utf-8", header=None, names=["english"])


In [3]:
english_df.head()

Unnamed: 0,english
0,The Project Gutenberg eBook of The Life and Ad...
1,This eBook is for the use of anyone anywhere i...
2,most other parts of the world at no cost and w...
3,"whatsoever. You may copy it, give it away or r..."
4,of the Project Gutenberg License included with...


In [4]:
# Loading raw german data
kazakh_df = pd.read_csv("kazakh.txt", "utf-8", header=None, names=["kazakh"])
kazakh_df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)
  kazakh_df = pd.read_csv("kazakh.txt", "utf-8", header=None, names=["kazakh"])


Unnamed: 0,kazakh
0,"Таң алдында бiр ғана сағат мызғығаны болмаса, ..."
1,Абайдың жайшылықтағы оқуынан бүгiнгi оқуының м...
2,"Парсы, түркi кiтаптары бұны бiресе Шираздың гү..."
3,"Оқи отырып, кей жайларды анық айқын етiп хатқа..."
4,Бұл кiтаптардан алған хабардың бәрi қазiр атта...


In [5]:
# Loading raw french data

russian_df = pd.read_csv("russian.txt", "utf-8", header=None, names=["russian"])
russian_df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)
  russian_df = pd.read_csv("russian.txt", "utf-8", header=None, names=["russian"])


Unnamed: 0,russian
0,Федор Достоевский
1,ИДИОТ
2,ЧАСТЬ ПЕРВАЯ.
3,I.
4,"В конце ноября, в оттепель, часов в девять утр..."


### 3. Data Preprocessing

In [6]:
for char in string.punctuation:
    print(char, end = ' ')
punctuations_table = dict((ord(char), None) for char in string.punctuation)

! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ 

In [7]:
# Cleaning the data for english dataset

data_eng = []
lang_eng = []

for i, line in english_df.iterrows():
    line = line['english']
    if len(line) !=0:
        line = line.lower()
        line = re.sub(r"\d+", "", line)
        line = line.translate(punctuations_table)
        data_eng.append(line)
        lang_eng.append("english")

In [8]:
# Cleaning the data for kazakh dataset

data_kaz = []
lang_kaz = []

for i, line in kazakh_df.iterrows():
    line = line['kazakh']
    if len(line) !=0:
        line = line.lower()
        line = re.sub(r"\d+", "", line)
        line = line.translate(punctuations_table)
        data_kaz.append(line)
        lang_kaz.append("kazakh")

In [9]:
# Cleaning the data for russian dataset

data_rus = []
lang_rus = []

for i, line in russian_df.iterrows():
    line = line['russian']
    if len(line) !=0:
        line = line.lower()
        line = re.sub(r"\d+", "", line)
        line = line.translate(punctuations_table)
        data_rus.append(line)
        lang_rus.append("russian")

### 4. Transforming the data into a single dataset

In [10]:
df = pd.DataFrame({
    "Text" : data_eng+data_kaz+data_rus,
    "Language" : lang_eng+lang_kaz+lang_rus
})

print(df.shape)

(17667, 2)


In [11]:
df.head()

Unnamed: 0,Text,Language
0,the project gutenberg ebook of the life and ad...,english
1,this ebook is for the use of anyone anywhere i...,english
2,most other parts of the world at no cost and w...,english
3,whatsoever you may copy it give it away or reu...,english
4,of the project gutenberg license included with...,english


In [12]:
df.tail()

Unnamed: 0,Text,Language
17662,учительша прискакав в павловск явилась прямо к...,russian
17663,рогожин выдержал два месяца воспаления в мозгу...,russian
17664,лебедев келлер ганя птицын и многие другие лиц...,russian
17665,го января,russian
17666,,russian


In [13]:
df['Language'].value_counts()

english    9619
russian    4480
kazakh     3568
Name: Language, dtype: int64

### 5. Splitting the dataset

* Splitting the dataset into Independent and Dependent variables

In [14]:
x = df.iloc[:,0] # Independent Variable
y = df.iloc[:,1] # Dependent Variable

In [15]:
y.head()

0    english
1    english
2    english
3    english
4    english
Name: Language, dtype: object

In [16]:
x.head()

0    the project gutenberg ebook of the life and ad...
1    this ebook is for the use of anyone anywhere i...
2    most other parts of the world at no cost and w...
3    whatsoever you may copy it give it away or reu...
4    of the project gutenberg license included with...
Name: Text, dtype: object

In [17]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## Applying TF-IDF Vectorizer

* `ngram_range` : It collets one , one two, one two three words 
* `analyzer` : We are not going word by word here we are going character by character that why we have used **char**.
* `classifier`: model building second step


In [18]:
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1,3), analyzer='char')
x = vectorizer.fit_transform(df['Text'])

In [19]:
pipe_mnb = pipeline.Pipeline([
    ('vectorizer', vectorizer),
    ('clf', MultinomialNB())
])

In [20]:
pipe_mnb.fit(x_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
                ('clf', MultinomialNB())])

In [21]:
pipe_lang_det = pipeline.Pipeline([
    ('vectorizer', vectorizer),
    ('_lang_det_clf', LogisticRegression())
])

In [22]:
pipe_lang_det.fit(x_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
                ('_lang_det_clf', LogisticRegression())])

## Model Prediction for Logistic Regression

In [23]:
lang_det_predicted = pipe_lang_det.predict(x_test)

## Model evaluation for Logistic Regression

In [24]:
lang_det_acc = (metrics.accuracy_score(y_test, lang_det_predicted))*100
print('The logistic regression has:',lang_det_acc,'% accuracy')

The logistic regression has: 99.88681380871533 % accuracy


## Model Prediction for MultinomialNB

In [25]:
mnb_predicted = pipe_mnb.predict(x_test)

## Model Evaluation for MultinomialNB

In [26]:
mnb_acc = (metrics.accuracy_score(y_test, mnb_predicted))*100
print('The MultinomialNB has :',mnb_acc,'% accuracy')

The MultinomialNB has : 99.51895868704018 % accuracy


## Evaluation matrix for Logistic Regression

In [27]:
matrix = metrics.confusion_matrix(y_test, lang_det_predicted)
print('Confusion matrix: \n', matrix)

Confusion matrix: 
 [[1959    0    0]
 [   0  721    4]
 [   0    0  850]]


In [28]:
import pickle

In [29]:
lang_det_file = open('lang_det_model.pckl', 'wb')
pickle.dump(pipe_lang_det, lang_det_file)
lang_det_file.close()

In [30]:
global LdLangDetectModel
ldLangDetectFile = open('lang_det_model.pckl', 'rb')
LdLangDetectModel = pickle.load(ldLangDetectFile)
ldLangDetectFile.close()