# Importing Libraries

In [54]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import feature_extraction
from sklearn import pipeline
from sklearn import linear_model
from sklearn import metrics

# Importing Data

In [55]:
df = pd.read_csv('Language Detection.csv')
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


# Cleaning Dataset

In [56]:
# Remove Punctuation

def remove_pun(text):
    for pun in string.punctuation:
        text = text.replace(pun,"")
    text = text.lower()
    return text

In [57]:
df['Text'] = df['Text'].apply(remove_pun)

In [58]:
from sklearn.model_selection import train_test_split
X = df.iloc[:,0]
Y = df.iloc[:,1]

In [59]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2)

In [60]:
vec = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2), analyzer='char')
model_pipe = pipeline.Pipeline([('vec', vec), ('clf', linear_model.LogisticRegression())])

In [61]:
model_pipe.fit(X_train,Y_train)

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
                ('clf', LogisticRegression())])

In [62]:
model_pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [63]:
predict_val = model_pipe.predict(X_test)
metrics.accuracy_score(Y_test,predict_val)*100

97.5338491295938

In [64]:
metrics.confusion_matrix(Y_test, predict_val)

array([[114,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,  73,   0,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   2,   0,   1],
       [  0,   1, 118,   2,   0,   2,   0,   0,   0,   0,   0,   0,   0,
          1,   0,   0,   0],
       [  0,   2,   1, 281,   0,   0,   0,   0,   2,   0,   0,   1,   0,
          1,   0,   0,   0],
       [  0,   0,   0,   1, 187,   1,   0,   0,   0,   0,   0,   1,   0,
          1,   0,   0,   0],
       [  0,   1,   1,   1,   1,  86,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   0],
       [  0,   0,   0,   0,   0,   0,  70,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   9,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   1,   1,   0,   0,   0,   0,   0, 125,   0,   0,   0,   0,
          2,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  66,   0,   0,   0,
         

In [65]:
model_pipe.predict(['Jay Shri Ram'])

array(['English'], dtype=object)

In [66]:
model_pipe.predict(['जय श्री राम'])

array(['Hindi'], dtype=object)

In [67]:
model_pipe.predict(['ஜெய் ஸ்ரீ ராம்'])

array(['Tamil'], dtype=object)

# Converting pickle model

In [73]:
import pickle
new_file = open('model.pckl', 'wb')
pickle.dump(model_pipe, new_file)
new_file.close()

### Look into directories 'for model.pkl' file

In [76]:
import os
os.listdir()

['app.py',
 'Language Detection.csv',
 '.ipynb_checkpoints',
 'model.pckl',
 'Untitled.ipynb']