In [1]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Read the labelled corpus from a CSV file (path is resolved relative to the
# current working directory) and preview the first five rows.
df = pd.read_csv("Language Detection.csv")
df.head(n=5)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [9]:
# Translation table that deletes every ASCII punctuation character.  It is
# built once so that each call makes a single pass over the text instead of
# one ``str.replace`` pass per punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_pun(text):
    """Strip ASCII punctuation from ``text`` and lower-case the result.

    Only the characters listed in ``string.punctuation`` are removed;
    punctuation outside that set (e.g. non-ASCII quotation marks) is kept.

    Parameters
    ----------
    text : str
        The string to normalise.

    Returns
    -------
    str
        ``text`` with ASCII punctuation removed, converted to lower case.
    """
    return text.translate(_PUNCTUATION_TABLE).lower()

In [11]:
# Normalise every document in place: strip ASCII punctuation and lower-case
# it via remove_pun, writing the result back to the 'Text' column.
df['Text'] = df['Text'].map(remove_pun)

0         nature in the broadest sense is the natural p...
1        nature can refer to the phenomena of the physi...
2        the study of nature is a large if not the only...
3        although humans are part of nature human activ...
4        1 the word nature is borrowed from the old fre...
                               ...                        
10332    ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333    ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334    ಹೇಗೆ  ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ...
10335    ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...
10336    ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...
Name: Text, Length: 10337, dtype: object

In [12]:
# Preview the first five rows of the frame.
# NOTE(review): the recorded output for this cell still shows capitalised,
# punctuated text, which suggests it was produced before the 'Text' column was
# overwritten -- re-run from a fresh kernel to refresh it.
df.head(n=5)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Select features and labels by column name rather than by position, so the
# assignment stays correct if the CSV ever gains or reorders columns.
x = df['Text']       # sentence text (model input)
y = df['Language']   # language label for each sentence (target)

In [18]:
# Hold out 20% of the rows for evaluation.  The shuffle is seeded so that the
# metrics reported further down are reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [21]:
from sklearn import feature_extraction

In [24]:
# TF-IDF features over character unigrams and bigrams.
vec = feature_extraction.text.TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 2),
)

In [35]:
from sklearn import pipeline
from sklearn import linear_model

In [38]:
# Chain the vectoriser and a default-configured logistic regression into one
# estimator so that raw text can be passed straight to fit/predict.
model_pipe = pipeline.Pipeline(
    steps=[
        ('vec', vec),
        ('clf', linear_model.LogisticRegression()),
    ]
)

In [39]:
# Fit the vectoriser and the classifier on the training split.
model_pipe.fit(X=x_train, y=y_train)

In [40]:
# Predicted language labels for the held-out split.
predict_val = model_pipe.predict(X=x_test)

In [44]:
from sklearn import metrics

In [47]:
# Accuracy on the held-out split, expressed as a percentage.
100 * metrics.accuracy_score(y_true=y_test, y_pred=predict_val)

97.58220502901354

In [48]:
# Confusion matrix for the held-out split (rows: true label, columns:
# predicted label, following scikit-learn's convention).
metrics.confusion_matrix(y_true=y_test, y_pred=predict_val)

array([[102,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,  78,   2,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   2],
       [  0,   1, 100,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          1,   0,   0,   0],
       [  0,   0,   0, 281,   1,   0,   0,   0,   1,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0, 185,   1,   0,   0,   2,   0,   0,   2,   0,
          1,   0,   0,   0],
       [  0,   0,   2,   0,   0,  99,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,  77,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  14,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   1,   1,   0,   0,   0, 134,   0,   0,   1,   0,
          4,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  77,   0,   0,   0,
         

In [50]:
# Spot-check the model on a single sentence.  The pipeline was fitted on text
# that had been passed through remove_pun, so the same normalisation is applied
# here to keep inference inputs consistent with the training inputs.
model_pipe.predict([remove_pun('आप कैसे हैं?')])

array(['Hindi'], dtype=object)

In [51]:
import pickle

In [52]:
# Persist the fitted pipeline to disk.  The context manager guarantees the
# file handle is closed even if pickling raises part-way through.
# NOTE: unpickling can execute arbitrary code -- only load 'model.pckl' from a
# source you trust.
with open('model.pckl', 'wb') as new_file:
    pickle.dump(model_pipe, new_file)