In [50]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns



In [51]:
# Load the labelled corpus; the preview below shows a Text column and a Language column.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will not
# run on another machine as-is. The directory listing at the end of the notebook shows
# a LanguageDetection.csv in the working directory - presumably the same file; confirm
# before switching to a relative path.
df = pd.read_csv("D:/Data/PU/Semester_3/NlpProject/Nlp1/LanguageDetection.csv")

df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [52]:
# Translation table that deletes every ASCII punctuation character. Built once so
# each call is a single pass over the text instead of one str.replace per symbol.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_pun(text):
    """Strip ASCII punctuation from ``text`` and lowercase the result.

    Only the characters in ``string.punctuation`` are removed. Whitespace,
    digits and any non-ASCII punctuation are left untouched.

    Parameters
    ----------
    text : str
        The raw text to normalise.

    Returns
    -------
    str
        ``text`` without ASCII punctuation, converted to lowercase.

    Raises
    ------
    AttributeError
        If ``text`` is not a string (for example a missing value read as NaN).
    """
    return text.translate(_PUNCT_TABLE).lower()

In [53]:
# Normalise the Text column element by element with remove_pun and store the
# cleaned strings back under the same column name.
df['Text'] = df['Text'].map(remove_pun)

df.head()

Unnamed: 0,Text,Language
0,nature in the broadest sense is the natural p...,English
1,nature can refer to the phenomena of the physi...,English
2,the study of nature is a large if not the only...,English
3,although humans are part of nature human activ...,English
4,1 the word nature is borrowed from the old fre...,English


In [54]:
from sklearn.model_selection import train_test_split

In [55]:
# Select features and labels by column name rather than by position, so the
# model cannot silently train on the wrong column if the CSV gains or reorders
# columns (for example a saved index column in front of Text).
x = df['Text']
y = df['Language']

In [56]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and therefore every metric computed from it, is the same on each re-run.
# Outputs recorded from an earlier unseeded run may differ slightly from a fresh run.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=42)

In [57]:
from sklearn import feature_extraction

In [58]:
# Character-level TF-IDF features built from single characters and adjacent
# character pairs (1-grams and 2-grams).
vector = feature_extraction.text.TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 2),
)

In [59]:
from sklearn import pipeline
from sklearn import linear_model 

In [60]:
# Two-stage pipeline: the TF-IDF vectoriser feeds a logistic-regression
# classifier left at its default settings.
model = pipeline.Pipeline(
    steps=[
        ('vector', vector),
        ('clf', linear_model.LogisticRegression()),
    ]
)

In [61]:
# Fit the whole pipeline (vectoriser, then classifier) on the training split.
model.fit(x_train,y_train)

In [62]:
model.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [63]:
# Predicted labels for the held-out texts; reused by the metric cells that follow.
model_pred = model.predict(x_test)

In [64]:
from sklearn import metrics

In [65]:
# Accuracy on the held-out split, scaled to a percentage.
metrics.accuracy_score(y_test,model_pred) * 100

97.82398452611218

In [66]:
# Confusion matrix for the held-out split (true labels passed first, predictions second).
metrics.confusion_matrix(y_test,model_pred)

array([[120,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,  84,   0,   3,   1,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   2,   0,   0],
       [  0,   1,  92,   1,   1,   3,   0,   0,   0,   0,   0,   1,   0,
          0,   1,   0,   0],
       [  0,   0,   0, 269,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   1, 193,   0,   0,   0,   1,   0,   0,   1,   0,
          0,   0,   0,   0],
       [  0,   0,   1,   0,   1,  99,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   1],
       [  0,   0,   0,   0,   0,   0,  76,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   9,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   2,   0,   2,   0,   0,   0,   0, 138,   0,   0,   0,   0,
          2,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  72,   0,   0,   0,
         

In [67]:
model.predict(['मैं तुम्हें एक बात की सलाह देता हूँ'])

array(['Hindi'], dtype=object)

In [68]:
model.predict(['Arpan Chaudhary'])

array(['English'], dtype=object)

In [69]:
import pickle


In [70]:
# Persist the fitted pipeline to disk. The context manager closes the file
# handle even if pickling raises, so the file is not left open or half-flushed.
with open('model_nlp1.pkl','wb') as model_file:
    pickle.dump(model, model_file)

In [71]:
import os

In [72]:
os.listdir()


['app.py',
 'index.html',
 'language.ipynb',
 'LanguageDetection.csv',
 'model_nlp1.pkl']