In [3]:
import pandas as pd


In [9]:
df = pd.read_csv('./Language-Detection.csv')

In [10]:
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [11]:
df.isnull().sum()

Text        0
Language    0
dtype: int64

In [12]:
df['Language'].value_counts()

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

In [13]:
X = df['Text']
y = df['Language']

In [14]:
import re
data_list = []
for text in X:
    text = re.sub(r'[!@#$(),n"%^*?:;~`0-9]', ' ', text)
    text = re.sub(r'[[]]', ' ', text)
    text = text.lower()
    data_list.append(text)

  text = re.sub(r'[[]]', ' ', text)


In [15]:
X = data_list

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [17]:
len(X_train)

6925

In [18]:
len(X_test)

3412

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

lang_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

lang_clf.fit(X_train, y_train)

In [20]:
Predictions = lang_clf.predict(X_test)

In [21]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [22]:
print(confusion_matrix(y_test, Predictions))

[[178   0   0   0   0   0   0   0   0   0   0   0   4   0   0   0   0]
 [  0 129   0   1   1   2   0   0   1   0   0   1   2   0   2   0   0]
 [  0   0 181   0   1   1   0   0   0   0   0   0   3   1   0   0   0]
 [  0   0   0 477   0   1   0   0   4   0   0   0   3   1   0   0   0]
 [  0   0   1   2 331   0   0   0   2   0   0   0   3   1   0   0   0]
 [  0   2   0   0   0 141   0   0   0   0   0   0   3   0   0   0   1]
 [  0   0   0   1   0   0 107   0   0   0   0   0   4   0   0   0   0]
 [  0   0   0   0   0   0   0  20   0   0   0   0   0   0   0   0   0]
 [  0   0   0   3   0   0   0   0 229   0   0   0   0   5   0   0   0]
 [  0   0   0   0   0   0   0   0   0 107   0   0   2   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0 195   0   4   0   0   0   0]
 [  0   0   0   1   0   0   0   0   1   0   0 230   2   7   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0 217   0   0   0   0]
 [  0   0   0   2   0   0   0   0   0   0   1   2   3 260   0   0   1]
 [  0 

In [23]:
print(classification_report(y_test, Predictions))

              precision    recall  f1-score   support

      Arabic       1.00      0.98      0.99       182
      Danish       0.96      0.93      0.94       139
       Dutch       0.99      0.97      0.98       187
     English       0.97      0.98      0.98       486
      French       0.99      0.97      0.98       340
      German       0.97      0.96      0.96       147
       Greek       1.00      0.96      0.98       112
       Hindi       1.00      1.00      1.00        20
     Italian       0.96      0.97      0.96       237
     Kannada       1.00      0.98      0.99       109
   Malayalam       0.99      0.97      0.98       200
  Portugeese       0.99      0.95      0.97       241
     Russian       0.83      1.00      0.91       217
     Spanish       0.94      0.97      0.95       269
    Sweedish       0.99      0.94      0.96       215
       Tamil       1.00      0.98      0.99       155
     Turkish       0.99      0.92      0.95       156

    accuracy              

In [24]:
print(accuracy_score(y_test, Predictions))

0.9671746776084408


In [25]:
# Arabic
lang_clf.predict(["مرحبا كيف حالك؟"])

array(['Arabic'], dtype=object)

In [26]:
# Danish
lang_clf.predict(["Hej hvordan går det?"])

array(['Sweedish'], dtype=object)

In [27]:
# Dutch
lang_clf.predict(["Hallo hoe gaat het?"])

array(['Dutch'], dtype=object)

In [28]:
# English
lang_clf.predict(["Hi, how are you?"])

array(['English'], dtype=object)

In [29]:
# French
lang_clf.predict(["Salut comment ça va?"])

array(['French'], dtype=object)

In [30]:
# German
lang_clf.predict(["Hi, wie geht es dir?"])

array(['German'], dtype=object)

In [31]:
# Hindi
lang_clf.predict(["नमस्ते, आप कैसे हैं?"])

array(['Hindi'], dtype=object)

In [32]:
# Kannada
lang_clf.predict(["ನಮಸ್ಕಾರ ಹೇಗಿದ್ದೀರಾ?"])

array(['Kannada'], dtype=object)

In [33]:
# Malayalam
lang_clf.predict(["ഹായ്, സുഖമാണോ?"])

array(['Russian'], dtype=object)

In [34]:
# Tamil
lang_clf.predict(["ஹாய், நீங்கள் எப்படி இருக்கிறீர்கள்?"])

array(['Tamil'], dtype=object)

import joblib 
joblib.dump(lang_clf,'LangDetect.joblib')


In [36]:
import joblib 
joblib.dump(lang_clf,'LangDetect.joblib')

['LangDetect.joblib']