### Task #1: Perform imports and load the dataset into a pandas DataFrame

In [37]:
!pip install pandas



You should consider upgrading via the 'C:\Python\Python3.9.7\python.exe -m pip install --upgrade pip' command.


In [38]:
import pandas as pd

In [39]:
df = pd.read_csv('Language Detection.csv')

In [40]:
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [41]:
!pip install sklearn



You should consider upgrading via the 'C:\Python\Python3.9.7\python.exe -m pip install --upgrade pip' command.


### Task #2: Check for missing values:

In [42]:
df.isnull().sum()

Text        0
Language    0
dtype: int64

### Task #3: Check for different target labels:

In [43]:
df['Language'].value_counts()

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

### Task #4: Split the data into train & test sets:

In [44]:
X = df['Text']
y = df['Language']

In [45]:
import re
data_list = []
for text in X:
    text = re.sub(r'[!@#$(),n"%^*?:;~`0-9]', ' ', text)
    text = re.sub(r'[[]]', ' ', text)
    text = text.lower()
    data_list.append(text)

In [46]:
X = data_list

In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [48]:
len(X_train)

6925

In [49]:
len(X_test)

3412

### Task #5: Build a pipeline to vectorize the date, then train and fit a model

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

lang_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

lang_clf.fit(X_train, y_train)

In [51]:
Predictions = lang_clf.predict(X_test)

### Task #6: Run predictions and analyze the results

In [52]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [53]:
print(confusion_matrix(y_test, Predictions))

[[178   0   0   0   0   0   0   0   0   0   0   0   4   0   0   0   0]
 [  0 129   0   1   1   2   0   0   1   0   0   1   2   0   2   0   0]
 [  0   0 181   0   1   1   0   0   0   0   0   0   3   1   0   0   0]
 [  0   0   0 477   0   1   0   0   4   0   0   0   3   1   0   0   0]
 [  0   0   1   2 331   0   0   0   2   0   0   0   3   1   0   0   0]
 [  0   2   0   0   0 141   0   0   0   0   0   0   3   0   0   0   1]
 [  0   0   0   1   0   0 107   0   0   0   0   0   4   0   0   0   0]
 [  0   0   0   0   0   0   0  20   0   0   0   0   0   0   0   0   0]
 [  0   0   0   3   0   0   0   0 229   0   0   0   0   5   0   0   0]
 [  0   0   0   0   0   0   0   0   0 107   0   0   2   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0 195   0   4   0   0   0   0]
 [  0   0   0   1   0   0   0   0   1   0   0 230   2   7   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0 217   0   0   0   0]
 [  0   0   0   2   0   0   0   0   0   0   1   2   3 260   0   0   1]
 [  0 

In [54]:
print(classification_report(y_test, Predictions))

              precision    recall  f1-score   support

      Arabic       1.00      0.98      0.99       182
      Danish       0.96      0.93      0.94       139
       Dutch       0.99      0.97      0.98       187
     English       0.97      0.98      0.98       486
      French       0.99      0.97      0.98       340
      German       0.97      0.96      0.96       147
       Greek       1.00      0.96      0.98       112
       Hindi       1.00      1.00      1.00        20
     Italian       0.96      0.97      0.96       237
     Kannada       1.00      0.98      0.99       109
   Malayalam       0.99      0.97      0.98       200
  Portugeese       0.99      0.95      0.97       241
     Russian       0.83      1.00      0.91       217
     Spanish       0.94      0.97      0.95       269
    Sweedish       0.99      0.94      0.96       215
       Tamil       1.00      0.98      0.99       155
     Turkish       0.99      0.92      0.95       156

    accuracy              

In [55]:
print(accuracy_score(y_test, Predictions))

0.9671746776084408


### Task #7: Now, Test with random input language phrase - "Hi, How are you?"

In [56]:
# Arabic
lang_clf.predict(["مرحبا كيف حالك؟"])

array(['Arabic'], dtype=object)

In [57]:
# Danish
lang_clf.predict(["Hej hvordan går det?"])


array(['Sweedish'], dtype=object)

In [58]:
# Dutch
lang_clf.predict(["Hallo hoe gaat het?"])

array(['Dutch'], dtype=object)

In [59]:
# English
lang_clf.predict(["Hi, how are you?"])

array(['English'], dtype=object)

In [60]:
# French
lang_clf.predict(["Salut comment ça va?"])

array(['French'], dtype=object)

In [61]:
# German
lang_clf.predict(["Hi, wie geht es dir?"])

array(['German'], dtype=object)

In [62]:
# Greek
lang_clf.predict(["Γεια πως εισαι?"])

array(['Greek'], dtype=object)

In [63]:
# Hindi
lang_clf.predict(["नमस्ते, आप कैसे हैं?"])

array(['Hindi'], dtype=object)

In [64]:
# Italian
lang_clf.predict(["Ciao, come stai?"])

array(['Italian'], dtype=object)

In [65]:
# Kannada
lang_clf.predict(["ನಮಸ್ಕಾರ ಹೇಗಿದ್ದೀರಾ?"])

array(['Kannada'], dtype=object)

In [66]:
# Malayalam
lang_clf.predict(["ഹായ്, സുഖമാണോ?"])

array(['Russian'], dtype=object)

In [67]:
# Portugeese
lang_clf.predict(["Oi como você está?"])

array(['Portugeese'], dtype=object)

In [68]:
# Russian
lang_clf.predict(["Привет, как ты?"])

array(['Russian'], dtype=object)

In [69]:
# Spanish
lang_clf.predict(["¿Hola, cómo estás?"])

array(['Spanish'], dtype=object)

In [70]:
# Sweedish
lang_clf.predict(["Hej hur mår du?"])

array(['Sweedish'], dtype=object)

In [71]:
# Tamil
lang_clf.predict(["ஹாய், நீங்கள் எப்படி இருக்கிறீர்கள்?"])

array(['Tamil'], dtype=object)

In [72]:
# Turkish
lang_clf.predict(["Merhaba nasılsın?"])

array(['Russian'], dtype=object)

In [73]:
# Italian
lang_clf.predict(["ciao come stai?"])

array(['Italian'], dtype=object)