# NLP Language Detection

### Project Prerequisites

In [1]:
import pandas as pd
import spacy
import numpy as np
import re
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn import pipeline
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.metrics import balanced_accuracy_score

### Preparing the Data

In [2]:
# Load the two source corpora: the multi-language set and the separate Hindi set.
uniq_lang_df = pd.read_csv("Data/UniqueLang.csv")
hindi_df = pd.read_csv("Data/hindi.csv")

# Preview the first two rows of each frame to confirm both loaded as expected.
for preview_df in (uniq_lang_df, hindi_df):
    print(preview_df.head(2))

                                                Text Language
0   Nature, in the broadest sense, is the natural...  English
1  "Nature" can refer to the phenomena of the phy...  English
                                                Text Language
0  चंद्रमोहन शर्मा को-प्रड्यूसर और लीड ऐक्टर अक्ष...    Hindi
1  अगर आप इस फिल्म को देखने जा रहे हैं तो सबसे पह...    Hindi


In [3]:
# Stack both corpora into a single frame; ignore_index=True renumbers the rows
# so the combined frame has one continuous integer index.
source_frames = [uniq_lang_df, hindi_df]
lang_df = pd.concat(source_frames, ignore_index=True)
lang_df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [4]:
# Show the full, untruncated text of the row labelled 1 as a spot check.
lang_df.loc[1, "Text"]

'"Nature" can refer to the phenomena of the physical world, and also to life in general.'

In [5]:
# Exclude every sample labelled Russian from the dataset.
# The filtered result is assigned back explicitly instead of using inplace=True,
# so the filtering step is visible as a plain assignment. The .copy() makes the
# result an independent frame, which keeps later column assignments on lang_df
# free of SettingWithCopyWarning.
lang_df = lang_df.query("Language != 'Russian'").copy()

### Regex Processing


In [6]:
# Build the cleaned text column "ReText" by deleting a fixed set of punctuation
# and symbol characters (, " ' . : ; * / % $ # @ &) and embedded newlines.
# The pattern is a raw string so that the backslash escape is interpreted by the
# regex engine rather than by the Python parser; a non-raw literal containing
# "\/" is an invalid string escape and triggers a warning on recent Python.
lang_df["ReText"] = lang_df["Text"].str.replace(r"""[,"'.\n:;*/%$#@&]""", "", regex=True)
lang_df.head()

Unnamed: 0,Text,Language,ReText
0,"Nature, in the broadest sense, is the natural...",English,Nature in the broadest sense is the natural p...
1,"""Nature"" can refer to the phenomena of the phy...",English,Nature can refer to the phenomena of the physi...
2,"The study of nature is a large, if not the onl...",English,The study of nature is a large if not the only...
3,"Although humans are part of nature, human acti...",English,Although humans are part of nature human activ...
4,[1] The word nature is borrowed from the Old F...,English,[1] The word nature is borrowed from the Old F...


### Handling Class Imbalance


### Splitting the Data

In [7]:
# Features are the cleaned text; the target is the language label.
X, y = lang_df["ReText"], lang_df["Language"]

In [8]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2022,
)

In [9]:
# Two-stage classifier: token counts from CountVectorizer feed an
# L2-regularised logistic regression.
pipeline_steps = [
    ("vec", CountVectorizer()),
    ("lr", LogisticRegression(penalty="l2")),
]
clf = pipeline.Pipeline(steps=pipeline_steps)

# Fit on the training split; the fitted pipeline is the cell's displayed value.
clf.fit(X_train, y_train)

In [10]:
# Predict labels for the held-out split and print per-language
# precision / recall / F1 for them.
test_result = clf.predict(X_test)
report_text = classification_report(y_true=y_test, y_pred=test_result)
print(report_text)

              precision    recall  f1-score   support

      Arabic       1.00      0.94      0.97       109
      Danish       0.97      0.91      0.94        92
       Dutch       1.00      0.94      0.97       115
     English       0.99      0.98      0.99       281
      French       0.99      0.95      0.97       198
      German       0.98      0.97      0.97        91
       Greek       1.00      0.97      0.98        67
       Hindi       1.00      1.00      1.00       142
     Italian       0.99      0.94      0.96       140
     Kannada       1.00      0.96      0.98        79
   Malayalam       1.00      0.96      0.98       134
  Portugeese       1.00      0.95      0.98       147
     Spanish       0.93      0.97      0.95       167
    Sweedish       0.98      0.96      0.97       133
       Tamil       1.00      0.97      0.99        77
     Turkish       0.66      0.99      0.79       101

    accuracy                           0.96      2073
   macro avg       0.97   

In [11]:
# Interactive check: classify one user-supplied sentence and show the
# per-language probabilities.
test_sample = input("Test text: ")

# The pipeline was fitted on the "ReText" column, i.e. text with a fixed set of
# punctuation/symbol characters and newlines removed. Apply the same character
# removal here so the vectorizer receives inference text in the same form as
# its training text (otherwise e.g. apostrophes tokenize differently).
processed_sample = [re.sub(r"""[,"'.\n:;*/%$#@&]""", "", test_sample)]

# predict() returns an array with one label per input sample.
ans = clf.predict(processed_sample)

# One row of class probabilities, with columns labelled by the fitted classes.
proba = clf.predict_proba(processed_sample)
proba_df = pd.DataFrame(proba,columns=clf.classes_)

# Transpose so each language becomes a row, then give the two columns names.
proba_df_clear = proba_df.T.reset_index()
proba_df_clear.columns = ["Lanuages","Prediction Probability"]

print(f"The ans is: {ans}")
proba_df_clear

Test text: സുഖമാണോ?
The ans is: ['Malayalam']


Unnamed: 0,Lanuages,Prediction Probability
0,Arabic,0.070586
1,Danish,0.044714
2,Dutch,0.045024
3,English,0.063711
4,French,0.050366
5,German,0.050902
6,Greek,0.054427
7,Hindi,0.011258
8,Italian,0.060326
9,Kannada,0.081869
