In [43]:
import re
import string
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [44]:
# Load the labelled corpus into a DataFrame.
# NOTE(review): the path is an absolute Google Drive mount location, so this
# cell only works where that mount exists.
dataset_path = '/content/drive/MyDrive/Dataset/Language Detection.csv'
df = pd.read_csv(dataset_path)

In [45]:
# Preview the first ten rows of the loaded frame.
df.head(n=10)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
5,"[2] In ancient philosophy, natura is mostly us...",English
6,"[3][4] \nThe concept of nature as a whole, the...",English
7,During the advent of modern scientific method ...,English
8,"[5][6] With the Industrial revolution, nature ...",English
9,"However, a vitalist vision of nature, closer t...",English


In [46]:
def remove_whitespace(text):
  """Collapse every run of whitespace in ``text`` to a single space.

  Leading and trailing whitespace is dropped because ``str.split()`` with no
  arguments discards empty fields at both ends.
  """
  words = text.split()
  return " ".join(words)

def remove_url(text):
  """Remove URL-like tokens from ``text``.

  A token is removed when it starts (at a word boundary) with ``http``,
  ``https``, ``www`` or ``xxx``; the match then extends to the next
  whitespace character so the whole token is deleted.

  The previous pattern ``http|https|www|xxx\\S+`` applied ``\\S+`` only to the
  ``xxx`` alternative, so for ``https://example.com`` only the literal
  ``http`` was deleted and ``s://example.com`` was left behind. It also
  matched inside ordinary words (e.g. the ``www`` in ``awwwesome``).

  Parameters
  ----------
  text : str
      Input text.

  Returns
  -------
  str
      ``text`` with the matching tokens replaced by the empty string.
  """
  # Group the alternatives so the trailing \S* applies to all of them;
  # \b keeps the prefixes from matching in the middle of a word.
  return re.sub(r'\b(?:https?|www|xxx)\S*', '', text)

def remove_special_char(text):
  """Replace every character that is not a letter, mark or number with a space.

  Characters are classified by Unicode general category: letters (``L*``),
  combining marks (``M*``) and numbers (``N*``) are kept, everything else
  (punctuation, symbols, whitespace, control characters) becomes a single
  space.

  The previous ASCII-only pattern ``[^a-zA-Z0-9]`` turned every non-Latin
  letter into a space, which blanks out text written in Cyrillic, Greek,
  Arabic or Indic scripts. For pure-ASCII input the result is unchanged.

  Parameters
  ----------
  text : str
      Input text.

  Returns
  -------
  str
      A string of the same length as ``text``.
  """
  return "".join(
    ch if unicodedata.category(ch)[0] in "LMN" else " "
    for ch in text
  )
  
def valid_character_filtering(text):
  """Drop non-printable characters from ``text``.

  A character is kept when ``str.isprintable()`` is true for it or when it is
  one of the ASCII whitespace characters in ``string.whitespace``.

  The previous test against ``string.printable`` only recognised ASCII, so
  every non-ASCII letter was deleted. For pure-ASCII input the result is the
  same as before.

  Parameters
  ----------
  text : str
      Input text.

  Returns
  -------
  str
      ``text`` without the characters that failed the check.
  """
  return "".join(
    ch for ch in text
    if ch.isprintable() or ch in string.whitespace
  )

def remove_long_number(text):
  """Delete every run of digits from ``text``.

  Despite the name, there is no length threshold: the pattern ``\\d+`` matches
  digit runs of any length, including a single digit.
  """
  return re.sub(r'\d+', '', text)

def remove_punctuation(text):
  """Replace punctuation and symbols in ``text`` with spaces.

  Any character that is neither a word character (``\\w``) nor whitespace
  (``\\s``) is replaced by a single space, except Unicode combining marks
  (general category ``M*``), which are kept.

  Python's ``\\w`` does not match combining marks, so the previous plain
  substitution replaced vowel signs and viramas in Indic scripts with spaces
  and split words apart. Text without combining marks is handled exactly as
  before.

  Parameters
  ----------
  text : str
      Input text.

  Returns
  -------
  str
      A string of the same length as ``text``.
  """
  def _space_unless_mark(match):
    ch = match.group()
    return ch if unicodedata.category(ch).startswith("M") else " "

  return re.sub(r"[^\w\s]", _space_unless_mark, text)

In [47]:
# Run each cleaning step over the Text column, in this order: lowercase,
# collapse whitespace, strip URLs, blank special characters, drop
# non-printables, remove digits, blank punctuation.
for cleaning_step in (
    str.lower,
    remove_whitespace,
    remove_url,
    remove_special_char,
    valid_character_filtering,
    remove_long_number,
    remove_punctuation,
):
    df['Text'] = df['Text'].apply(cleaning_step)

In [48]:
# Number of rows per language label, most frequent first.
language_counts = df['Language'].value_counts()
language_counts

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [49]:
# Features are the cleaned text, targets are the language names.
x, y = df["Text"], df["Language"]

In [50]:
# Map language names to integer class ids.
# Fit from the DataFrame column rather than from `y`: the old form
# `y = le.fit_transform(y)` overwrote its own input, so running the cell a
# second time re-fitted the encoder on integers and `le.inverse_transform`
# stopped returning language names.
# `LabelEncoder` is imported in the imports cell at the top of the notebook.
le = LabelEncoder()
y = le.fit_transform(df["Language"])

In [51]:
# Materialise the text column as a plain list of lowercased strings.
X = [text.lower() for text in x]

In [53]:
# Build the vocabulary from the corpus. The tokenizer is configured with
# num_words=10000 and "<OOV>" as the out-of-vocabulary token.
tokenizer = Tokenizer(
    num_words=10000,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index

# Convert each document to a sequence of word ids, then pad or truncate at
# the end so every sequence has length 100.
training_sequences = tokenizer.texts_to_sequences(X)
training_padded = pad_sequences(
    training_sequences,
    maxlen=100,
    padding='post',
    truncating='post',
)

In [54]:
# Convert the inputs and labels to NumPy arrays before training.
training_padded, training_labels = np.array(training_padded), np.array(y)

In [94]:
# Embedding -> two stacked bidirectional GRUs -> average over time -> MLP head.
# 10000 and 100 match the tokenizer's num_words and the padding maxlen used
# above; 17 is the number of distinct language labels in the dataset.
layers = tf.keras.layers
model = tf.keras.Sequential([
    layers.Embedding(10000, 16, input_length=100),
    layers.Bidirectional(layers.GRU(25, return_sequences=True, activation='tanh')),
    layers.Bidirectional(layers.GRU(25, return_sequences=True, activation='tanh')),
    layers.GlobalAveragePooling1D(),
    # The pooled output is already 2-D, so Flatten leaves the shape unchanged.
    layers.Flatten(),
    layers.Dense(50, activation='relu'),
    layers.Dense(25, activation='relu'),
    layers.Dense(17, activation='softmax'),
])
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 100, 16)           160000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 50)           6450      
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 50)           11550     
_________________________________________________________________
global_average_pooling1d_8 ( (None, 50)                0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 50)                0         
_________________________________________________________________
dense_38 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_39 (Dense)             (None, 25)              

In [103]:
# Train for 10 epochs on the full padded dataset.
# NOTE(review): no validation data is passed, so the reported accuracy is
# training accuracy only.
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=10,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [110]:
def predict(text):
    """Print the predicted language of ``text``.

    The input goes through the same cleaning steps, in the same order, that
    were applied to the training column before the tokenizer was fitted. It is
    then tokenised, padded to length 100 and passed to the trained model.
    Previously the raw string was tokenised directly, so inference saw
    differently-formed tokens than training did.

    Parameters
    ----------
    text : str
        A single sentence or document.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Mirror the training-time preprocessing.
    cleaned = text.lower()
    for cleaning_step in (
        remove_whitespace,
        remove_url,
        remove_special_char,
        valid_character_filtering,
        remove_long_number,
        remove_punctuation,
    ):
        cleaned = cleaning_step(cleaned)

    sequences = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')
    probabilities = model.predict(padded)
    # One row per input; take the argmax over the class axis of that row
    # rather than over the flattened array.
    class_id = int(np.argmax(probabilities, axis=-1)[0])
    lang = le.inverse_transform([class_id])
    print("The language is in", lang[0])

# Smoke-test the classifier on one English, one French and one Spanish sample.
for sample in (
    " It was an awesome experience",
    'Bien, merci. Et vous-même ?',
    'Hola, me llamo Juan',
):
    predict(sample)

The langauge is in English
The langauge is in French
The langauge is in Spanish
