**Data Acquisition in form of zip file through Kaggle**

In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
!kaggle datasets download -d ishantjuyal/language-detection-dataset

Downloading language-detection-dataset.zip to /content
 79% 8.00M/10.2M [00:01<00:00, 12.8MB/s]
100% 10.2M/10.2M [00:01<00:00, 9.34MB/s]


**Extracting the zip file**

In [3]:
from zipfile import ZipFile
dataset = '/content/language-detection-dataset.zip'

with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print('Extracted')

Extracted


In [32]:
import pandas as pd

In [87]:
import numpy as np

**Reading the CSV file**

In [33]:
table=pd.read_csv('languages.csv')

In [34]:
table.head()

Unnamed: 0,text,language
0,ich denke es handelt sich hier um ein missvers...,german
1,ich habe tom gerade erst verlassen,german
2,tom versuchte mary nur zu ärgern,german
3,tom hat mir die hand geküsst,german
4,ich wusste dass dir das gefiele,german


In [35]:
table.shape

(763684, 2)

Dataset contain only 4 Languages :
* English
* German
* French
* Spanish

In [36]:
table['language'].value_counts()

english    275687
german     199618
french     169693
spanish    118686
Name: language, dtype: int64

In [62]:
max([len(x) for x in table['text']])

251

Shufflling the dataset rows

In [38]:
table=table.sample(frac=1)

In [39]:
table.head()

Unnamed: 0,text,language
48705,bist du sicher dass das zu toms gunsten ist,german
510440,ive never been in trouble with the law,english
319814,on dirait que tu nas pas envie de te trouver là,french
644996,restart your computer,english
669891,the boy over there is bowing to you,english


In [40]:
table=table.head(100000)

In [41]:
table.head()

Unnamed: 0,text,language
48705,bist du sicher dass das zu toms gunsten ist,german
510440,ive never been in trouble with the law,english
319814,on dirait que tu nas pas envie de te trouver là,french
644996,restart your computer,english
669891,the boy over there is bowing to you,english


In [42]:
table['language'].value_counts()

english    36074
german     26095
french     22262
spanish    15569
Name: language, dtype: int64

In [43]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [44]:
tokenizer=Tokenizer()

Fitting text in tokenizer

In [45]:
tokenizer.fit_on_texts(table['text'])

In [51]:
len(tokenizer.word_index)

43093

Splitting language columns in individual languages columns

In [47]:
y=pd.get_dummies(table['language'])

Now transforming all the text into sequences

In [48]:
input_sequences=[]
for text in table['text']:
  sequences=tokenizer.texts_to_sequences([text])[0]
  input_sequences.append(sequences)

In [49]:
len(input_sequences)

100000

In [50]:
input_sequences

[[217, 19, 655, 57, 25, 32, 155, 21156, 18],
 [197, 181, 179, 13, 1088, 65, 5, 3555],
 [56, 3721, 9, 44, 1526, 17, 1581, 6, 79, 1076, 553],
 [14821, 72, 1269],
 [5, 950, 442, 120, 12, 21157, 4, 8],
 [155, 481, 105, 109, 2811, 82, 14],
 [7416, 18, 13, 3269],
 [217, 19, 2415],
 [10, 268, 4907, 68, 981, 514],
 [3018, 1247, 1285, 1944, 813],
 [609, 958, 6141, 26, 656],
 [1, 1502, 1163],
 [45, 1132, 847, 5260, 32, 118],
 [7417, 2, 11, 3931, 6, 1],
 [21, 1503, 50, 4611],
 [15, 18, 6716, 130, 373, 74, 188, 32, 355],
 [58, 657, 114, 1, 291],
 [53, 2470, 1945, 1614, 30, 211, 21158],
 [1, 269, 3137],
 [37, 4908, 3270],
 [74, 83, 36, 2161, 1740, 6717, 36, 4909, 2812, 908],
 [1, 770, 6718, 218, 4130],
 [41, 2, 222, 5663, 785, 3271, 443, 21, 3138, 50, 328, 2914, 11668],
 [1, 5261, 6, 11669, 2, 26, 940, 8383],
 [1, 54, 4131, 39, 3272],
 [105, 120, 295, 14822],
 [6, 464, 1303, 504, 340],
 [52, 403, 59, 21159, 4, 11670],
 [1, 1313, 108, 2, 5262, 510],
 [3, 42, 225, 254],
 [588, 79, 2006, 247],
 [48, 1

In [52]:
from tensorflow.keras.utils import pad_sequences

Now padding the sequences

In [63]:
padded=pad_sequences(input_sequences,maxlen=251,padding='pre')

In [64]:
padded

array([[    0,     0,     0, ...,   155, 21156,    18],
       [    0,     0,     0, ...,    65,     5,  3555],
       [    0,     0,     0, ...,    79,  1076,   553],
       ...,
       [    0,     0,     0, ...,    72,  1622, 10351],
       [    0,     0,     0, ...,     5, 21154,   510],
       [    0,     0,     0, ...,   172,   723,   956]], dtype=int32)

In [65]:
padded.shape

(100000, 251)

In [67]:
X=padded

In [88]:
y

Unnamed: 0,english,french,german,spanish
48705,0,0,1,0
510440,1,0,0,0
319814,0,1,0,0
644996,1,0,0,0
669891,1,0,0,0
...,...,...,...,...
404526,0,0,0,1
27102,0,0,1,0
587237,1,0,0,0
585583,1,0,0,0


In [68]:
from sklearn.model_selection import train_test_split

Splitting data into train and test data

In [69]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [70]:
X_train.shape,X_test.shape

((75000, 251), (25000, 251))

In [61]:
from tensorflow.keras import models, layers

Making a LSTM model

In [79]:
model=models.Sequential()
model.add(layers.Embedding(43094,100,input_length=251))
model.add(layers.LSTM(150))
model.add(layers.Dense(4,activation='softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [80]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 251, 100)          4309400   
                                                                 
 lstm_2 (LSTM)               (None, 150)               150600    
                                                                 
 dense_2 (Dense)             (None, 4)                 604       
                                                                 
Total params: 4460604 (17.02 MB)
Trainable params: 4460604 (17.02 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


Fitting the training data in our LSTM model

In [81]:
model.fit(X_train,y_train,epochs=2,validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7bb2181dfdc0>

Evaluating testing data using our model

In [82]:
model.evaluate(X_test,y_test)



[0.009768753312528133, 0.9963200092315674]

We got -
* Training data accuracy - 99.97%
* Validation data accuracy - 99.64%
* Testing data accuracy - 99.63%

Now testing the model with random texts

In [94]:
text="hey how are you"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.0000000e+00 8.0010114e-13 1.7990047e-13 1.9177344e-14]]


'english'

In [93]:
text="hey wie geht es dir"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[3.1988776e-11 6.2164236e-14 1.0000000e+00 5.0150724e-11]]


'german'

In [95]:
text="hola qué tal"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.4998490e-08 1.1339991e-07 2.6048286e-07 9.9999964e-01]]


'spanish'

In [96]:
text="hey comment allez-vous"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[6.8375878e-09 1.0000000e+00 8.1281065e-10 1.5190922e-09]]


'french'

In [97]:
text="ravi de vous rencontrer"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.1370515e-08 9.9999988e-01 5.2540017e-09 7.6381326e-08]]


'french'

In [98]:
text="nice to meet you"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.000000e+00 9.418105e-11 3.066401e-11 3.532886e-12]]


'english'

In [99]:
text="encantada de conocerte"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[0.000619   0.43994427 0.00083852 0.5585982 ]]


'spanish'

In [101]:
text="schön dich kennenzulernen"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[5.2113810e-06 2.5751194e-07 9.9998617e-01 8.3200302e-06]]


'german'