**Data Acquisition in form of zip file through Kaggle**

In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
!kaggle datasets download -d ishantjuyal/language-detection-dataset

Downloading language-detection-dataset.zip to /content
 69% 7.00M/10.2M [00:00<00:00, 13.4MB/s]
100% 10.2M/10.2M [00:00<00:00, 12.5MB/s]


**Extracting the zip file**

In [3]:
from zipfile import ZipFile
dataset = '/content/language-detection-dataset.zip'

with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print('Extracted')

Extracted


In [4]:
import pandas as pd

In [5]:
import numpy as np

**Reading the CSV file**

In [6]:
table=pd.read_csv('languages.csv')

In [7]:
table.head()

Unnamed: 0,text,language
0,ich denke es handelt sich hier um ein missvers...,german
1,ich habe tom gerade erst verlassen,german
2,tom versuchte mary nur zu ärgern,german
3,tom hat mir die hand geküsst,german
4,ich wusste dass dir das gefiele,german


In [8]:
table.shape

(763684, 2)

Dataset contain only 4 Languages :
* English
* German
* French
* Spanish

In [9]:
table['language'].value_counts()

english    275687
german     199618
french     169693
spanish    118686
Name: language, dtype: int64

In [10]:
max([len(x) for x in table['text']])

527

**Shufflling the dataset rows**

In [11]:
table=table.sample(frac=1)

In [12]:
table.head()

Unnamed: 0,text,language
457756,por favor ayúdeme,spanish
651922,i go home right after work,english
523264,im toms aunt,english
354856,cest le village où jai passé mon enfance,french
396819,la leche es una bebida popular,spanish


In [13]:
table=table.head(100000)

In [14]:
table.head()

Unnamed: 0,text,language
457756,por favor ayúdeme,spanish
651922,i go home right after work,english
523264,im toms aunt,english
354856,cest le village où jai passé mon enfance,french
396819,la leche es una bebida popular,spanish


In [15]:
table['language'].value_counts()

english    36068
german     26237
french     22231
spanish    15464
Name: language, dtype: int64

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [17]:
tokenizer=Tokenizer()

**Fitting text in tokenizer**

In [18]:
tokenizer.fit_on_texts(table['text'])

In [19]:
len(tokenizer.word_index)

42750

**Splitting language columns in individual languages columns**

In [20]:
y=pd.get_dummies(table['language'])

**Now transforming all the text into sequences**

In [21]:
input_sequences=[]
for text in table['text']:
  sequences=tokenizer.texts_to_sequences([text])[0]
  input_sequences.append(sequences)

In [22]:
len(input_sequences)

100000

In [23]:
input_sequences

[[92, 448, 21293],
 [3, 109, 248, 348, 441, 269],
 [45, 150, 4625],
 [110, 22, 2710, 317, 77, 852, 181, 6768],
 [11, 2341, 14, 102, 14858, 2159],
 [1, 19, 21294, 266, 404, 833, 215, 28, 12, 187],
 [1, 2808, 87, 15],
 [941, 224, 1921, 111, 5, 597, 19, 313],
 [274, 376, 58, 2, 324],
 [21, 123, 306, 844],
 [1, 18, 251, 4626, 147, 20],
 [1, 82, 875, 147, 94, 90, 1685, 21295],
 [1, 171, 106, 1720, 152, 13, 1068],
 [10, 256, 373, 7485, 6, 157, 1102, 3568],
 [10, 521, 1872, 25, 1792, 172, 957, 162, 6, 367, 927, 51, 986],
 [2915, 12, 46, 1140, 2013],
 [7, 67, 285, 32, 303],
 [48, 8, 106, 52, 212, 87, 1922, 325, 1, 2, 604],
 [297, 21296, 367, 14859, 2916],
 [115, 259, 2, 11688, 148, 51, 377, 2711],
 [10,
  174,
  647,
  9,
  34,
  8423,
  9,
  10,
  30,
  1686,
  17,
  26,
  3569,
  6,
  670,
  29,
  334,
  7486],
 [164, 149, 3259, 82, 314, 14860, 6769],
 [3, 197, 677],
 [383, 134, 28, 218, 487, 4, 5, 319, 21297],
 [1873, 12, 46, 3745],
 [63, 272, 8424, 19, 63, 112],
 [3, 42, 2, 5299, 31, 21298

In [24]:
from tensorflow.keras.utils import pad_sequences

**Now padding the sequences**

In [25]:
padded=pad_sequences(input_sequences,maxlen=251,padding='pre')

In [26]:
padded

array([[    0,     0,     0, ...,    92,   448, 21293],
       [    0,     0,     0, ...,   348,   441,   269],
       [    0,     0,     0, ...,    45,   150,  4625],
       ...,
       [    0,     0,     0, ..., 14561,    11,  4224],
       [    0,     0,     0, ...,   888,     4,   546],
       [    0,     0,     0, ...,    11,  2662,  1412]], dtype=int32)

In [27]:
padded.shape

(100000, 251)

In [28]:
X=padded

In [29]:
y

Unnamed: 0,english,french,german,spanish
457756,0,0,0,1
651922,1,0,0,0
523264,1,0,0,0
354856,0,1,0,0
396819,0,0,0,1
...,...,...,...,...
756366,1,0,0,0
421632,0,0,0,1
417754,0,0,0,1
570576,1,0,0,0


In [30]:
from sklearn.model_selection import train_test_split

**Splitting data into train and test data**

In [31]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [32]:
X_train.shape,X_test.shape

((75000, 251), (25000, 251))

In [33]:
from tensorflow.keras import models, layers

**Making a LSTM model**

In [34]:
model=models.Sequential()
model.add(layers.Embedding(43094,100,input_length=251))
model.add(layers.LSTM(150))
model.add(layers.Dense(4,activation='softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [35]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 251, 100)          4309400   
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 4)                 604       
                                                                 
Total params: 4460604 (17.02 MB)
Trainable params: 4460604 (17.02 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


**Fitting the training data in our LSTM model**

In [36]:
model.fit(X_train,y_train,epochs=2,validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7ce6a50a3eb0>

**Evaluating testing data using our model**

In [49]:
loss,accuracy=model.evaluate(X_test,y_test)



**Accuracy of test data**

In [50]:
accuracy

0.9965999722480774

We got -
* Training data accuracy - 99.97%
* Validation data accuracy - 99.63%
* Testing data accuracy - 99.65%

**Now exporting the model and tokenizer**

In [46]:
import pickle

In [47]:
pickle.dump(model,open('language_model.pkl','wb'))

In [48]:
pickle.dump(tokenizer,open('lang_tokenizer.pkl','wb'))

**Now testing the model with random texts**

In [38]:
text="hey how are you"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.0000000e+00 4.5087944e-10 1.7999489e-09 2.0819265e-11]]


'english'

In [39]:
text="hey wie geht es dir"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.3627700e-07 2.1866445e-07 9.9999928e-01 3.9904080e-07]]


'german'

In [40]:
text="hola qué tal"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[4.9582006e-08 2.1591487e-07 4.1147774e-07 9.9999940e-01]]


'spanish'

In [41]:
text="hey comment allez-vous"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.5122242e-07 9.9999940e-01 3.7872346e-07 1.1265216e-07]]


'french'

In [42]:
text="ravi de vous rencontrer"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.8963352e-07 9.9999690e-01 5.9182287e-07 2.2399236e-06]]


'french'

In [43]:
text="nice to meet you"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.0000000e+00 2.9004510e-09 6.2102172e-09 1.2937737e-10]]


'english'

In [44]:
text="encantada de conocerte"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[2.3195335e-04 7.5105749e-02 1.2954951e-03 9.2336690e-01]]


'spanish'

In [45]:
text="schön dich kennenzulernen"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.6583881e-05 2.1212945e-05 9.9993098e-01 3.1266918e-05]]


'german'