**Data Acquisition in form of zip file through Kaggle**

In [226]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [227]:
!kaggle datasets download -d ishantjuyal/language-detection-dataset

language-detection-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


**Extracting the zip file**

In [228]:
from zipfile import ZipFile
dataset = '/content/language-detection-dataset.zip'

with ZipFile(dataset,'r') as zip:
  zip.extractall()
  print('Extracted')

Extracted


In [229]:
import pandas as pd

In [230]:
import numpy as np

**Reading the CSV file**

In [231]:
table=pd.read_csv('languages.csv')

In [232]:
table.head()

Unnamed: 0,text,language
0,ich denke es handelt sich hier um ein missvers...,german
1,ich habe tom gerade erst verlassen,german
2,tom versuchte mary nur zu ärgern,german
3,tom hat mir die hand geküsst,german
4,ich wusste dass dir das gefiele,german


In [233]:
table.shape

(763684, 2)

Dataset contain only 4 Languages :
* English
* German
* French
* Spanish

In [234]:
table['language'].value_counts()

english    275687
german     199618
french     169693
spanish    118686
Name: language, dtype: int64

In [235]:
max([len(x) for x in table['text']])

527

**Shufflling the dataset rows**

In [236]:
table=table.sample(frac=1)

In [237]:
table.head()

Unnamed: 0,text,language
124154,leg das nicht dorthin es könnte jemand darüber...,german
137950,man sollte nicht über seine freunde hinter der...,german
650514,tom and i sat on the beach watching the seagulls,english
281206,elle fait suivre à son chien un régime sans vi...,french
222523,je nage beaucoup,french


In [238]:
table=table.head(100000)

In [239]:
table.head()

Unnamed: 0,text,language
124154,leg das nicht dorthin es könnte jemand darüber...,german
137950,man sollte nicht über seine freunde hinter der...,german
650514,tom and i sat on the beach watching the seagulls,english
281206,elle fait suivre à son chien un régime sans vi...,french
222523,je nage beaucoup,french


In [240]:
table['language'].value_counts()

english    35966
german     26348
french     22222
spanish    15464
Name: language, dtype: int64

In [241]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [242]:
tokenizer=Tokenizer()

**Fitting text in tokenizer**

In [243]:
tokenizer.fit_on_texts(table['text'])

In [244]:
len(tokenizer.word_index)

42643

**Splitting language columns in individual languages columns**

In [245]:
y=pd.get_dummies(table['language'])

**Now transforming all the text into sequences**

In [246]:
input_sequences=[]
for text in table['text']:
  sequences=tokenizer.texts_to_sequences([text])[0]
  input_sequences.append(sequences)

In [247]:
len(input_sequences)

100000

In [248]:
input_sequences

[[2702, 25, 16, 1685, 14, 682, 829, 959, 21146],
 [183, 835, 16, 316, 286, 1141, 1402, 14747, 4153, 949],
 [1, 63, 3, 1293, 52, 5, 1891, 1850, 5, 21147],
 [104, 174, 3939, 29, 133, 836, 26, 8411, 786, 4371],
 [10, 7420, 366],
 [9697, 53, 6706, 186, 21148, 27, 3273],
 [7, 714, 1426, 19, 56, 2622, 907],
 [53, 9698, 33, 4154, 33, 3744, 639],
 [69, 19, 311, 289, 4372, 3745, 31, 960, 93, 272, 31, 229],
 [1, 21149, 61, 2796, 63, 1333, 36],
 [1, 210, 28, 377, 4, 197, 198],
 [218, 21, 15, 21150],
 [30, 78, 740, 17, 6, 3006, 178, 1223, 6, 53, 9, 81, 174],
 [143, 33, 2, 8412],
 [1762, 6126, 11, 3007, 6, 21, 646],
 [2089, 2195, 6, 21151, 86, 6707, 21152],
 [1, 468, 9699, 63, 546, 4, 2090, 61, 1334],
 [1, 55, 89, 14748, 2, 34],
 [1, 186, 721, 55, 14749, 222, 29, 5688],
 [7, 80, 56, 481, 1403],
 [152, 2901, 6127, 1646, 445, 9700, 11, 2797],
 [1729, 57, 921, 4, 1562, 39, 359, 180, 8413],
 [749, 9701, 131, 14750],
 [46, 6128, 11602, 19, 181, 1169],
 [7, 213, 54, 1, 931, 255],
 [28, 2623, 4373, 66, 46

In [249]:
from tensorflow.keras.utils import pad_sequences

**Now padding the sequences**

In [250]:
padded=pad_sequences(input_sequences,maxlen=251,padding='pre')

In [251]:
padded

array([[    0,     0,     0, ...,   829,   959, 21146],
       [    0,     0,     0, ..., 14747,  4153,   949],
       [    0,     0,     0, ...,  1850,     5, 21147],
       ...,
       [    0,     0,     0, ...,   527,    78,   965],
       [    0,     0,     0, ...,    20,    25,  3586],
       [    0,     0,     0, ...,  5561,    29,  4794]], dtype=int32)

In [252]:
padded.shape

(100000, 251)

In [253]:
X=padded

In [254]:
y

Unnamed: 0,english,french,german,spanish
124154,0,0,1,0
137950,0,0,1,0
650514,1,0,0,0
281206,0,1,0,0
222523,0,1,0,0
...,...,...,...,...
139022,0,0,1,0
78691,0,0,1,0
266327,0,1,0,0
27480,0,0,1,0


In [255]:
from sklearn.model_selection import train_test_split

**Splitting data into train and test data**

In [256]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)

In [257]:
X_train.shape,X_test.shape

((75000, 251), (25000, 251))

In [258]:
from tensorflow.keras import models, layers

**Making a LSTM model**

In [259]:
model=models.Sequential()
model.add(layers.Embedding(43094,100,input_length=251))
model.add(layers.LSTM(150))
model.add(layers.Dense(4,activation='softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [260]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 251, 100)          4309400   
                                                                 
 lstm_6 (LSTM)               (None, 150)               150600    
                                                                 
 dense_6 (Dense)             (None, 4)                 604       
                                                                 
Total params: 4460604 (17.02 MB)
Trainable params: 4460604 (17.02 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


**Fitting the training data in our LSTM model**

In [261]:
model.fit(X_train,y_train,epochs=2,validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7a81f8c194b0>

**Evaluating testing data using our model**

In [262]:
loss,accuracy=model.evaluate(X_test,y_test)



**Accuracy of test data**

In [263]:
accuracy

0.9970399737358093

We got -
* Training data accuracy - 99.91%
* Validation data accuracy - 99.71%
* Testing data accuracy - 99.70%

**Now exporting the model and tokenizer**

In [288]:
import pickle

In [289]:
pickle.dump(model,open('language_model.pkl','wb'))

In [290]:
pickle.dump(tokenizer,open('lang_tokenizer.pkl','wb'))

**Now testing the model with random texts**

In [267]:
text="hey how are you"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[9.9999678e-01 2.5505651e-06 6.0629731e-07 5.7190430e-08]]


'english'

In [268]:
text="hey wie geht es dir"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.3201854e-06 3.3819281e-06 9.9999464e-01 7.5240257e-07]]


'german'

In [269]:
text="hola qué tal"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.9204606e-06 3.5829056e-05 2.8779301e-05 9.9993348e-01]]


'spanish'

In [270]:
text="hey comment allez-vous"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.7768296e-07 9.9999940e-01 1.6991339e-07 2.7625757e-07]]


'french'

In [271]:
text="ravi de vous rencontrer"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.3298119e-07 9.9998784e-01 5.1802072e-08 1.2004288e-05]]


'french'

In [272]:
text="nice to meet you"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[9.9999762e-01 1.9223814e-06 4.0654353e-07 4.3590120e-08]]


'english'

In [273]:
text="encantada de conocerte"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[4.1921023e-05 4.6747345e-02 1.5970314e-04 9.5305097e-01]]


'spanish'

In [274]:
text="schön dich kennenzulernen"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[2.6040114e-05 7.0389688e-05 9.9986601e-01 3.7585905e-05]]


'german'

In [275]:
text="hey how are you how do you do"
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[9.9999666e-01 2.3733533e-06 8.8930250e-07 8.0276180e-08]]


'english'

In [277]:
import string
def punc(text):
  for i in string.punctuation:
    text=text.replace(i,'')
  for i in string.digits:
    text=text.replace(i,'')
  return text

In [278]:
text="Hey, wie geht es dir? Wie geht es dir?"
text=text.lower()
text=punc(text)
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.4538543e-06 4.2120851e-06 9.9999368e-01 5.6194523e-07]]


'german'

In [280]:
text="Hola, ¿cómo estás? ¿Cómo estás?"
text=text.lower()
text=punc(text)
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[2.7777179e-07 8.9629111e-06 9.6152971e-06 9.9998105e-01]]


'spanish'

In [281]:
text="hé, comment vas-tu, comment vas-tu"
text=text.lower()
text=punc(text)
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[2.1192496e-07 9.9999940e-01 4.1639698e-08 3.7227309e-07]]


'french'

In [282]:
text="Can you tell me your name?"
text=text.lower()
text=punc(text)
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[9.9998820e-01 8.8556144e-06 2.5583290e-06 3.6654814e-07]]


'english'

In [283]:
text="Peux-tu me dire ton nom?"
text=text.lower()
text=punc(text)
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.1843028e-06 9.9999833e-01 4.0251912e-08 4.7682337e-07]]


'french'

In [284]:
text="With a comprehensive collection of 785 million records, this dataset provides an unparalleled wealth of translated text."
text=text.lower()
text=punc(text)
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[9.9989808e-01 5.5887038e-05 4.1897154e-05 4.0594778e-06]]


'english'

In [285]:
text="Avec une collection complète de 785 millions d’enregistrements, cet ensemble de données fournit une richesse inégalée de textes traduits."
text=text.lower()
text=punc(text)
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[7.9566933e-05 9.9896145e-01 4.9788578e-05 9.0925553e-04]]


'french'

In [286]:
text="Con una colección completa de 785 millones de registros, este conjunto de datos proporciona una riqueza incomparable de texto traducido."
text=text.lower()
text=punc(text)
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[7.4473263e-08 1.6221222e-05 5.5342667e-08 9.9998367e-01]]


'spanish'

In [287]:
text="Mit einer umfassenden Sammlung von 785 Millionen Datensätzen bietet dieser Datensatz eine beispiellose Fülle an übersetzten Texten."
text=text.lower()
text=punc(text)
sequence=tokenizer.texts_to_sequences([text])[0]
pad=pad_sequences([sequence],maxlen=251,padding='pre')
pred=model.predict(pad)
print(pred)
pred=np.argmax(pred)
y.columns[pred]

[[1.1456838e-05 6.3385296e-06 9.9998164e-01 6.2859914e-07]]


'german'