In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

In [2]:
import tensorflow as tf
from tensorflow import keras


In [3]:
from nltk.corpus import stopwords

In [4]:
df = pd.read_csv("dataset.csv")

In [5]:
df.shape

(22000, 2)

In [6]:
df.isnull().sum()

Text        0
language    0
dtype: int64

In [7]:
df.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [8]:
all_languages=list(df["language"].value_counts().index)

In [9]:
stopwords.words("dutch")

['de',
 'en',
 'van',
 'ik',
 'te',
 'dat',
 'die',
 'in',
 'een',
 'hij',
 'het',
 'niet',
 'zijn',
 'is',
 'was',
 'op',
 'aan',
 'met',
 'als',
 'voor',
 'had',
 'er',
 'maar',
 'om',
 'hem',
 'dan',
 'zou',
 'of',
 'wat',
 'mijn',
 'men',
 'dit',
 'zo',
 'door',
 'over',
 'ze',
 'zich',
 'bij',
 'ook',
 'tot',
 'je',
 'mij',
 'uit',
 'der',
 'daar',
 'haar',
 'naar',
 'heb',
 'hoe',
 'heeft',
 'hebben',
 'deze',
 'u',
 'want',
 'nog',
 'zal',
 'me',
 'zij',
 'nu',
 'ge',
 'geen',
 'omdat',
 'iets',
 'worden',
 'toch',
 'al',
 'waren',
 'veel',
 'meer',
 'doen',
 'toen',
 'moet',
 'ben',
 'zonder',
 'kan',
 'hun',
 'dus',
 'alles',
 'onder',
 'ja',
 'eens',
 'hier',
 'wie',
 'werd',
 'altijd',
 'doch',
 'wordt',
 'wezen',
 'kunnen',
 'ons',
 'zelf',
 'tegen',
 'na',
 'reeds',
 'wil',
 'kon',
 'niets',
 'uw',
 'iemand',
 'geweest',
 'andere']

In [10]:
stop_words=[]



for i in all_languages:
    try:
        stop_words+=stopwords.words(i.lower())
    except:
        pass
        

In [11]:
len(stop_words)

3777

In [12]:
import string

def conversion(text):
    text=text.lower()
    text=text.strip().split(" ")
    text=[i for i in text if i not in stop_words and i not in string.punctuation]
    
    
    return " ".join(text)

In [13]:
df["Text"]=df["Text"].apply(conversion)

In [14]:
df["Text"]

0        klement gottwaldi surnukeha palsameeriti ning ...
1        sebes joseph pereira thomas eng jesuits sino-r...
2        ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...
3        விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...
4        spons behoort geslacht haliclona behoort famil...
                               ...                        
21995    hors terrain années années crise championnat t...
21996    ใน พศ หลักจากที่เสด็จประพาสแหลมมลายู ชวา อินเด...
21997    motivo celebración septuagésimoquinto ° aniver...
21998    年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby like》，由美國...
21999    aprilie sonda spațială messenger nasa și-a înc...
Name: Text, Length: 22000, dtype: object

In [15]:
x=df["Text"].values
y=df["language"].values

In [16]:
y

array(['Estonian', 'Swedish', 'Thai', ..., 'Spanish', 'Chinese',
       'Romanian'], dtype=object)

In [17]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train , y_test=train_test_split(x, y, test_size=0.2, random_state=0)

TOKENIZER

In [18]:
from keras.preprocessing.text import Tokenizer
tokenizer=Tokenizer(oov_token="nothing")
tokenizer.fit_on_texts(x_train)
# tokenizer.word_index

In [19]:
l=len(tokenizer.word_index)+1

In [20]:
sequences=tokenizer.texts_to_sequences(x_train)
from keras.utils import pad_sequences
sequences=pad_sequences(sequences, padding="post", maxlen=12)

In [21]:
sequences

array([[ 79251,      0,      0, ...,      0,      0,      0],
       [ 79252,   1274,  25266, ...,  79255,  79256,  79257],
       [    79,  79262,  79263, ...,   3894,   3894,  79265],
       ...,
       [232760,   1897,   6132, ...,  76577,  76578,  19496],
       [  1706,    828,  42241, ...,  10366,   1543, 232769],
       [    31,    570,     27, ...,     31,    103,     36]])

MODEL BUILDING

In [22]:
model= keras.models.Sequential([
    keras.layers.Embedding(l,10, input_length=12),
    keras.layers.LSTM(1000,input_shape=(12,l,), return_sequences=False),
    keras.layers.Dense(250 ,activation="relu"),
    keras.layers.Dense(22, activation="softmax")
])

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 12, 10)            2327700   
                                                                 
 lstm (LSTM)                 (None, 1000)              4044000   
                                                                 
 dense (Dense)               (None, 250)               250250    
                                                                 
 dense_1 (Dense)             (None, 22)                5522      
                                                                 
Total params: 6627472 (25.28 MB)
Trainable params: 6627472 (25.28 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [24]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [25]:
model.fit(x_train , y_train, epochs=10)

Epoch 1/10


ValueError: in user code:

    File "C:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1332, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1316, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1297, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1072, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\DELL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\input_spec.py", line 235, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential' (type Sequential).
    
    Input 0 of layer "lstm" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (32, 10)
    
    Call arguments received by layer 'sequential' (type Sequential):
      • inputs=tf.Tensor(shape=(32,), dtype=string)
      • training=True
      • mask=None
