# Import libraries

In [None]:
import unicodedata

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import tensorflow as tf
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# word_tokenize needs the Punkt sentence models. Older NLTK releases load them
# from the 'punkt' package, newer ones (3.9+) from 'punkt_tab', so fetch both.
# A package unknown to the installed NLTK version is reported and skipped.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load dataset

In [None]:
# Read the labelled corpus from the CSV next to the notebook and preview
# the first ten rows.
csv_path = 'Language Detection.csv'
df = pd.read_csv(csv_path)
df.head(10)

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
5,"[2] In ancient philosophy, natura is mostly us...",English
6,"[3][4] \nThe concept of nature as a whole, the...",English
7,During the advent of modern scientific method ...,English
8,"[5][6] With the Industrial revolution, nature ...",English
9,"However, a vitalist vision of nature, closer t...",English


# Data cleaning


In [None]:
# Zero-width (non-)joiners are part of the spelling in several scripts.
_JOINERS = '\u200c\u200d'


def _is_word_token(token):
    """Return True when `token` is a word/number rather than punctuation.

    A token qualifies when it contains at least one alphanumeric character and
    every character is alphanumeric, a Unicode combining mark (category M*),
    or a zero-width joiner.

    `str.isalnum()` on its own is False for any string containing a combining
    mark, so it throws away every word that carries a vowel sign or diacritic
    (common in Hindi, Kannada, Malayalam, Tamil and vocalised Arabic).
    """
    has_alnum = False
    for ch in token:
        if ch.isalnum():
            has_alnum = True
        elif not (unicodedata.category(ch).startswith('M') or ch in _JOINERS):
            return False
    return has_alnum


df.drop_duplicates(subset=["Text"], inplace=True)

# tokenize words
df['Text'] = df['Text'].apply(word_tokenize)

# remove punctuation (keep only word-like tokens, see _is_word_token)
df['Text'] = df['Text'].apply(lambda text: ' '.join(word for word in text if _is_word_token(word)))

# A row whose text is empty after cleaning carries no signal for any language.
df.drop(index=df.index[df['Text'] == ''], inplace=True)

print(df['Text'])

0        Nature in the broadest sense is the natural ph...
1        Nature can refer to the phenomena of the physi...
2        The study of nature is a large if not the only...
3        Although humans are part of nature human activ...
4        1 The word nature is borrowed from the Old Fre...
                               ...                        
10332                                                    ಆ
10333                                                     
10334                                                 ಈಗ ಆ
10335                                               ಈಗ ess
10336                                                    ಆ
Name: Text, Length: 10267, dtype: object


# One Hot Encoding

In [None]:
# One indicator column per language, appended to the frame in place of the
# original 'Language' column.
bahasa = pd.get_dummies(df['Language'])
df_baru = pd.concat([df.drop(columns=['Language']), bahasa], axis=1)
print(df_baru)

                                                    Text  Arabic  Danish  \
0      Nature in the broadest sense is the natural ph...       0       0   
1      Nature can refer to the phenomena of the physi...       0       0   
2      The study of nature is a large if not the only...       0       0   
3      Although humans are part of nature human activ...       0       0   
4      1 The word nature is borrowed from the Old Fre...       0       0   
...                                                  ...     ...     ...   
10332                                                  ಆ       0       0   
10333                                                          0       0   
10334                                               ಈಗ ಆ       0       0   
10335                                             ಈಗ ess       0       0   
10336                                                  ಆ       0       0   

       Dutch  English  French  German  Greek  Hindi  Italian  Kannada  \
0          0  

# Split dataset
The dataset now has a `Text` column plus 17 one-hot columns, one per language name.

In [None]:
# Target columns, in the order the model's 17 outputs will follow.
# (Spellings match the column names produced from the dataset.)
kolom_bahasa = [
    'Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
    'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
    'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish',
]

# Features are the cleaned sentences; labels are the one-hot rows.
kalimat = df_baru['Text'].values
label = df_baru[kolom_bahasa].values

# Split data into train and test

- `random_state` is fixed so that the split is reproducible and you get the same result as I do.

In [None]:
# Hold out 20% of the samples; the fixed seed makes the split repeatable.
hasil_split = train_test_split(kalimat, label, random_state=42, test_size=0.2)
kalimat_latih, kalimat_test, label_latih, label_test = hasil_split

# Tokenization

In [None]:
vocab_size = 150000
max_len = 200  # every sequence is padded (at the end) or truncated to this many token ids

# Use a dedicated out-of-vocabulary marker: a plain 'x' can also occur as a
# real token in the corpus, which would make unknown words indistinguishable
# from it.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
# Fit on the training sentences only so the test set stays unseen.
tokenizer.fit_on_texts(kalimat_latih)

sekuens_latih = tokenizer.texts_to_sequences(kalimat_latih)
sekuens_test = tokenizer.texts_to_sequences(kalimat_test)

padded_latih = pad_sequences(sekuens_latih, maxlen=max_len, padding='post')
padded_test = pad_sequences(sekuens_test, maxlen=max_len, padding='post')

# Preview a few samples only; sekuens_latih is a plain Python list, so
# formatting it whole would write every training sequence into the output.
print(f"Kalimat latih: {kalimat_latih[:3]} \n")
print(f"Sekuens latih: {sekuens_latih[:3]} \n")
print(f"Padded latih: {padded_latih[:3]} \n")

Kalimat latih: ['102 يمكن للمحررين في إحدى موسوعات ويكيبيديا أن يترجموا المقالات من لغة إلى لغة وهذا يلقى في مجتمع ويكيبيديا انظر ويكيبيديا ترجمة مقالات للعربية'
 ''
 '23 24 25 The Earth atmosphere is a key factor in sustaining the ecosystem'
 ... 'cubrirme'
 'Due to its generality the field is studied in many other disciplines such as game theory control theory operations research information theory optimization systems swarm intelligence statistics and genetic algorithms'
 'Wikipedia è stata elogiata poiché come wiki permette alle voci di essere aggiornate o create in risposta ad avvenimenti di attualità'] 

Sekuens latih: [[4104, 777, 4105, 49, 12410, 7649, 146, 111, 12411, 872, 67, 3341, 211, 3341, 2363, 12412, 49, 5380, 146, 12413, 146, 12414, 1495, 12415], [], [625, 511, 677, 4, 420, 1635, 21, 3, 5381, 1841, 8, 12416, 4, 2774], [829, 237, 4106, 42, 72, 2364, 2071, 933, 72, 12, 280, 122, 58, 7650, 3342, 626, 678, 12417, 3, 12418], [45, 61, 563, 7651, 357, 12419], [169, 2365, 12420

# Callback function

In [None]:
class SantaiDuluGakSih(tf.keras.callbacks.Callback):
  """Stop training when accuracy stays under a threshold for too many epochs.

  An epoch counts as "bad" when the training accuracy or the validation
  accuracy reported in `logs` is below `ambang`. After `sabar` consecutive
  bad epochs the callback asks the model to stop. Any good epoch resets the
  streak.

  Args:
    sabar: number of consecutive bad epochs tolerated before stopping.
    ambang: accuracy threshold (fraction in [0, 1]) below which an epoch is bad.
  """

  def __init__(self, sabar=5, ambang=0.75):
    super().__init__()
    self.sabar = sabar
    self.ambang = ambang
    self.gak_sabar = 0  # current streak of consecutive bad epochs

  def on_train_begin(self, logs=None):
    # Reset the streak so a reused instance does not carry state from an
    # earlier fit() call into the next one.
    self.gak_sabar = 0

  def on_epoch_end(self, epoch, logs=None):
    logs = logs or {}
    # Only look at metrics that were actually reported; comparing a missing
    # metric (None) with a float would raise TypeError, e.g. when fit() runs
    # without validation data.
    skor = [logs[nama] for nama in ('accuracy', 'val_accuracy') if logs.get(nama) is not None]
    if not skor:
      return

    if any(nilai < self.ambang for nilai in skor):
      self.gak_sabar += 1
    else:
      self.gak_sabar = 0

    if self.gak_sabar >= self.sabar:
      print(f"The model accuracy has been below {self.ambang:.0%} for {self.gak_sabar} epochs, Stopping training immediatly!!!")
      self.model.stop_training = True

stop_early = SantaiDuluGakSih(sabar=5)

# Model creation

In [None]:
# Embedding -> LSTM -> dense stack -> softmax over the 17 language classes.
model = tf.keras.models.Sequential([
    # mask_zero=True marks id 0 (the value used for padding) as "no token" so
    # the LSTM skips padded timesteps. The inputs are padded at the end, so
    # without the mask the LSTM's final state is produced after a long run of
    # padding steps rather than right after the last real token.
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=200, input_length=200, mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(17, activation="softmax")
])

In [None]:
# Categorical cross-entropy matches the one-hot labels and softmax output;
# accuracy is tracked because the early-stop callback reads it.
pengoptimal = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=pengoptimal, metrics=['accuracy'])

In [None]:
# Train for up to 40 epochs, evaluating on the held-out split after each one;
# the callback may end the run earlier. verbose=2 prints one line per epoch.
model.fit(
    x=padded_latih,
    y=label_latih,
    validation_data=(padded_test, label_test),
    batch_size=64,
    epochs=40,
    verbose=2,
    callbacks=[stop_early],
)

Epoch 1/40
129/129 - 145s - loss: 2.7297 - accuracy: 0.1332 - val_loss: 2.7040 - val_accuracy: 0.1412 - 145s/epoch - 1s/step
Epoch 2/40
129/129 - 138s - loss: 2.7289 - accuracy: 0.1333 - val_loss: 2.6996 - val_accuracy: 0.1412 - 138s/epoch - 1s/step
Epoch 3/40
129/129 - 136s - loss: 2.7285 - accuracy: 0.1333 - val_loss: 2.7026 - val_accuracy: 0.1412 - 136s/epoch - 1s/step
Epoch 4/40


KeyboardInterrupt: ignored