In [None]:
# Attach Google Drive to the Colab runtime so files on it can be opened by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /drive


In [None]:
import pandas as pd 
import numpy as np

# Labelled sentences used throughout the notebook; later cells read its
# 'Text' and 'Language' columns.
TRAIN_CSV_PATH = '/drive/My Drive/Job Tasks/kemet/Language_det_train.csv'
data = pd.read_csv(TRAIN_CSV_PATH)


# Exploring the data balance

In [None]:
# Count the rows available for each language to see how balanced the classes are.
rows_per_language = data.groupby("Language").count()
rows_per_language

Unnamed: 0_level_0,Text
Language,Unnamed: 1_level_1
Arabic,509
Danish,407
Dutch,519
English,1316
French,963
German,446
Greek,347
Hindi,60
Italian,663
Kannada,351


# Build Model

In [None]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import pickle

In [None]:
# Pull the texts and their language tags out of the frame as two parallel lists.
sentences, languages = (data[column].to_list() for column in ('Text', 'Language'))

# Build the vocabulary from every sentence, then encode each sentence as the
# list of integer ids of its words.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)


In [None]:
# Saving tokenizer
# Write the fitted tokenizer to the working directory (as before) AND next to
# the dataset on Drive. The prediction cell loads the tokenizer from the Drive
# path; without this copy it would either fail or pick up a tokenizer from an
# older fit, whose word ids need not match the ones the model was trained on.
TOKENIZER_SAVE_PATHS = (
    'tokenizer.pickle',
    '/drive/My Drive/Job Tasks/kemet/tokenizer.pickle',
)
for tokenizer_path in TOKENIZER_SAVE_PATHS:
    with open(tokenizer_path, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:


# Prepare the data
sentences = data['Text'].to_list()  # List of sentences
languages = data['Language'].to_list()  # List of corresponding languages

# Tokenize the sentences (vocabulary is learned from the full corpus)
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Pad the sequences. No maxlen is given, so every row is padded (with the
# pad_sequences defaults) up to the length of the longest sentence.
sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences)

# Fixed language -> class-id mapping, built once instead of once per row.
# Keys must match the 'Language' column exactly, so the spellings below are
# presumably the dataset's own ('Sweedish', 'Portugeese') -- do not "correct"
# them without checking the CSV.
label_index = {
    'Greek': 0,
    'Italian': 1,
    'Turkish': 2,
    'Sweedish': 3,
    'Arabic': 4,
    'Portugeese': 5,
    'English': 6,
    'Tamil': 7,
    'German': 8,
    'Malayalam': 9,
    'Russian': 10,
    'Dutch': 11,
    'French': 12,
    'Kannada': 13,
    'Danish': 14,
    'Hindi': 15,
    'Spanish': 16
}

# The number of classes comes from the mapping, not from the labels that happen
# to be present: class ids index columns 0..len(label_index)-1, so sizing the
# one-hot matrix and the output layer from len(set(languages)) would overflow
# (or misalign with the mapping) whenever the CSV lacks some of these languages.
num_classes = len(label_index)

# Fail early, with the offending values, if the CSV holds a label we cannot map.
unknown_languages = set(languages) - set(label_index)
if unknown_languages:
    raise ValueError(
        "Unmapped values in the 'Language' column: "
        + ", ".join(sorted(map(str, unknown_languages)))
    )

# Convert language labels to one-hot vectors (row i has a 1 in its class column)
label_ids = np.array([label_index[lang] for lang in languages], dtype=int)
labels = np.zeros((len(languages), num_classes))
labels[np.arange(len(languages)), label_ids] = 1

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)

# Define the model architecture
model = Sequential()
# +1 because tokenizer word ids start at 1; id 0 is the padding value.
model.add(Embedding(len(tokenizer.word_index) + 1, 256))
model.add(LSTM(256))
model.add(Dense(num_classes, activation='softmax'))

# Compile and train the model. Note the held-out test split is also used as the
# validation set during fitting.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=64, epochs=5, validation_data=(X_test, y_test))

# Save the model
model.save('language_classification_model.h5')


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Persist the trained model to the Drive folder the prediction cell loads it
# from. The training cell has already written the copy in the working
# directory, so re-saving there would be redundant.
model.save('/drive/My Drive/Job Tasks/kemet/language_classification_model.h5')


# Make predictions 

In [None]:
# Restore the trained network and its tokenizer from Drive.
MODEL_PATH = '/drive/My Drive/Job Tasks/kemet/language_classification_model.h5'
TOKENIZER_PATH = '/drive/My Drive/Job Tasks/kemet/tokenizer.pickle'

model = tf.keras.models.load_model(MODEL_PATH)
# NOTE: unpickling can execute arbitrary code -- only load a file you wrote yourself.
with open(TOKENIZER_PATH, 'rb') as handle:
    tokenizer = pickle.load(handle)
# Display names for the model's output classes, keyed by class id.
# Built once at definition time rather than on every call.
_INDEX_TO_LANGUAGE = {
    0: 'Greek',
    1: 'Italian',
    2: 'Turkish',
    3: 'Swedish',
    4: 'Arabic',
    5: 'Portuguese',
    6: 'English',
    7: 'Tamil',
    8: 'German',
    9: 'Malayalam',
    10: 'Russian',
    11: 'Dutch',
    12: 'French',
    13: 'Kannada',
    14: 'Danish',
    15: 'Hindi',
    16: 'Spanish'
}


# Classify a sentence
def classify(sentence):
    """Return the predicted language name for a single sentence.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        sentence: The text to classify.

    Returns:
        The language name whose class received the highest predicted score.

    Raises:
        ValueError: If the tokenizer produces no ids for the sentence (empty
            text, or no word in its vocabulary), since the model would
            otherwise be handed a zero-length sequence.
    """
    # Preprocess the input sentence
    sequence = tokenizer.texts_to_sequences([sentence])
    if not sequence[0]:
        raise ValueError(
            "Cannot classify: the sentence contains no words known to the tokenizer."
        )
    # NOTE(review): with a single sentence and no maxlen this adds no padding,
    # whereas the training data was padded to the longest sentence -- confirm
    # the model is insensitive to that difference.
    sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence)

    # Perform the classification: pick the class with the highest score
    prediction = model.predict(sequence)
    predicted_label = _INDEX_TO_LANGUAGE[int(np.argmax(prediction))]

    return predicted_label

# Demo: classify one short sentence and show the input next to the prediction.
sentence = "اليوم مشرق"
predicted_language = classify(sentence)
for caption, shown in (("Sentence:", sentence), ("Predicted Language:", predicted_language)):
    print(caption, shown)

Sentence: اليوم مشرق
Predicted Language: Arabic


# Model Evaluation

I used accuracy out of habit, but since the data is imbalanced we can use weighted accuracy instead, which takes the class imbalance into account and gives a more accurate picture of the model's behaviour.
For details, see [this article](https://medium.com/cuenex/advanced-evaluation-metrics-for-imbalanced-classification-models-ee6f248c90ca#:~:text=The%20first%20important%20metric%20for,class%20weights%20receive%20higher%20weightage.) and [the TensorFlow imbalanced-data tutorial](https://www.tensorflow.org/tutorials/structured_data/imbalanced_data).


In [None]:
# Score on the held-out split; the displayed list is [loss, accuracy].
# `model` is whatever was bound last -- the copy reloaded from Drive if the
# prediction cell has been run.
model.evaluate(x=X_test, y=y_test)



[1.3832392692565918, 0.5845214128494263]