IMPORTING DATSETS AND MODULES

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import keras
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
data_lang = pd.read_csv('lang.csv')
data_sentiment = pd.read_csv('train.txt', sep=';')
data_sentiment.columns = ['Text', 'Sentiment']

DATA ARRAGEMENT AND STRUCTURE

In [2]:
data_lang['language'] = data_lang['language'].str.lower()
data_lang.head()

Unnamed: 0.1,Unnamed: 0,Text,language
0,0,klement gottwaldi surnukeha palsameeriti ning ...,estonian
1,1,sebes joseph pereira thomas på eng the jesuit...,swedish
2,2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,thai
3,3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,tamil
4,4,de spons behoort tot het geslacht haliclona en...,dutch


In [3]:
data_sentiment.columns = ['Text' , 'Sentiment']
data_sentiment.head()

Unnamed: 0,Text,Sentiment
0,i can go from feeling so hopeless to so damned...,sadness
1,im grabbing a minute to post i feel greedy wrong,anger
2,i am ever feeling nostalgic about the fireplac...,love
3,i am feeling grouchy,anger
4,ive been feeling a little burdened lately wasn...,sadness


NULL VALUE COUNTING

In [4]:
data_lang.isnull().sum()

Unnamed: 0    0
Text          0
language      0
dtype: int64

In [5]:
data_sentiment.isnull().sum()

Text         0
Sentiment    0
dtype: int64

VALUE COUNTS

In [6]:
data_lang['language'].value_counts()

language
estonian      1000
swedish       1000
english       1000
russian       1000
romanian      1000
persian       1000
pushto        1000
spanish       1000
hindi         1000
korean        1000
chinese       1000
french        1000
portugese     1000
indonesian    1000
urdu          1000
latin         1000
turkish       1000
japanese      1000
dutch         1000
tamil         1000
thai          1000
arabic        1000
Name: count, dtype: int64

In [7]:
data_sentiment['Sentiment'].value_counts()

Sentiment
joy         5362
sadness     4665
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

DATA DISTRIBUTION

In [8]:
y_lang = data_lang['language']
x_lang = data_lang['Text']
texts = data_sentiment["Text"].tolist()
labels = data_sentiment["Sentiment"].tolist()

TOKENIZATION AND FEATURE EXTRACTION

In [9]:
cv = CountVectorizer()
X = cv.fit_transform(x_lang)

In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length)
print(max_length)

66


CATEGORICAL ENCODING

In [11]:
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

one_hot_labels = keras.utils.to_categorical(labels)

DATASET SPLITTING

In [12]:
x_train , x_test , y_train , y_test = train_test_split(X , y_lang , train_size = 0.75 , random_state = 42)


In [13]:
xtrain, xtest, ytrain, ytest = train_test_split(padded_sequences,
                                                one_hot_labels,
                                                test_size=0.2)


MODEL BUILDING 

In [14]:
model_lang = MultinomialNB()



In [19]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dropout

model_sequence = Sequential()

model_sequence.add(Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=128,
    input_length=max_length
))

model_sequence.add(Conv1D(filters=128, kernel_size=5, activation="relu"))
model_sequence.add(GlobalMaxPooling1D())

model_sequence.add(Dense(128, activation="relu"))
model_sequence.add(Dropout(0.5))

model_sequence.add(Dense(len(one_hot_labels[0]), activation="softmax"))

model_sequence.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)




MODEL TRAINING AND TESTING

In [20]:
model_sequence.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model_sequence.fit(xtrain, ytrain, epochs=10, batch_size=32, validation_data=(xtest, ytest))

Epoch 1/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 46ms/step - accuracy: 0.3972 - loss: 1.4724 - val_accuracy: 0.8916 - val_loss: 0.3371
Epoch 2/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 44ms/step - accuracy: 0.9151 - loss: 0.2565 - val_accuracy: 0.9237 - val_loss: 0.1908
Epoch 3/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 46ms/step - accuracy: 0.9577 - loss: 0.1216 - val_accuracy: 0.9169 - val_loss: 0.2141
Epoch 4/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 55ms/step - accuracy: 0.9772 - loss: 0.0703 - val_accuracy: 0.9219 - val_loss: 0.2351
Epoch 5/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 57ms/step - accuracy: 0.9866 - loss: 0.0409 - val_accuracy: 0.9162 - val_loss: 0.2636
Epoch 6/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 48ms/step - accuracy: 0.9902 - loss: 0.0327 - val_accuracy: 0.9144 - val_loss: 0.3021
Epoch 7/10
[1m4

<keras.src.callbacks.history.History at 0x1ff8f7472f0>

In [21]:
model_lang.fit(x_train , y_train)

In [22]:

loss, accuracy = model_sequence.evaluate(xtest, ytest)
print(f"Test Accuracy: {accuracy}")


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9136 - loss: 0.4176
Test Accuracy: 0.917187511920929


In [27]:
import numpy as np
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score
)

y_pred_probs = model_sequence.predict(xtest)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(ytest, axis=1)


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [28]:
print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.95      0.93       456
           1       0.87      0.88      0.88       379
           2       0.91      0.97      0.94      1065
           3       0.92      0.73      0.81       270
           4       0.96      0.94      0.95       911
           5       0.79      0.75      0.77       119

    accuracy                           0.92      3200
   macro avg       0.89      0.87      0.88      3200
weighted avg       0.92      0.92      0.92      3200



In [29]:
print(confusion_matrix(y_true, y_pred))


[[ 433    5    8    0    9    1]
 [   8  334    8    1   13   15]
 [   6    2 1030   13    8    6]
 [   1    1   67  197    2    2]
 [  25   16   15    3  852    0]
 [   3   24    3    0    0   89]]


In [26]:
model_sequence.save('Sequence Model.h5')



In [23]:
model_lang.score(x_test,y_test)

0.9527272727272728

CLASSIFYING USER INPUT

In [24]:
import numpy as np
from deep_translator import GoogleTranslator
from tensorflow.keras.preprocessing.sequence import pad_sequences

def translate_to_english(text, target_language):
    try:
        translated_text = GoogleTranslator(source=target_language, target='en').translate(text)
    except Exception as e:
        print("Translation failed:", e)
        translated_text = text
    return translated_text

user_input = input("Enter a Text:")
data = cv.transform([user_input]).toarray()
pred_lang = model_lang.predict(data)
print("Detected Language:", pred_lang)

if user_input:
    detected_lang = pred_lang[0]
    
    if detected_lang is not None:  
        if detected_lang != 'en':
            modified_input = translate_to_english(user_input, detected_lang)
            print("Modified English Text:", modified_input)
        else:
            modified_input = user_input
    else:
        print("Language detection failed. Assuming English.")
        modified_input = user_input
    data = cv.transform([modified_input]).toarray()
    input_sequence = tokenizer.texts_to_sequences([modified_input])
    padded_input_sequence = pad_sequences(input_sequence, maxlen=max_length)
    prediction = model_sequence.predict(padded_input_sequence)
    predicted_label = label_encoder.inverse_transform([np.argmax(prediction[0])])
    print("Predicted Sentiment:", predicted_label)


Enter a Text: he is a good boy


Detected Language: ['english']
Modified English Text: he is a good boy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 870ms/step
Predicted Sentiment: ['joy']


In [44]:
!pip install deep_translator

Collecting deep_translator
  Using cached deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Using cached deep_translator-1.11.4-py3-none-any.whl (42 kB)
Installing collected packages: deep_translator
Successfully installed deep_translator-1.11.4


In [46]:
joblib.dump(tokenizer , 'tokenizer.pkl')
joblib.dump(label_encoder,'label_encoder.pkl')
joblib.dump(cv , 'vectorizer.pkl')

['vectorizer.pkl']

In [47]:
import joblib
joblib.dump(model_sequence , 'Sequence_model')
joblib.dump(model_lang , 'Language_model')

['Language_model']