In [1]:
#import libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import ISRIStemmer
from sklearn import preprocessing 
from tensorflow import keras
from tensorflow.keras.models import Sequential
#import libraries
from tensorflow.keras.layers import InputLayer,Dense, Bidirectional, LSTM, Dropout, Activation, Embedding, Conv1D, GlobalMaxPooling1D,SimpleRNN
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 


#Functions To Preprocess Dataset
# Patterns are compiled once here instead of on every call to clean_reviews.
_SPECIAL_CHARS_RE = re.compile(r'[^\w\s\u0600-\u06FF]+', re.UNICODE)
_DIGITS_RE = re.compile(r'\d+')
_PUNCTUATION_RE = re.compile(r'[^\w\s_]')
_NON_ARABIC_RE = re.compile(r'[^\u0600-\u06FF\s]+', re.UNICODE)
_REPEATED_CHAR_RE = re.compile(r'(.)\1+')

# Lazily filled on first real use so that defining this cell does not require
# the NLTK corpora to be available yet.
_ARABIC_NLP_CACHE = {}


def _arabic_nlp_resources():
    """Return ``(stop_words, stemmer)``, building both only on the first call."""
    if not _ARABIC_NLP_CACHE:
        _ARABIC_NLP_CACHE['stop_words'] = frozenset(stopwords.words('arabic'))
        _ARABIC_NLP_CACHE['stemmer'] = ISRIStemmer()
    return _ARABIC_NLP_CACHE['stop_words'], _ARABIC_NLP_CACHE['stemmer']


def clean_reviews(text):
    """Normalise one review into a space-separated string of Arabic stems.

    Steps, in order: strip special characters, digits, punctuation and every
    character outside the Arabic block (U+0600-U+06FF) except whitespace;
    collapse runs of a repeated character to a single occurrence; drop Arabic
    stop words; stem the remaining tokens with the ISRI stemmer.

    Parameters
    ----------
    text : str or None or float
        Raw review. ``None`` and NaN (what pandas yields for empty cells) are
        treated as an empty review; any other non-string value is converted
        with ``str()`` before cleaning.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces, or ``''`` when nothing
        survives the cleaning.
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    text = _SPECIAL_CHARS_RE.sub('', text)      # remove special characters
    text = _DIGITS_RE.sub('', text)             # remove numbers
    text = _PUNCTUATION_RE.sub('', text)        # remove punctuation
    text = _NON_ARABIC_RE.sub('', text)         # remove non-Arabic characters
    text = _REPEATED_CHAR_RE.sub(r'\1', text)   # collapse repeated characters
    text = text.replace("_", "")                # remove underscores

    # Nothing left to tokenize: skip the NLTK work entirely.
    if not text.strip():
        return ''

    stop_words, stemmer = _arabic_nlp_resources()
    # Only Arabic letters and whitespace remain at this point, so one
    # tokenization pass is enough for both stop-word removal and stemming.
    kept_words = [word for word in word_tokenize(text) if word.lower() not in stop_words]
    return ' '.join(stemmer.stem(word) for word in kept_words)
     

# Read Dataset

In [2]:
# Load the labelled training reviews from the competition workbook
train_dataset = pd.read_excel('/kaggle/input/nn-competition-files/train.xlsx')

# Keep the rating column and run every review through clean_reviews
ratings = train_dataset['rating']
reviews = train_dataset['review_description'].apply(clean_reviews)

# Vocabulary cap handed to the tokenizer as num_words
max_fatures = 100
tokenizer = Tokenizer(split=' ', num_words=max_fatures)
tokenizer.fit_on_texts(reviews)

# Encode each cleaned review as word indices, then pad so all rows share one length
pad_train = pad_sequences(tokenizer.texts_to_sequences(reviews))

# Shift the ratings by one so the labels become 0, 1, 2 (-1 -> 0, 0 -> 1, 1 -> 2)
train_rating = 1 + ratings

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the padded reviews for evaluation, stratified on the shifted
# rating so both parts keep the same class proportions. random_state is fixed
# so the split (and the accuracies reported below) are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    pad_train,
    train_rating,
    test_size=0.3,
    stratify=train_rating,
    random_state=42,
)

In [5]:
# Load the unlabelled competition test set
test_dataset = pd.read_csv('/kaggle/input/nn-competition-files/test _no_label.csv')

# Apply the same cleaning that was used on the training reviews
cleaned_reviews = test_dataset['review_description'].apply(clean_reviews)

# Encode with the tokenizer fitted on the training reviews and pad to the
# training row width so the models accept the result
pad_test = pad_sequences(
    tokenizer.texts_to_sequences(cleaned_reviews),
    maxlen=len(pad_train[0]),
)

# CNN Model, Accuracy = 0.75892

In [23]:
# CNN model
max_features = 100  # embedding vocabulary size; same value as the tokenizer's num_words
embed_dim = 4
model_cnn = Sequential()
model_cnn.add(Embedding(max_features, embed_dim, input_length=len(pad_train[0]), trainable=True))
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(3, activation='softmax'))  # one output per shifted rating class (0, 1, 2)

# compile the model
opt = keras.optimizers.Adam(learning_rate=0.01)
model_cnn.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model_cnn.summary()

# train the model
model_cnn.fit(X_train, y_train, epochs=10, batch_size=32)

# evaluate the model on the held-out split
loss, accuracy = model_cnn.evaluate(X_test, y_test)
print('Model CNN loss = ', loss)
print('Model CNN accuracy = ', accuracy)

# predict ratings using CNN
predicted_ratings_cnn = model_cnn.predict(pad_test)
y_new_pred_cnn = np.argmax(predicted_ratings_cnn, axis=1)
y_new_pred_cnn = y_new_pred_cnn - 1  # undo the +1 shift: back to -1, 0, 1

# build the submission frame: 1-based IDs paired with the predicted ratings.
# The ID range follows the number of predictions instead of assuming 1000 rows,
# which would raise if the test file had a different length.
submission_csv_cnn = pd.DataFrame({
    'ID': range(1, len(y_new_pred_cnn) + 1),
    'Predicted_Ratings': y_new_pred_cnn,
})
# save the data frame to a CSV file
submission_csv_cnn.to_csv('cnn.csv', index=False)  # Update with the desired filename and path


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 92, 4)             400       
                                                                 
 conv1d (Conv1D)             (None, 88, 128)           2688      
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense_27 (Dense)            (None, 3)                 387       
                                                                 
Total params: 3475 (13.57 KB)
Trainable params: 3475 (13.57 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


# RNN Model, Accuracy = 0.75

In [25]:
# RNN model
max_features = 100  # defined here so this cell does not depend on the CNN cell having run
embed_dim = 4
model_rnn = Sequential()
model_rnn.add(Embedding(max_features, embed_dim, input_length=len(pad_train[0]), trainable=True))
model_rnn.add(SimpleRNN(10, trainable=True))
model_rnn.add(Dense(3, activation='softmax'))  # one output per shifted rating class (0, 1, 2)

# compile the model
opt = keras.optimizers.Adam(learning_rate=0.01)
model_rnn.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model_rnn.summary()

# train the model
model_rnn.fit(X_train, y_train, epochs=10, batch_size=32)

# evaluate the model on the held-out split
loss, accuracy = model_rnn.evaluate(X_test, y_test)
print('Model RNN loss = ', loss)
print('Model RNN accuracy = ', accuracy)

# predict ratings using RNN
predicted_ratings_rnn = model_rnn.predict(pad_test)

y_new_pred_rnn = np.argmax(predicted_ratings_rnn, axis=1)
y_new_pred_rnn = y_new_pred_rnn - 1  # undo the +1 shift: back to -1, 0, 1

# build the submission frame: 1-based IDs paired with the predicted ratings.
# The ID range follows the number of predictions instead of assuming 1000 rows,
# which would raise if the test file had a different length.
submission_csv_rnn = pd.DataFrame({
    'ID': range(1, len(y_new_pred_rnn) + 1),
    'Predicted_Ratings': y_new_pred_rnn,
})

# save the data frame to a CSV file
submission_csv_rnn.to_csv('rnn.csv', index=False)  # Update with the desired filename and path


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, 92, 4)             400       
                                                                 
 simple_rnn (SimpleRNN)      (None, 10)                150       
                                                                 
 dense_30 (Dense)            (None, 3)                 33        
                                                                 
Total params: 583 (2.28 KB)
Trainable params: 583 (2.28 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model RNN loss =  0.5719732046127319
Model RNN accuracy =  0.7707834839820862


# LSTM Model, Accuracy = 0.75595

In [26]:
# LSTM model
embed_dim = 4
max_fatures = 100  # embedding vocabulary size; same value as the tokenizer's num_words
model1 = Sequential()
model1.add(Embedding(max_fatures, embed_dim, input_length=len(pad_train[0]), trainable=True))
model1.add(LSTM(10, trainable=True))
model1.add(Dense(3, activation='softmax'))  # one output per shifted rating class (0, 1, 2)

# Compile the model
opt = keras.optimizers.Adam(learning_rate=0.01)
model1.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model1.summary()

# Train, then evaluate on the held-out split
model1.fit(X_train, y_train, epochs=10, batch_size=32)
loss, accuracy = model1.evaluate(X_test, y_test)
print('model1 loss = ', loss)
print('model1 accurcy = ', accuracy)

predicted_ratings_model1 = model1.predict(pad_test)
# Convert predictions to class labels (-1, 0, 1)
y_new_pred_original = np.argmax(predicted_ratings_model1, axis=1)
y_new_pred_original = y_new_pred_original - 1  # undo the +1 shift applied to the training labels

# Build the submission frame: 1-based IDs paired with the predicted ratings.
# The ID range follows the number of predictions instead of assuming 1000 rows,
# which would raise if the test file had a different length.
submtion_csv = pd.DataFrame({
    'ID': range(1, len(y_new_pred_original) + 1),
    'Predicted_Ratings': y_new_pred_original,
})

# Save the DataFrame to a new CSV file
submtion_csv.to_csv('/kaggle/working/sub_lstm.csv', index=False)  # Update with the desired filename and path

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, 92, 4)             400       
                                                                 
 lstm (LSTM)                 (None, 10)                600       
                                                                 
 dense_31 (Dense)            (None, 3)                 33        
                                                                 
Total params: 1033 (4.04 KB)
Trainable params: 1033 (4.04 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
model1 loss =  0.5447487235069275
model1 accurcy =  0.784517765045166


# GRU Model

In [8]:
from tensorflow.keras.layers import GRU

# GRU model
embed_dim = 4
max_features = 100  # embedding vocabulary size; same value as the tokenizer's num_words
model_gru = Sequential()
model_gru.add(Embedding(max_features, embed_dim, input_length=len(pad_train[0]), trainable=True))
model_gru.add(GRU(128, activation='relu'))
model_gru.add(Dense(3, activation='softmax'))  # one output per shifted rating class (0, 1, 2)

# Compile the model
opt = keras.optimizers.Adam(learning_rate=0.01)
model_gru.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

# Train the model
model_gru.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate the model on the held-out split
loss_gru, accuracy_gru = model_gru.evaluate(X_test, y_test)
print('Model GRU loss = ', loss_gru)
print('Model GRU accuracy = ', accuracy_gru)

# Predict ratings using GRU
predicted_ratings_gru = model_gru.predict(pad_test)

y_new_pred_gru = np.argmax(predicted_ratings_gru, axis=1)
y_new_pred_gru = y_new_pred_gru - 1  # undo the +1 shift: back to -1, 0, 1

# Build the submission frame: 1-based IDs paired with the predicted ratings.
# The ID range follows the number of predictions instead of assuming 1000 rows,
# which would raise if the test file had a different length.
# NOTE(review): unlike the CNN/RNN/LSTM cells, this frame is not written to a
# CSV file in this cell -- confirm whether that is intentional.
submission_csv_gru = pd.DataFrame({
    'ID': range(1, len(y_new_pred_gru) + 1),
    'Predicted_Ratings': y_new_pred_gru,
})


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model GRU loss =  0.5518734455108643
Model GRU accuracy =  0.7779627442359924
