# **Riyadh Guide Project - Sentiment Analysis**


## **Objective**
The primary objective of this notebook is to perform sentiment analysis on the collected reviews. By leveraging natural language processing (NLP) techniques and machine learning algorithms.

# Read Files

In [None]:
#import
import pandas as pd
import os
import re
from google.colab import drive
import re
import nltk
nltk.download('punkt')
from nltk.stem.isri import ISRIStemmer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
pip install pyarrow pandas



In [None]:
# Mount Google Drive
drive.mount('/content/drive')
#Read the combined file
df = pd.read_excel('/content/drive/My Drive/All Reviews/AllData.xlsx')
# Select only the second and third columns
new_df = df.iloc[:, 1:3]
# Rename the columns
new_df.columns = ['Review', 'Class']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Check for missing values in the 'Review' column
print(new_df['Review'].isnull().sum())
# Check for missing values in the 'Review' column
missing_review_rows = new_df[new_df['Review'].isnull()]

# Print the rows with missing values in the 'Review' column
print(missing_review_rows)
# Drop rows with missing values in the 'Review' column
new_df = new_df.dropna(subset=['Review'])

# Reset the index after dropping rows
new_df = new_df.reset_index(drop=True)



1
     Review  Class
1062    NaN      1


In [None]:
# Assuming 'new_df' is your DataFrame with 'Class' column
class_counts = new_df['Class'].value_counts()

# Print the count of each class
print("Class Counts:")
print(class_counts)

Class Counts:
 1    10061
-1     1999
 0     1891
Name: Class, dtype: int64


# Text Cleaning including :

*   Removing duplicates
*   Deleting English words or letters
*   Removing tashkeel and harakat
*   Removing non text elemnnts.





In [None]:
# Remove duplicates
new_df.drop_duplicates(inplace=True)

# Function to delete English letters
def delete_english_letters(text):
    return re.sub(r'[a-zA-Z]', '', text)

# Delete English letters from the 'Review' column
new_df['Review'] = new_df['Review'].apply(delete_english_letters)

# Function to remove tashkeel and harakat
def remove_tashkeel_harakat(text):
    tashkeel_harakat_pattern = re.compile(r'[\u064B-\u0652]')
    return tashkeel_harakat_pattern.sub('', text)


#calling methods
# Remove tashkeel and harakat from the 'Review' column
new_df['Review'] = new_df['Review'].apply(remove_tashkeel_harakat)


# Define the emoji removal pattern
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

# Set display options to show the full text in the 'Review' column
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# Remove numbers from the 'Review' column
new_df['Review'] = new_df['Review'].apply(lambda x: re.sub(r'\d+', '', str(x)))

# Apply emoji removal to the 'Review' column
new_df['Review'] = new_df['Review'].apply(lambda x: emoji_pattern.sub(r'', str(x)))

# removing prefix - suffix and repeted characters

In [None]:

## removing prefix - suffix

def light_stem(text):
    words = text.split()
    result = list()
    stemmer = ISRIStemmer()
    for word in words:
        word = stemmer.norm(word, num=1)      # remove diacritics which representing Arabic short vowels
        word = stemmer.pre32(word)        # remove length three and length two prefixes in this order
        word = stemmer.suf32(word)        # remove length three and length two suffixes in this order
        word = stemmer.waw(word)          # remove connective ‘و’ if it precedes a word beginning with ‘و’
        word = stemmer.norm(word, num=2)  # normalize initial hamza to bare alif
        #word = stemmer.stem(word)

        result.append(word)
    return ' '.join(result)


new_df['Review'] = new_df['Review'].apply(light_stem)

## remove repeted characters
def remove_repeated_char(text):
    return re.sub(r'(.)\1+', r'\1\1', text)     # keep 2 repeat

new_df['Review'] = new_df['Review'].apply(remove_repeated_char)

print(new_df.head(30))


                                                                                                                                                                                                                                                                                                     Review  \
0                                                                                                                                                                              مول مرتب وجميل يوجد فيه غالب بضائع مارك شهيره عالميه .نظافه مول في جميع نواحي توزيع محل خدم يجعله من اجمل مول في هذه منطقه …   
1                                                                                                                                                                                                            من افخم مراكز تجارية في رياض وتحديدا في شارع تحلية ، يتوفر به ارقى دور ازياء مارك عالمية مجوهر   
2                                                                                          

# Tokenize

In [None]:
#Tokenize the Arabic reviews
new_df['Review'] = new_df['Review'].apply(nltk.word_tokenize)

# Remove Stop words and punctuations

In [None]:
import nltk
from nltk.corpus import stopwords
import re
import string
from nltk.tokenize import word_tokenize


nltk.download('stopwords')
stop_words = list(set(stopwords.words('arabic')))

arabic_punctuations = '''`÷×؛،<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

def remove_stop_words_and_punctuations(tokens):
    filtered_tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    filtered_tokens = [re.sub(r'[^a-zA-Z0-9\u0600-\u06FF\s]', '', token) for token in tokens if token.strip()]
    return filtered_tokens

new_df['Review'] = new_df['Review'].apply(remove_stop_words_and_punctuations)

print(new_df['Review'].head(30))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0                                                                                                                                                                                                                      [مول, مرتب, وجميل, يوجد, فيه, غالب, بضائع, مارك, شهيره, عالميه, نظافه, مول, في, جميع, نواحي, توزيع, محل, خدم, يجعله, من, اجمل, مول, في, هذه, منطقه, ]
1                                                                                                                                                                                                                                                         [من, افخم, مراكز, تجارية, في, رياض, وتحديدا, في, شارع, تحلية, ،, يتوفر, به, ارقى, دور, ازياء, مارك, عالمية, مجوهر]
2                                                                                                                                                                                                                     [مول, يجمع, اغلب, مارك, عالمية, اسعار, غالية, بسبب, وجود

# Models

In [None]:
# Logistic regression model
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

# Assuming 'Review' and 'Class' are columns in your DataFrame
texts_train = new_df['Review']
labels_train = new_df['Class']

# Convert labels to numerical representation
label_encoder = LabelEncoder()
encoded_labels_train = label_encoder.fit_transform(labels_train)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts_train, encoded_labels_train, test_size=0.2, random_state=42)

# Convert the tokenized words back to strings
X_train_str = [' '.join(tokens) for tokens in X_train]
X_test_str = [' '.join(tokens) for tokens in X_test]

# Convert the text data into Bag-of-Words representation
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train_str)
X_test_bow = vectorizer.transform(X_test_str)



# Create logistic regression model
logistic_model = LogisticRegression()

# Define hyperparameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'max_iter': [100, 200, 300]}

# Create GridSearchCV
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Convert the text data into Bag-of-Words representation
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train_str)
X_test_bow = vectorizer.transform(X_test_str)

# Fit the grid search to the data
grid_search.fit(X_train_bow, y_train)

# Print the best hyperparameters found by grid search
print("Best Hyperparameters:", grid_search.best_params_)

# Use the best model found by grid search for predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_bow)

# Calculate and print the test accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

# Evaluate the best model on the test set
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)
#Logistic Regression model
# Print the evaluation metrics for the test set
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print('Confusion Matrix:')
print(conf_matrix)




Best Hyperparameters: {'C': 0.1, 'max_iter': 100}
Test Accuracy: 76.35%
Precision: 0.7225873252356477
Recall: 0.7634719710669078
F1-Score: 0.7280533970065804
Confusion Matrix:
[[ 144   26  204]
 [  35   58  271]
 [  65   53 1909]]


In [None]:
# Gradient Boosting model
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier

# Assuming 'Review' and 'Class' are columns in your DataFrame
texts_train = new_df['Review']
labels_train = new_df['Class']

# Convert labels to numerical representation
label_encoder = LabelEncoder()
encoded_labels_train = label_encoder.fit_transform(labels_train)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts_train, encoded_labels_train, test_size=0.2, random_state=42)

# Convert the tokenized words back to strings
X_train_str = [' '.join(tokens) for tokens in X_train]
X_test_str = [' '.join(tokens) for tokens in X_test]

# Convert the text data into Bag-of-Words representation
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train_str)
X_test_bow = vectorizer.transform(X_test_str)

# Train a Gradient Boosting model
gbm_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbm_model.fit(X_train_bow, y_train)

# Make predictions on the training set
y_train_pred = gbm_model.predict(X_train_bow)

# Calculate the training accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)

# Print the training accuracy
print(f'Training Accuracy: {train_accuracy * 100:.2f}%')

# Make predictions on the test set
y_pred = gbm_model.predict(X_test_bow)

# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_pred)

# Print the test accuracy
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

# Evaluate the model on the test set
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)
# Gradient Boosting Model
# Print the evaluation metrics for the test set
print(f'Precision: {precision}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print('Confusion Matrix:')
print(conf_matrix)


Training Accuracy: 76.71%
Test Accuracy: 74.65%
Precision: 0.6830810492309568
Recall: 0.7464737793851718
F1-Score: 0.6782112438088996
Confusion Matrix:
[[  74   12  288]
 [  25   18  321]
 [  37   18 1972]]


In [None]:
# SVM
# Import necessary libraries
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# from sklearn.preprocessing import LabelEncoder
# from sklearn.svm import SVC

# # Assuming 'Review' and 'Class' are columns in your DataFrame
# texts_train = new_df['Review']
# labels_train = new_df['Class']

# # Convert labels to numerical representation
# label_encoder = LabelEncoder()
# encoded_labels_train = label_encoder.fit_transform(labels_train)

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(texts_train, encoded_labels_train, test_size=0.2, random_state=42)

# # Convert the tokenized words back to strings
# X_train_str = [' '.join(tokens) for tokens in X_train]
# X_test_str = [' '.join(tokens) for tokens in X_test]

# # Convert the text data into Bag-of-Words representation
# vectorizer = CountVectorizer()
# X_train_bow = vectorizer.fit_transform(X_train_str)
# X_test_bow = vectorizer.transform(X_test_str)

# # Train an SVM model
# svm_model = SVC(kernel='linear')  # You can choose a different kernel if needed (e.g., 'rbf' for radial basis function)
# svm_model.fit(X_train_bow, y_train)

# # Make predictions on the training set
# y_train_pred = svm_model.predict(X_train_bow)

# # Calculate the training accuracy
# train_accuracy = accuracy_score(y_train, y_train_pred)

# # Print the training accuracy
# print(f'Training Accuracy: {train_accuracy * 100:.2f}%')

# # Make predictions on the test set
# y_pred = svm_model.predict(X_test_bow)

# # Calculate the test accuracy
# test_accuracy = accuracy_score(y_test, y_pred)

# # Print the test accuracy
# print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

# # Evaluate the model on the test set
# precision = precision_score(y_test, y_pred, average='weighted')
# recall = recall_score(y_test, y_pred, average='weighted')
# f1 = f1_score(y_test, y_pred, average='weighted')
# conf_matrix = confusion_matrix(y_test, y_pred)

# # Print the evaluation metrics for the test set
# print(f'Precision: {precision}')
# print(f'Recall: {recall}')
# print(f'F1-Score: {f1}')
# print('Confusion Matrix:')
# print(conf_matrix)


In [None]:
# Load new data from a file into a DataFrame
new_data_path = '/content/drive/My Drive/All Reviews/google احاجي.xlsx'  # Change the path accordingly
new_data = pd.read_excel(new_data_path)

# Assume 'new_data' is a DataFrame containing the new reviews in the 'Review' column
# Drop rows with null or missing values in the 'Review' column
new_data.dropna(subset=['Review'], inplace=True)

# Preprocess the new data
new_data['Review'] = new_data['Review'].apply(delete_english_letters)
new_data['Review'] = new_data['Review'].apply(remove_tashkeel_harakat)
new_data['Review'] = new_data['Review'].apply(lambda x: re.sub(r'\d+', '', str(x)))
new_data['Review'] = new_data['Review'].apply(lambda x: emoji_pattern.sub(r'', str(x)))
new_data['Review'] = new_data['Review'].apply(light_stem)
new_data['Review'] = new_data['Review'].apply(remove_repeated_char)
new_data['Review'] = new_data['Review'].apply(nltk.word_tokenize)
new_data['Review'] = new_data['Review'].apply(remove_stop_words_and_punctuations)

# Convert the tokenized words back to strings
new_data_str = [' '.join(tokens) for tokens in new_data['Review']]

# Convert the text data into Bag-of-Words representation using the same vectorizer
new_data_bow = vectorizer.transform(new_data_str)

# Make predictions on the new data
new_predictions = best_model.predict(new_data_bow)

# Convert numerical labels back to original class names
new_predictions_classes = label_encoder.inverse_transform(new_predictions)
# Print each review with its corresponding predicted label
for i, (review, prediction) in enumerate(zip(new_data_str, new_predictions_classes)):
    print(f"Review {i+1}: {review} - Predicted Label: {prediction}")


Review 1: تجربة رائعة جدا  استمتع فيها جدا جرب غرفة العم صالح وكانت تجربة مليئة تحدي اثارة حماس، فيها الغاز متنوعة تحتاج مهار عديدة مثل تفكير سرعة وحل مشاكل حساب معرفة  - Predicted Label: 1
Review 2: تجربة ممتعة مرره تنظيم خدمة رائعات، وعطو بآخر شي قطعة من كنز ذكرى   عتبي على انهم سريع مره باعطائك تلميحات، واعتقد يرجع هذا لقصر وقت محدد،  - Predicted Label: 1
Review 3: تجربة العم صالح كانت جميلة جدا، فيه ترتيب في الغاز افكار وقت مناسب، تعامل موظف راقي وجدا متعاون، حصل كنز واخذ ذكرى منه، اسعار مرتفعة، عدا ذلك كل شي ممتاز، انصح مكان ولي زيارة اخرى ان شاء الله - Predicted Label: 1
Review 4: مكان رائع وتجربة ممتاز غرفة العم صالح جميلة جدا فزنا كنز - Predicted Label: 1
Review 5: تنظيم ممتاز تجربة كانت جدا ممتعة ما تحس وقت يمر ابدا   - Predicted Label: 1
Review 6: حمد لله زيارة كانت جممييللة جدا كان عدد وجرب غرفة العم صالح كانت ممتعه وحلي لغز على اخر ثانيه ، وكل ما صعب علي حل جزء وكان متواجد يساعدو اول باول ومركز معنا ما قصر وروح حلوة الله يسعد بإذن الله لنا تجربة اخررى مكان مرتب مره ومنظظم -

classification

In [None]:
import numpy as np
#new Code
# Load new data from a file into a DataFrame
new_data_path = '/content/drive/My Drive/All Reviews/check_now4.xlsx'  # Change the path accordingly
new_data = pd.read_excel(new_data_path)

# Assume 'new_data' is a DataFrame containing the new reviews in the 'Review' column
# Drop rows with null or missing values in the 'Review' column
new_data.dropna(subset=['Review'], inplace=True)

# Preprocess the new data
new_data['Review'] = new_data['Review'].apply(delete_english_letters)
new_data['Review'] = new_data['Review'].apply(remove_tashkeel_harakat)
new_data['Review'] = new_data['Review'].apply(lambda x: re.sub(r'\d+', '', str(x)))
new_data['Review'] = new_data['Review'].apply(lambda x: emoji_pattern.sub(r'', str(x)))
new_data['Review'] = new_data['Review'].apply(light_stem)
new_data['Review'] = new_data['Review'].apply(remove_repeated_char)
new_data['Review'] = new_data['Review'].apply(nltk.word_tokenize)
new_data['Review'] = new_data['Review'].apply(remove_stop_words_and_punctuations)
# Remove empty lists from the 'Review' column
new_data['Review'] = new_data['Review'].apply(lambda x: [token for token in x if token.strip()])

# Filter out rows where 'Review' is an empty list
new_data = new_data[new_data['Review'].apply(len) > 0]

# Convert the tokenized words back to strings
new_data_str = [' '.join(tokens) for tokens in new_data['Review']]

# Convert the text data into Bag-of-Words representation using the same vectorizer
new_data_bow = vectorizer.transform(new_data_str)

# Make predictions on the new data
new_predictions = best_model.predict(new_data_bow)

# Convert numerical labels back to original class names
new_predictions_classes = label_encoder.inverse_transform(new_predictions)
# Print each review with its corresponding predicted label
for i, (review, prediction) in enumerate(zip(new_data_str, new_predictions_classes)):
    print(f"Review {i+1}: {review} - Predicted Label: {prediction}")
# Add predicted labels as a new column in the new_data DataFrame
new_data['Predicted Label'] = new_predictions_classes



# Count the occurrences of each label in the predictions
label_counts = np.bincount(new_predictions, minlength=3)

# Print the count of each label
for label, count in enumerate(label_counts):
    print(f"Label {label-1}: Count = {count}")

# Find the label with the highest count
final_classification = max(range(len(label_counts)), key=lambda x: label_counts[x]) - 1

# Print the final classification based on the label with the highest count
print("Final Classification:", final_classification)


# Specify the directory where you want to save the file
directory = '/content/drive/My Drive/All Reviews'

# Create the directory if it doesn't exist
os.makedirs(directory, exist_ok=True)

# Save the DataFrame as a CSV file in the specified directory
file_path = os.path.join(directory, 'new_labeled_data_namg.csv')
df.to_csv(file_path, index=False)
print(f"New labeled data saved to: {file_path}")

# Print each review with its corresponding predicted label
for i, (review, prediction) in enumerate(zip(new_data_str, new_predictions_classes)):
    print(f"Review {i+1}: {review} - Predicted Label: {prediction}")


# Count the occurrences of each label in the predictions
label_counts = np.bincount(new_predictions, minlength=3)

# Print the count of each label
for label, count in enumerate(label_counts):
    print(f"Label {label-1}: Count = {count}")

# Find the label with the highest count
final_classification = max(range(len(label_counts)), key=lambda x: label_counts[x]) - 1


# Calculate the percentage of label 1
label_1_count = label_counts[2]
total_count = np.sum(label_counts)
label_1_percentage = (label_1_count / total_count) * 100



# Print the percentage of label 1
print(f"Percentage of how much people like the place: {label_1_percentage}%")


# Print the final classification based on the label with the highest count
print("Final Classification:", final_classification)

#مب مهم
# Specify the directory where you want to save the file
directory = '/content/drive/My Drive/All Reviews'


# Save the updated DataFrame to the original Excel file with the predicted labels
file_path = os.path.join(directory, 'check_now4.xlsx')
new_data.to_excel(file_path, index=False)
print(f"Updated file saved to: {file_path}")


Review 1: قدييم وخدمات بدائيه و مستوى نظافه متد و لاغيرو فيه شي من سنين، مثل ماهو، لا ديكور ولا خدم ولا تعامل و صاحبه مشغل غريبه ما تعرف تبتسم نهائ و جافه تعامل كانك ضيف ثقيل داخل بيتها، عند عامله وحده اسم واوا شاطره بفتله وجه اجي احيا علشانها، غير كذا ماانصح فيهم نهائ - Predicted Label: -1
Review 2: سئ بكل مقايس صح موظف مره ذوق بس شغل اسف وهذي امانه وتجربتي - Predicted Label: 1
Review 3: مشغل وصخ مابه نظافه ابدا وريحة اندومي وشيبس مكان غير مرتب يعني سويت بدكير ومونكير ب بصراحه حقه - Predicted Label: -1
Review 4: مشغل سيى وحرام اسمه مشغل هذا مزبله لانظافه ولااسلوب وتلاعب اسعار ومديره تحتاج مديره لااسلوب ولاذوق تقول مو عاجبك اطلعي قاعده تاكل فصفص ع استقبال وتتكلم مع زبا فلبنيه تغسل حمام بعد تجي تسوي اظافيري مدري وين بلديه عنهم وبااء منتشر - Predicted Label: -1
Review 5: فجاءه قررت مالكه ترفع اسعار ونست تطور مشغل عامل - Predicted Label: 1
Review 6: مشغل يحتاج له شغل من ناحيه عامل اخلاق سيئه كأن هم الي يدفع عنك و مره من عامل معصبه و بس تتأفف وترت و تصقع اشياء و تشد شعر بقوه لما تمشطط اخلا