# Arabic reviews analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [2]:
# Load data
data = pd.read_csv('CompanyReviews.csv',)

data

Unnamed: 0.1,Unnamed: 0,review_description,rating,company
0,0,رائع,1,talbat
1,1,برنامج رائع جدا يساعد على تلبيه الاحتياجات بشك...,1,talbat
2,2,التطبيق لا يغتح دائما بيعطيني لا يوجد اتصال با...,-1,talbat
3,3,لماذا لا يمكننا طلب من ماكدونالدز؟,-1,talbat
4,4,البرنامج بيظهر كل المطاعم و مغلقه مع انها بتكو...,-1,talbat
...,...,...,...,...
40041,128,تجربه جيده بس ينقصها عدم اهتمام خدمة العملاء ب...,0,swvl
40042,129,انا ساكنة بمنطقة الكينج ولا توجد عربيات قبل ال...,-1,swvl
40043,130,جيد ولكن لماذا لا توجد خطوط كثيره من المريوطيه...,0,swvl
40044,131,جيدا جدا ...ولكن الاسعار عاليه جدا\n,0,swvl


## Data cleaning & preprocessing

In [3]:
# Drop columns not needed
data.drop(['Unnamed: 0', 'company'], axis=1, inplace=True)

# Check for null values
print(data.isnull().sum())

# Drop null values
data.dropna(inplace=True)

review_description    1
rating                0
dtype: int64


In [4]:
# Convert text data to numerical features
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(data['review_description'])

# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, data['rating'], test_size=0.2, random_state=42)

## Build Naive Bayes classifier to predict the rating of the review

In [5]:
nb = MultinomialNB()
nb.fit(x_train, y_train)

## Evaluate the model

In [6]:
predictions = nb.predict(x_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 82.69%


In [7]:
# Test custom review
custom_review = 'المنتج كان جيد'
custom_review_vector = vectorizer.transform([custom_review])
print(nb.predict(custom_review_vector))

[1]


## Build SVM classifier to predict the rating of the review.

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, data['rating'], test_size=0.2, random_state=42)

svm = SVC()
svm.fit(x_train, y_train)

In [9]:
# Predictions
predictions = svm.predict(x_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 81.51%
