## **Library Preparation**

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

## **Load Data**

In [2]:
df = pd.read_csv('amazon.csv')

## **Exploratory Data Analysis (EDA)**

In [3]:
print(df.head())

   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ₹399   
1  Computers&Accessories|Accessories&Peripherals|...             ₹199   
2  Computers&Accessories|Accessories&Peripherals|...             ₹199   
3  Computers&Accessories|Accessories&Peripherals|...             ₹329   
4  Computers&Accessories|Accessories&Peripherals|...             ₹154   

  actual_price discount_percentage rating rating_count  \
0       ₹1,099                 64%    4.2       24,269   
1         ₹349                 43%  

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
dtypes: obj

In [5]:
print(df.describe())

        product_id                                       product_name  \
count         1465                                               1465   
unique        1351                                               1337   
top     B07JW9H4J1  Fire-Boltt Ninja Call Pro Plus 1.83" Smart Wat...   
freq             3                                                  5   

                                                 category discounted_price  \
count                                                1465             1465   
unique                                                211              550   
top     Computers&Accessories|Accessories&Peripherals|...             ₹199   
freq                                                  233               53   

       actual_price discount_percentage rating rating_count  \
count          1465                1465   1465         1463   
unique          449                  92     28         1143   
top            ₹999                 50%    4.1        

## **Pre-Processing Data**

In [6]:
# Eliminate unnecessary columns
df = df.drop(['product_id', 'user_id', 'product_name'], axis=1)

In [7]:
# Convert rating to sentiment (positive, negative, neutral)

# Convert 'rating' column to numeric
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

df['sentiment'] = df['rating'].apply(lambda rating: 'positif'
if rating > 3 else ('negatif' if rating < 3 else 'netral'))

In [8]:
# Delete duplicate data
df.drop_duplicates(inplace=True)

In [9]:
# Delete missing values (if any)
df.dropna(inplace=True)

In [10]:
# Text Preprocessing
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

In [11]:
import re # Import the 're' module for regular expressions

def clean_text(text):
  text = text.lower() # Change to lowercase
  # Remove punctuation and special characters
  text = re.sub(r'[^\w\s]', '', text, re.UNICODE)
  # Remove common words
  text = [word for word in text.split() if word not in stop_words]
  text = ' '.join(text) # Merge back into a string
  return text

df['review_title'] = df['review_title'].apply(clean_text)

## **Feature Extraction**

In [12]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['review_title'])
y = df['sentiment']

## **Data Splitting**

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

## **Modelling & Evaluating**

In [14]:
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Akurasi: {accuracy:.2f}%')
print(classification_report(y_test, y_pred))

Akurasi: 1.00%
              precision    recall  f1-score   support

     negatif       0.00      0.00      0.00         1
     positif       1.00      1.00      1.00       292

    accuracy                           1.00       293
   macro avg       0.50      0.50      0.50       293
weighted avg       0.99      1.00      0.99       293



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## **Model Usage Example**

In [15]:
new_review = "This book is amazing! I loved every page."
cleaned_review = clean_text(new_review)
vectorized_review = vectorizer.transform([cleaned_review])
prediction = model.predict(vectorized_review)
print(f'Prediksi sentimen: {prediction[0]}')

Prediksi sentimen: positif
