# Sentiment Analysis on Amazon Reviews

## Objective

## Data Description

## Methodology

### Imports

Run the following command in your terminal or command prompt to install all necessary libraries:

```bash
pip install pandas seaborn matplotlib numpy scikit-learn nltk textblob wordcloud
```

In [19]:
#All the imports are mentioned here:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk

# Data cleaning tools
import re
import string

# Removing special characters
import unicodedata

# Removing stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Calculating Polarity and Subjectivity
from textblob import TextBlob

# N-grams
from nltk.util import ngrams

# Word cloud generation
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer

# Load modules
import nltk
import collections

### Step 1: Load & Inspect Data

In [20]:
# Load the review dataset; the path is relative to the current working directory.
balanced_data = pd.read_csv('Datasets/balanced_reviews.csv')

In [None]:
# Preview the first rows of the DataFrame.
balanced_data.head()

In [None]:
# Preview the last rows of the DataFrame.
balanced_data.tail()

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
balanced_data.info()

In [None]:
# Show descriptive statistics for the DataFrame.
balanced_data.describe()

In [None]:
# List the column labels available in the dataset.
balanced_data.columns

In [None]:
# Display the 'Text' column, which holds the review text cleaned in the next steps.
balanced_data['Text']

### Step 2: Data Cleaning

#### Drop Unnecessary Columns

In [None]:
# Columns that are removed before the text analysis
columns_to_drop = [
    'ProductId',
    'UserId',
    'ProfileName',
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Time',
    'Summary',
    'length',
]

# drop() returns a new DataFrame, so rebind the name to keep the result
balanced_data = balanced_data.drop(labels=columns_to_drop, axis='columns')

# Show the first rows of the trimmed DataFrame
balanced_data.head()

In [28]:
# Strip punctuation characters from the review text
def punctuation_removal(messy_str):
    """Return ``messy_str`` with every ``string.punctuation`` character removed.

    The remaining characters are joined back together without a
    replacement, so words separated only by punctuation are merged.
    """
    return ''.join(
        filter(lambda char: char not in string.punctuation, messy_str)
    )

# Replace every review in 'Text' with its punctuation-free version.
balanced_data['Text'] = balanced_data['Text'].apply(punctuation_removal)

In [None]:
import re

# Remove digits from a review
def drop_numbers(list_text):
    """Join the items of ``list_text`` that contain no digit.

    Iterating a string yields single characters, so for a string argument
    this removes every digit character and keeps everything else.
    """
    return ''.join(
        piece for piece in list_text if re.search(r'\d', piece) is None
    )

# Apply drop_numbers to every review in the "Text" column, overwriting it in place
balanced_data['Text'] = balanced_data['Text'].apply(drop_numbers)


In [None]:
# Spot-check the first 10 reviews after punctuation and digit removal.
balanced_data['Text'].head(10)

In [None]:
# Bar chart of how many reviews carry each score
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Score', data=balanced_data, palette='viridis', ax=ax)
ax.set_title('Number of Entries per Score')
ax.set_xlabel('Score')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Map a numeric review score to a sentiment label
def classify_score(score):
    """Return the sentiment label for ``score``.

    Scores 4 and 5 are 'Positive', 3 is 'Neutral', and 1 and 2 are
    'Negative'. Any other value yields ``None``.
    """
    label_by_scores = (
        ('Positive', (4, 5)),
        ('Neutral', (3,)),
        ('Negative', (1, 2)),
    )
    for label, scores in label_by_scores:
        if score in scores:
            return label
    return None

# Derive a 'Sentiment' label column from the numeric 'Score' column
balanced_data['Sentiment'] = balanced_data['Score'].apply(classify_score)

# Print the first rows to confirm the new column is present
print(balanced_data.head())

In [None]:
# Show the (rows, columns) dimensions of the DataFrame.
balanced_data.shape

In [43]:
# Lowercase every review so later word matching is case-insensitive.
balanced_data['Text'] = balanced_data['Text'].str.lower()

In [None]:
from nltk.corpus import stopwords

# Fetch the stop-word corpus, then keep the English list as a set for fast lookups
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Split each review on whitespace, drop the stop words, and re-join with spaces
balanced_data['Text'] = balanced_data['Text'].apply(
    lambda review: ' '.join(
        filter(lambda token: token not in stop_words, review.split())
    )
)

In [None]:
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt sentence models. Recent NLTK releases load
# them from the 'punkt_tab' package while older releases use 'punkt';
# requesting both keeps this cell working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# NOTE: after this step each entry of 'Text' is a list of tokens, not a string.
balanced_data['Text'] = balanced_data['Text'].apply(word_tokenize)

In [None]:
from nltk.stem import WordNetLemmatizer

# The lemmatizer looks words up in the WordNet corpus
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Lemmatize every word and store each review as one space-joined string.
# The word_tokenize step leaves 'Text' as lists of tokens, and a list has no
# .split(); only split when the entry is still a plain string.
balanced_data['Text'] = balanced_data['Text'].apply(
    lambda x: ' '.join(
        lemmatizer.lemmatize(word)
        for word in (x.split() if isinstance(x, str) else x)
    )
)