In [1]:
!pip install requests
!pip install beautifulsoup4
!pip install selenium
!pip install nltk
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install wordcloud




In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import random
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from wordcloud import WordCloud

# Initialize Chrome WebDriver with headless option
# NOTE(review): this module-level driver is the one scrape_amazon_reviews uses,
# and nothing in the visible code ever quits it; call driver.quit() once
# scraping is finished to release the browser process.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options)

# Initialize VADER Sentiment Analyzer
# SentimentIntensityAnalyzer() raises LookupError when the 'vader_lexicon'
# resource is not installed, so fetch it first (skipped if already up to date).
nltk.download('vader_lexicon', quiet=True)
vader = SentimentIntensityAnalyzer()

# Function to scrape Amazon reviews
def scrape_amazon_reviews(product_urls):
    """Collect review texts from each page in `product_urls`.

    Every URL is loaded in the module-level `driver`, the page is scrolled to
    the bottom five times with a two-second pause after each scroll, and the
    resulting HTML is parsed for `<span data-hook="review-body">` elements.

    Args:
        product_urls: iterable of page URLs to visit.

    Returns:
        A flat list with the stripped text of every review element found,
        in page order across all URLs.
    """
    scroll_to_bottom = "window.scrollTo(0, document.documentElement.scrollHeight);"
    collected = []
    for product_url in product_urls:
        driver.get(product_url)

        # Scroll repeatedly, pausing after each scroll before reading the page.
        scrolls_left = 5
        while scrolls_left > 0:
            driver.execute_script(scroll_to_bottom)
            time.sleep(2)
            scrolls_left -= 1

        page = BeautifulSoup(driver.page_source, 'html.parser')
        collected.extend(
            body.get_text(strip=True)
            for body in page.find_all('span', {'data-hook': 'review-body'})
        )
    return collected

# Function to classify sentiment using VADER
def classify_sentiment(comment):
    """Label `comment` as 'Positive', 'Negative' or 'Neutral'.

    The label is derived from the 'compound' value returned by the
    module-level `vader` analyzer: >= 0.05 is 'Positive', <= -0.05 is
    'Negative', anything in between is 'Neutral'.
    """
    polarity = vader.polarity_scores(comment)
    compound = polarity['compound']

    # Guard clauses instead of an if/elif/else ladder.
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'

# Function to save comments with sentiments to TSV file
def save_to_tsv_file(data, filename, classifier=None):
    """Write each comment and its sentiment label to a tab-separated file.

    The file starts with a "Sentiment<TAB>Review" header row followed by one
    record per comment. Fields are written through `csv.writer`, so a comment
    containing a tab, a newline or a double quote is quoted instead of
    silently breaking the row layout; `pd.read_csv(..., sep='\t')` reads such
    quoted fields back intact.

    Args:
        data: iterable of comment strings.
        filename: path of the file to create or overwrite (UTF-8).
        classifier: optional callable mapping a comment to its label.
            Defaults to `classify_sentiment`.
    """
    import csv  # local import: csv is not among the notebook's top-level imports

    if classifier is None:
        classifier = classify_sentiment

    # newline='' lets the csv module control line endings itself, which keeps
    # newlines embedded in quoted fields untouched.
    with open(filename, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, delimiter='\t', lineterminator='\n')
        writer.writerow(["Sentiment", "Review"])
        for comment in data:
            writer.writerow([classifier(comment), comment])

# Function to get input of Amazon product URLs
def get_amazon_product_urls(max_urls=10):
    """Interactively read product URLs from the user.

    Prompts until `max_urls` URLs have been collected or the user types
    'done' (case-insensitive, surrounding whitespace ignored). Blank entries
    are skipped and do not count toward the limit.

    Args:
        max_urls: maximum number of URLs to accept (default 10).

    Returns:
        List of the entered URLs with surrounding whitespace stripped.
    """
    urls = []
    print(f"Enter up to {max_urls} Amazon product URLs (press Enter after each URL, type 'done' when finished):")
    while len(urls) < max_urls:
        url = input(f"Enter URL {len(urls) + 1}: ").strip()
        if url.lower() == 'done':
            break
        if not url:
            # An accidental bare Enter would otherwise be stored as '' and
            # later passed on to the scraper as a URL.
            continue
        urls.append(url)
    return urls

# Example usage: prompt for URLs, scrape their reviews and persist them
amazon_product_urls = get_amazon_product_urls()
amazon_reviews = scrape_amazon_reviews(amazon_product_urls)
save_to_tsv_file(amazon_reviews, 'amazon_reviews.tsv')

# Load reviews from TSV file into DataFrame
# dtype=str keeps every review as text even when it looks numeric; empty
# fields are still read as NaN and removed by dropna() below.
df = pd.read_csv('amazon_reviews.tsv', sep='\t', dtype=str)

# Drop NaN values and remove empty reviews
df.dropna(inplace=True)
# Index labels of whitespace-only reviews. (The previous loop appended an
# undefined name `i` instead of the row index, raising NameError on the first
# whitespace-only review.)
empty_objects = [
    index
    for index, review in df['Review'].items()
    if isinstance(review, str) and review.isspace()
]
df.drop(empty_objects, inplace=True)

# Calculate sentiment scores for each review
df['scores'] = df['Review'].apply(lambda review: vader.polarity_scores(review))
df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])

# Calculate average compound score
average_compound = df['compound'].mean()

# Count positive, negative, and neutral reviews
# NOTE(review): these counts split on the sign of the compound score, whereas
# the 'Sentiment' column comes from classify_sentiment, which uses +/-0.05
# thresholds; the two can disagree for weakly polar reviews.
compound_greater_than_zero = df[df['compound'] > 0]['compound']
compound_less_than_zero = df[df['compound'] < 0]['compound']
compound_equal_to_zero = df[df['compound'] == 0]['compound']

# Print statistics
print("Average sentiment score:", average_compound)
print("Total number of positive reviews:", compound_greater_than_zero.count())
print("Total number of negative reviews:", compound_less_than_zero.count())
print("Total number of neutral reviews:", compound_equal_to_zero.count())

# Generate WordCloud from all reviews
# The stopword list, word_tokenize and pos_tag all load NLTK data packages and
# raise LookupError when those are missing. Older NLTK releases look for
# 'punkt' / 'averaged_perceptron_tagger', newer ones for the '_tab' / '_eng'
# variants, so request both; a failed or redundant download does not raise.
for _resource in ('stopwords', 'punkt', 'punkt_tab',
                  'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(_resource, quiet=True)

# NOTE(review): stop_words is not referenced by any code visible in this cell.
stop_words = set(stopwords.words('english'))
def filter_words(words):
    """Keep only the tokens whose part-of-speech tag starts with N, J or V.

    Args:
        words: list of tokens, as produced by `word_tokenize`.

    Returns:
        The tokens, in their original order, whose `pos_tag` tag begins with
        'N', 'J' or 'V'.
    """
    return [word for word, tag in pos_tag(words) if tag.startswith(('N', 'J', 'V'))]

def _show_wordcloud(text, title):
    """Draw a word cloud of `text` under `title`.

    Returns the generated WordCloud, or None when `text` contains no words
    (WordCloud.generate raises ValueError if it has nothing to draw).
    """
    if not text.strip():
        print(f"No words available for '{title}'; skipping.")
        return None
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()
    return cloud


# Word cloud built from the filtered tokens of every review
df['filtered_comment'] = df['Review'].apply(lambda review: filter_words(word_tokenize(review)))
all_words = [word for words in df['filtered_comment'] for word in words]
all_text = ' '.join(all_words)

wordcloud = _show_wordcloud(all_text, 'Word Cloud of All Reviews')

# Generate WordCloud from negative reviews
# `negative_comments` was never defined before being used here (NameError).
# Select the reviews with a negative compound score, the same criterion as the
# "negative reviews" count printed above.
negative_comments = df.loc[df['compound'] < 0, 'Review']
filtered_negative_comments = negative_comments.apply(lambda review: filter_words(word_tokenize(review)))
all_negative_words = [word for words in filtered_negative_comments for word in words]
all_negative_text = ' '.join(all_negative_words)

wordcloud_negative = _show_wordcloud(all_negative_text, 'Word Cloud of Negative Reviews')





Enter up to 10 Amazon product URLs (press Enter after each URL, type 'done' when finished):
