# Project #2 Part 3: Web Scraping for Yelp 

**Name:** Gabriel George

**Date:** 12/2/24

**Exercise:** Finding a website, scraping each page, and plotting the data

**Purpose:** The objective of this assignment is to analyze the sentiment of customer reviews of coffee shops using the Yelp Fusion API. The workflow is: fetch the reviews, clean and process the text, perform sentiment analysis, visualize the results with a donut chart and a word cloud, and derive actionable insights from the findings.


In [None]:
# Install the important libraries
!pip install yelpapi textblob wordcloud nltk matplotlib pandas python-dotenv

# Importing necessary libraries
import requests
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import nltk
from dotenv import load_dotenv
import os

# Read settings from a local .env file into the process environment, then
# look up the Yelp API key that every request below authenticates with.
load_dotenv()
api_key = os.environ.get("YELP_API_KEY")
if not api_key:
    # Stop right away when the key is missing or empty.
    raise ValueError("API key not found. Please set it in the .env file.")

# NLTK setup
# Resources used further down in this notebook:
#   - stopwords     : the English stopword list filtered out in clean_text.
#   - punkt         : tokenizer models looked up by older NLTK releases.
#   - punkt_tab     : tokenizer tables that recent NLTK releases (3.9 and
#                     later) load for nltk.word_tokenize; without it
#                     word_tokenize raises LookupError.
#   - movie_reviews : training corpus for TextBlob's NaiveBayesAnalyzer.
#
# nltk.download() reports a failure by printing a message rather than
# raising. If the messages show CERTIFICATE_VERIFY_FAILED, the local Python
# installation has no usable CA certificates. Fix the environment (for
# python.org macOS builds, run the bundled "Install Certificates.command")
# instead of disabling certificate verification.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('movie_reviews')

# API Setup
# Every Yelp Fusion request authenticates with the key as a bearer token.
headers = {"Authorization": f"Bearer {api_key}"}
url = "https://api.yelp.com/v3/businesses/search"
params = {
    "term": "coffee shop",
    "location": "San Francisco",
    "limit": 20,
}

# Fetch the business data
# requests never times out on its own, so without an explicit timeout a
# stalled connection would hang the notebook indefinitely.
response = requests.get(url, headers=headers, params=params, timeout=10)
# Turn 4xx/5xx answers (bad key, rate limit, ...) into an exception here
# instead of failing later on a missing "businesses" key.
response.raise_for_status()
businesses = response.json()["businesses"]

# Fetching the reviews for businesses
reviews = []

# One request per business: collect the "text" of every review returned.
# The slice caps the number of review requests at 20 even if the search
# limit is raised later.
for business in businesses[:20]:
    business_id = business["id"]
    review_url = f"https://api.yelp.com/v3/businesses/{business_id}/reviews"
    # Explicit timeout: requests waits forever by default, and this loop
    # issues up to 20 sequential calls.
    review_response = requests.get(review_url, headers=headers, timeout=10)
    # Abort on the first HTTP error so problems are not silently skipped.
    review_response.raise_for_status()
    reviews.extend(review["text"] for review in review_response.json()["reviews"])

# Creating DataFrame
# One row per review, with the raw text in the "Review" column.
reviews_df = pd.DataFrame(reviews, columns=["Review"])

# Cleaning the Reviews
_english_stop_words = None  # filled in on first use by _get_english_stop_words()


def _get_english_stop_words():
    """Return NLTK's English stopword list as a set, loading it only once."""
    global _english_stop_words
    if _english_stop_words is None:
        # Loaded lazily so that defining these functions does not require
        # the stopwords corpus to be present yet.
        _english_stop_words = frozenset(stopwords.words("english"))
    return _english_stop_words


def clean_text(text):
    """Return *text* reduced to its alphanumeric, non-stopword tokens.

    The text is split with ``nltk.word_tokenize``. A token is kept only if
    it is purely alphanumeric (``str.isalnum``) and its lowercase form is
    not in NLTK's English stopword list. Kept tokens retain their original
    casing and are joined with single spaces.

    Args:
        text: The raw review text.

    Returns:
        The cleaned text as one space-separated string. This is empty when
        no token survives the filter.

    Raises:
        LookupError: Raised by NLTK when the tokenizer or stopword data has
            not been downloaded.
    """
    # The stopword set is shared across calls instead of being rebuilt from
    # the corpus for every review.
    stop_words = _get_english_stop_words()
    return " ".join(
        word
        for word in nltk.word_tokenize(text)
        if word.isalnum() and word.lower() not in stop_words
    )

# Add a "Cleaned_Review" column holding the cleaned version of each review.
reviews_df = reviews_df.assign(
    Cleaned_Review=reviews_df["Review"].apply(clean_text)
)

# Sentiment Analysis
_naive_bayes_analyzer = None  # shared instance, created on first use


def _get_naive_bayes_analyzer():
    """Return one NaiveBayesAnalyzer instance that all calls share."""
    global _naive_bayes_analyzer
    if _naive_bayes_analyzer is None:
        _naive_bayes_analyzer = NaiveBayesAnalyzer()
    return _naive_bayes_analyzer


def analyze_sentiment(review, analyzer=TextBlob):
    """Return a sentiment label for *review*.

    Args:
        review: The text to classify.
        analyzer: Selects the method. Passing the ``NaiveBayesAnalyzer``
            class uses TextBlob's Naive Bayes classifier. Any other value,
            including the default ``TextBlob``, uses TextBlob's default
            polarity score.

    Returns:
        With ``NaiveBayesAnalyzer``: the ``classification`` field of the
        blob's sentiment (per the TextBlob docs, ``"pos"`` or ``"neg"``).
        Otherwise ``"positive"``, ``"negative"`` or ``"neutral"``,
        depending on whether the polarity is above, below or equal to 0.
    """
    if analyzer is NaiveBayesAnalyzer:
        # A NaiveBayesAnalyzer trains its classifier the first time it is
        # used. Creating a fresh instance per review would repeat that
        # training for every row, so one instance is reused.
        blob = TextBlob(review, analyzer=_get_naive_bayes_analyzer())
        return blob.sentiment.classification

    polarity = TextBlob(review).sentiment.polarity
    if polarity > 0:
        return "positive"
    if polarity < 0:
        return "negative"
    return "neutral"

# Label every cleaned review twice: once with the default polarity-based
# method and once with the Naive Bayes classifier.
# NOTE(review): NLTK's English stopword list appears to include negations
# such as "not" and "no", so the cleaned text may lose them before scoring.
# Confirm whether the raw "Review" column should be scored instead.
reviews_df["TextBlob_Sentiment"] = reviews_df["Cleaned_Review"].apply(analyze_sentiment)
reviews_df["NaiveBayes_Sentiment"] = reviews_df["Cleaned_Review"].apply(
    analyze_sentiment, analyzer=NaiveBayesAnalyzer
)

# Sentiment Distribution
# Count how many reviews received each TextBlob label.
sentiment_counts = reviews_df["TextBlob_Sentiment"].value_counts()

# Donut Chart
# Draw an ordinary pie chart, then cover its centre with a white disc so
# that only the outer ring stays visible.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    sentiment_counts,
    labels=sentiment_counts.index,
    autopct="%1.1f%%",
    startangle=140,
    wedgeprops={"edgecolor": "black"},
)
centre_disc = plt.Circle((0, 0), 0.7, color="white")
ax.add_artist(centre_disc)
ax.set_title("Sentiment Distribution using TextBlob")
plt.show()

# WordCloud
# Merge all cleaned reviews into one string and let WordCloud size each
# word by how often it occurs.
all_cleaned_text = " ".join(reviews_df["Cleaned_Review"])
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(all_cleaned_text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")  # the image needs no ticks or frame
ax.set_title("Top Words in Reviews", fontsize=16)
plt.show()




[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


LookupError: 
**********************************************************************
  Resource punkt_tab not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt_tab')
  For more information see: https://www.nltk.org/data.html

  Attempted to load tokenizers/punkt_tab/english/

  Searched in:
    - '/Users/gabrielgeorge/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.11/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.11/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.11/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
