# Web Scraping

### Project 1

In [125]:
import requests
from bs4 import BeautifulSoup
from csv import writer
from time import sleep
from random import choice
import pandas as pd

all_quotes = []

base_url = "http://quotes.toscrape.com/"

url = "/page/1"

while url:

    res = requests.get(f"{base_url}{url}")
    print(f"Now Scraping{base_url}{url}")
    soup = BeautifulSoup(res.text, "html.parser")

    quotes = soup.find_all(class_="quote")

    for quote in quotes:
        all_quotes.append({
            "text": quote.find(class_="text").get_text(),
            "author": quote.find(class_="author").get_text(),
            "bio-link": quote.find("a")["href"]
        })
    next_btn = soup.find(_class="next")
    url = next_btn.find("a")["href"] if next_btn else None
    sleep(2)

df = pd.DataFrame(all_quotes)

df.head()

Now Scrapinghttp://quotes.toscrape.com//page/1


Unnamed: 0,text,author,bio-link
0,“The world as we have created it is a process ...,Albert Einstein,/author/Albert-Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,/author/J-K-Rowling
2,“There are only two ways to live your life. On...,Albert Einstein,/author/Albert-Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,/author/Jane-Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,/author/Marilyn-Monroe


### Project 2

In [139]:
import requests
from bs4 import BeautifulSoup

url_to_scrap = "https://en.wikipedia.org/wiki/Science"

res = requests.get(url_to_scrap)

soup = BeautifulSoup(res.content, 'html.parser')

table_of_content = soup.find(id="mw-panel-toc-list")
body = soup.find(id="bodyContent")

all_toc_href = []
for a in table_of_content.find_all("a", attrs={"href": True}):
  all_toc_href.append({"link":f'{url_to_scrap}{a.get("href")}', "text":a.get_text()})

all_content_href = []
for a in body.find_all("a", attrs={"href": True}):
  all_content_href.append({"link":f'{url_to_scrap}{a.get("href")}', "text":a.get_text()})


len(all_toc_href), len(all_content_href)

df_toc = pd.DataFrame(all_toc_href)
df_body = pd.DataFrame(all_content_href)

df_toc.head()

Unnamed: 0,link,text
0,https://en.wikipedia.org/wiki/Science#,\n(Top)\n
1,https://en.wikipedia.org/wiki/Science#Etymology,\n\n1\nEtymology\n\n
2,https://en.wikipedia.org/wiki/Science#History,\n\n2\nHistory\n\n
3,https://en.wikipedia.org/wiki/Science#Early_hi...,\n\n2.1\nEarly history\n\n
4,https://en.wikipedia.org/wiki/Science#Classica...,\n\n2.2\nClassical antiquity\n\n


In [140]:
df_body.head()

Unnamed: 0,link,text
0,https://en.wikipedia.org/wiki/Science/wiki/Wik...,
1,https://en.wikipedia.org/wiki/Science/wiki/Sci...,Science (journal)
2,https://en.wikipedia.org/wiki/Science/wiki/Out...,Outline of science
3,https://en.wikipedia.org/wiki/Science/wiki/Sci...,Science (disambiguation)
4,https://en.wikipedia.org/wiki/Science/wiki/Cat...,a series


### Project 3

In [175]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.imdb.com/chart/top/"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}

res = requests.get(url, headers=headers)

print(res.status_code)

soup = BeautifulSoup(res.content, 'html.parser')

list_content = soup.find_all("li", class_="ipc-metadata-list-summary-item")

list_movies = []

for block in list_content:
    media_block = block.find("img", attrs={"src": True})
    image = media_block.get("src") if media_block else None

    title_link = block.find("a", class_="ipc-title-link-wrapper", attrs={"href": True})
    movie_link = title_link.get("href") if title_link else None

    title_tag = block.find("h3")
    title = title_tag.get_text(strip=True) if title_tag else None

    rating_tag = block.find("span", class_="ipc-rating-star--rating")
    rating = rating_tag.get_text(strip=True) if rating_tag else None

    vote_tag = block.find("span", class_="ipc-rating-star--voteCount")
    no_of_votes = vote_tag.get_text(strip=True) if vote_tag else None

    metadata_tags = block.find(class_="cli-title-metadata")
    spans = metadata_tags.find_all("span") if metadata_tags else []

    movie = {
        "image": image,
        "movie_link": movie_link,
        "title": title,
        "rating": rating,
        "no_of_votes": no_of_votes
    }

    for index, span in enumerate(spans):
        label = "year_released" if index == 0 else "length" if index == 1 else "movie_age_ratings"
        movie[label] = span.get_text(strip=True)

    list_movies.append(movie)


df_movies = pd.DataFrame(list_movies)

200


In [176]:
df_movies.head()

Unnamed: 0,image,movie_link,title,rating,no_of_votes,year_released,length,movie_age_ratings
0,https://m.media-amazon.com/images/M/MV5BMDAyY2...,/title/tt0111161/?ref_=chttp_t_1,1. The Shawshank Redemption,9.3,(3.1M),1994,2h 22m,R
1,https://m.media-amazon.com/images/M/MV5BNGEwYj...,/title/tt0068646/?ref_=chttp_t_2,2. The Godfather,9.2,(2.1M),1972,2h 55m,R
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,/title/tt0468569/?ref_=chttp_t_3,3. The Dark Knight,9.0,(3M),2008,2h 32m,PG-13
3,https://m.media-amazon.com/images/M/MV5BMDIxMz...,/title/tt0071562/?ref_=chttp_t_4,4. The Godfather Part II,9.0,(1.4M),1974,3h 22m,R
4,https://m.media-amazon.com/images/M/MV5BYjE4Nz...,/title/tt0050083/?ref_=chttp_t_5,5. 12 Angry Men,9.0,(932K),1957,1h 36m,Approved


In [177]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   image              25 non-null     object
 1   movie_link         25 non-null     object
 2   title              25 non-null     object
 3   rating             25 non-null     object
 4   no_of_votes        25 non-null     object
 5   year_released      25 non-null     object
 6   length             25 non-null     object
 7   movie_age_ratings  25 non-null     object
dtypes: object(8)
memory usage: 1.7+ KB


In [178]:
df_movies.to_csv("raw_data.csv")

### Project 4

In [187]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}

urls = [
    'https://groww.in/us-stocks/nke',
    'https://groww.in/us-stocks/ko',
    'https://groww.in/us-stocks/msft',
    'https://groww.in/stocks/m-india-ltd',
    'https://groww.in/us-stocks/axp',
    'https://groww.in/us-stocks/amgn',
    'https://groww.in/us-stocks/aapl',
    'https://groww.in/us-stocks/ba',
    'https://groww.in/us-stocks/csco',
    'https://groww.in/us-stocks/gs',
    'https://groww.in/us-stocks/ibm',
    'https://groww.in/us-stocks/intc',
    'https://groww.in/us-stocks/jpm',
    'https://groww.in/us-stocks/mcd',
    'https://groww.in/us-stocks/crm',
    'https://groww.in/us-stocks/vz',
    'https://groww.in/us-stocks/v',
    'https://groww.in/us-stocks/wmt',
    'https://groww.in/us-stocks/dis'
]

all_data = []

for url in urls:
    print(f"Scraping: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        company = soup.find('h1', class_='usph14Head displaySmall')
        price = soup.find('span', class_='uht141Pri contentPrimary displayBase')
        change = soup.find('div', class_='uht141Day bodyBaseHeavy contentNegative') or \
                 soup.find('div', class_='uht141Day bodyBaseHeavy contentPositive')
        volume_table = soup.find('table', class_='tb10Table borderPrimary width100 usp100NoBorder usp100Table')
        volume_row = volume_table.find_all('tr')[1] if volume_table else None

        company_text = company.text.strip() if company else None
        price_text = price.text.strip() if price else None
        change_text = change.text.strip() if change else None
        volume_text = volume_row.find_all('td')[2].text.strip() if volume_row else None

        row = [company_text, price_text, change_text, volume_text]
        all_data.append(row)

    except Exception as e:
        print(f"Skipping {url} due to error: {e}")

    time.sleep(5)

column_names = ["Company", "Price", "Change", "Volume"]
df = pd.DataFrame(all_data, columns=column_names)

df.head()


Scraping: https://groww.in/us-stocks/nke
Scraping: https://groww.in/us-stocks/ko
Scraping: https://groww.in/us-stocks/msft
Scraping: https://groww.in/stocks/m-india-ltd
Scraping: https://groww.in/us-stocks/axp
Scraping: https://groww.in/us-stocks/amgn
Scraping: https://groww.in/us-stocks/aapl
Scraping: https://groww.in/us-stocks/ba
Scraping: https://groww.in/us-stocks/csco
Scraping: https://groww.in/us-stocks/gs
Scraping: https://groww.in/us-stocks/ibm
Scraping: https://groww.in/us-stocks/intc
Scraping: https://groww.in/us-stocks/jpm
Scraping: https://groww.in/us-stocks/mcd
Scraping: https://groww.in/us-stocks/crm
Scraping: https://groww.in/us-stocks/vz
Scraping: https://groww.in/us-stocks/v
Scraping: https://groww.in/us-stocks/wmt
Scraping: https://groww.in/us-stocks/dis


Unnamed: 0,Company,Price,Change,Volume
0,Nike Inc,$62.37,+0.80(1.30%) 1D,70218.0
1,Coca-Cola Company The,$71.10,-0.06(0.08%) 1D,68339.0
2,Microsoft Corporation,$462.77,-0.20(0.04%) 1D,127079.0
3,,,,
4,American Express Co,$297.30,-0.09(0.03%) 1D,988.0


In [188]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Company  16 non-null     object
 1   Price    16 non-null     object
 2   Change   16 non-null     object
 3   Volume   16 non-null     object
dtypes: object(4)
memory usage: 740.0+ bytes
