In [1]:
import requests
from bs4 import BeautifulSoup as bs
import csv
import re
import pandas as pd

## 1) Wikipedia Main page

In [2]:
def wiki_scrape(url):
    """This function takes in the main page URL of Wikipedia and scrapes the Header tags, stores them in a csv file"""
    #Requesting html
    source = requests.get(url).text
    #Pasing the html
    soup = bs(source, 'lxml')
    #creating CSV
    csv_file = open('wiki_headers.csv', 'w')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Sr', 'Header_Tags'])
    #Extracting header tags
    n=1
    for i in soup.find_all('h2', class_ = 'mp-h2'):
        csv_writer.writerow([n, i.text])
        n=n+1
    csv_file.close()
    df=pd.read_csv('wiki_headers.csv',encoding = 'unicode_escape')
    print(df)

In [3]:
wiki_scrape('https://en.wikipedia.org/wiki/Main_Page')

   Sr                    Header_Tags
0   1  From today's featured article
1   2               Did you know ...
2   3                    In the news
3   4                    On this day
4   5       Today's featured picture
5   6       Other areas of Wikipedia
6   7    Wikipedia's sister projects
7   8            Wikipedia languages


## 2) IMDB top 100

In [4]:
def imdb_all(url):
    """This function will extract top 100 movies in descending order as listed on IMDB, you must pass page 1 URL"""
    source1 = requests.get(url).text
    soup = bs(source1, 'lxml')
    csv_file = open('imdb_top100.csv', 'w')
    writer = csv.writer(csv_file)
    writer.writerow(['Name', 'Year', 'IMDB Ratings'])
    for i in soup.find_all('div', class_ = 'lister-item-content'):
        name = i.h3.a.text
        year = i.find('span', class_='lister-item-year text-muted unbold').text[1:5]
        ratings = i.find('div', class_='inline-block ratings-imdb-rating').strong.text
        writer.writerow([name, year, ratings])
    url2 = url+'&start=51&ref_=adv_nxt'
    source2 = requests.get(url2).text
    soup = bs(source2, 'lxml')
    for i in soup.find_all('div', class_ = 'lister-item-content'):
        name = i.h3.a.text
        year = i.find('span', class_='lister-item-year text-muted unbold').text[1:5]
        ratings = i.find('div', class_='inline-block ratings-imdb-rating').strong.text
        writer.writerow([name, year, ratings])
    csv_file.close()
    df = pd.read_csv('imdb_top100.csv',encoding = 'unicode_escape')
    print(df)

In [5]:
imdb_all('https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc')

                                 Name  Year  IMDB Ratings
0            The Shawshank Redemption  1994           9.3
1                       The Godfather  1972           9.2
2                     The Dark Knight  2008           9.0
3              The Godfather: Part II  1974           9.0
4                        12 Angry Men  1957           9.0
..                                ...   ...           ...
95                            Vertigo  1958           8.3
96                Singin' in the Rain  1952           8.3
97                Ladri di biciclette  1948           8.3
98                       Citizen Kane  1941           8.3
99  M - Eine Stadt sucht einen Mörder  1931           8.3

[100 rows x 3 columns]


## IMDB Indian movies top 100

In [6]:
def imdb_indian(url):
    source = requests.get(url).text
    soup = bs(source, 'lxml')
    csv_file = open("top_100_indian.csv", 'w')
    writer = csv.writer(csv_file)
    writer.writerow(['Name', 'Year', 'Ratings', 'Votes'])
    n=0
    for i,j,k in zip(soup.find_all('td', class_ = 'titleColumn'),soup.find_all('td', class_='ratingColumn imdbRating'),soup.find_all('td', class_='ratingColumn imdbRating')):
        try:
            name=i.text.strip()[3:].strip()[:-7]
        except:
            name = None
        try:
            year=i.text[-6:-2]
        except:
            year = None
        try:
            rating = j.text.strip()
        except:
            rating = None
        try:
            votes = k.strong['title']
            votes = re.findall('\d+', votes[3:])
            votes = votes[0]+votes[1]
        except:
            votes = None
        writer.writerow([name, year, rating, votes])
        if n==99:
            break
        n=n+1
    csv_file.close()
    df = pd.read_csv('top_100_indian.csv', encoding = 'unicode_escape')
    print(df)

In [7]:
imdb_indian("https://www.imdb.com/india/top-rated-indian-movies/")

                 Name  Year  Ratings  Votes
0     Pather Panchali  1955      8.5  23359
1            Gol Maal  1979      8.5  17923
2            Ratsasan  2018      8.5  23671
3             Nayakan  1987      8.5  16142
4          Anbe Sivam  2003      8.5  16626
..                ...   ...      ...    ...
95         Bommarillu  2006      8.0   8542
96              Lucia  2013      8.0  11470
97            Maqbool  2003      8.0   9499
98             Bombay  1995      8.0  11278
99  .\r\n      Omkara  2006      8.0  19138

[100 rows x 4 columns]


## 4) Book Page scraping

In [8]:
#This function doesn't scrape 'https://bookpage.com/', but 'https://bookpage.com/reviews'
def bookpage(url):
    """This function will scrape 5 books from bookpage.com/reviews, and store the book name, author, genre and summary
    in a csv file"""
    source = requests.get(url).text
    soup = bs(source, 'lxml')
    csv_file = open("books.csv", 'w', encoding = 'utf-8')
    writer = csv.writer(csv_file)
    writer.writerow(['Book Name', 'Author', 'Genre', 'Summary'])
    n=1
    for i in soup.find_all('div', class_='flex-article-content'):
        try:
            book = i.a.text
            print(book)
        except:
            book = None
        try:
            author = i.find('p', class_='sans bold').text.strip()
            print(author)
        except:
            author = None
        try:
            summary = [a.text.strip() for a in i.find_all('p') if len(a.text.strip())>70]
            summary = summary[0]
            print(summary)
        except:
            summary = None
        try:
            genre = i.find('p', class_ = 'genre-links hidden-phone').text.replace('\n', '')
            print(genre)
        except:
            genre = None
        writer.writerow([book, author, genre, summary])  
        if n==5:
            break
        n=n+1
    csv_file.close()
    df = pd.read_csv('books.csv', encoding = 'unicode-escape')
    print(df)

In [9]:
bookpage('https://bookpage.com/reviews')

The Children's Train
Viola Ardone, Clarissa Botsford
The displacement of children is a vexing problem in international and national politics. Italian author Viola Ardone’s novel explores issues surrounding children who are separated from their parents, but in this case, the families willingly send their youngsters away to live in the care of...
Fiction / Historical Fiction
The Lion of Mars
Jennifer L. Holm
In the year 2091, millions of miles away from Earth, 11-year-old Bell and a handful of other kids are growing up on Mars. Sent there as orphaned infants, they have never known another life, another home or another family. Along with several adults, they make up the American settlement, where...
Children's / Middle Grade
What Could Be Saved
Liese O'Halloran Schwarz
At the heart of family novels there’s often a secret, and how that secret plays out in family dynamics forms the heart of the story. Liese O’Halloran Schwarz’s third novel is no exception, with several secrets reverberating

## 5) ICC men's ODI

In [10]:
def icc_mens(url):
    #Part 1: Team Rankings
    source=requests.get(url).text
    soup=bs(source, 'lxml')
    csv_file = open('team_odi.csv', 'w')
    writer = csv.writer(csv_file)
    writer.writerow(['Rank','Team', 'Matches', 'Points', 'Rating'])
    tbody=soup.find('tbody')
    a=0
    m=0
    n=1
    k=0
    rank=1
    for i in tbody.find_all('span', class_='u-hide-phablet'):
        team = i.text
        if a == 0:
            match = tbody.find('td', class_='rankings-block__banner--matches').text
            points = tbody.find('td', class_='rankings-block__banner--points').text
            rating = tbody.find('td', class_='rankings-block__banner--rating u-text-right').text.replace('\n', '').strip()
            writer.writerow([rank, team, match, points, rating])
            rank=rank+1
            a=a+1
            continue
        match = tbody.find_all('td',class_="table-body__cell u-center-text")[m].text
        points = tbody.find_all('td',class_="table-body__cell u-center-text")[n].text
        m=m+2
        n=n+2
        rating = tbody.find_all('td', class_="table-body__cell u-text-right rating")[k].text
        writer.writerow([rank,team, match, points, rating])
        k=k+1
        rank=rank+1
        if k == 9:
            break
    csv_file.close()
    df = pd.read_csv('team_odi.csv', encoding = 'unicode-escape')
    print(df)
    #Part 2: ODI batsmen ranking
    source = requests.get('https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting').text
    soup = bs(source, 'lxml')
    csv_file = open('batsmen.csv', 'w')
    writer = csv.writer(csv_file)
    writer.writerow(['Rank','Player','Team', 'Rating'])
    table = soup.find('table', class_ = 'table rankings-table')
    player = table.find('div', class_="rankings-block__banner--name-large").text
    team = table.find('div' ,class_="rankings-block__banner--nationality").text.strip()
    rating = table.find('div', class_="rankings-block__banner--rating").text
    rank = 1
    writer.writerow([rank, player, team, rating])
    for i, j, k  in zip(table.find_all('td', class_="table-body__cell rankings-table__name name"),
                    table.find_all('span', class_="table-body__logo-text"),
                    table.find_all('td' ,class_="table-body__cell rating")):
        player = i.text.strip()
        team = j.text
        rating = k.text
        rank = rank+1
        writer.writerow([rank, player, team, rating])
        if rank == 10:
            break
    csv_file.close()
    df=pd.read_csv('batsmen.csv', encoding = 'unicode-escape')
    print(df)
    #Part 3: ODI bowler ranking
    source = requests.get('https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling').text
    soup = bs(source, 'lxml')
    table = soup.find('table', class_ = 'table rankings-table')
    csv_file = open('bowler.csv', 'w')
    writer = csv.writer(csv_file)
    writer.writerow(['Rank','Player','Team', 'Rating'])
    player = table.find('div', class_="rankings-block__banner--name-large").text
    team = table.find('div' ,class_="rankings-block__banner--nationality").text.strip()
    rating = table.find('div', class_="rankings-block__banner--rating").text
    rank = 1
    writer.writerow([rank, player, team, rating])
    for i, j, k  in zip(table.find_all('td', class_="table-body__cell rankings-table__name name"),
                    table.find_all('span', class_="table-body__logo-text"),
                    table.find_all('td' ,class_="table-body__cell rating")):
        player = i.text.strip()
        team = j.text
        rating = k.text
        rank = rank+1
        writer.writerow([rank, player, team, rating])
        if rank == 10:
            break
    csv_file.close()
    df=pd.read_csv('bowler.csv', encoding = 'unicode-escape')
    print(df)

In [11]:
icc_mens('https://www.icc-cricket.com/rankings/mens/team-rankings/odi')

   Rank          Team  Matches Points  Rating
0     1       England       44  5,405     123
1     2         India       52  6,102     117
2     3   New Zealand       32  3,716     116
3     4     Australia       39  4,344     111
4     5  South Africa       31  3,345     108
5     6      Pakistan       35  3,490     100
6     7    Bangladesh       34  2,989      88
7     8     Sri Lanka       39  3,297      85
8     9   West Indies       43  3,285      76
9    10   Afghanistan       28  1,549      55
   Rank               Player Team  Rating
0     1          Virat Kohli  IND     870
1     2         Rohit Sharma  IND     842
2     3           Babar Azam  PAK     837
3     4          Ross Taylor   NZ     818
4     5          Aaron Finch  AUS     791
5     6  Francois du Plessis   SA     790
6     7         David Warner  AUS     773
7     8      Kane Williamson   NZ     765
8     9      Quinton de Kock   SA     755
9    10       Jonny Bairstow  ENG     754
   Rank            Player Team  

## 6) ICC Women's ODI

In [12]:
def icc_women(url):
    source = requests.get(url).text
    soup = bs(source, 'lxml')
    tbody = soup.find('tbody')
    csv_file = open('team_odi.csv', 'w')
    writer = csv.writer(csv_file)
    writer.writerow(['Rank','Team', 'Matches', 'Points', 'Rating'])
    a=0
    m=0
    n=1
    k=0
    rank=1
    for i in tbody.find_all('span', class_='u-hide-phablet'):
        team = i.text
        if a == 0:
            match = tbody.find('td', class_='rankings-block__banner--matches').text
            points = tbody.find('td', class_='rankings-block__banner--points').text
            rating = tbody.find('td', class_='rankings-block__banner--rating u-text-right').text.replace('\n', '').strip()
            writer.writerow([rank, team, match, points, rating])
            rank=rank+1
            a=a+1
            continue
        match = tbody.find_all('td',class_="table-body__cell u-center-text")[m].text
        points = tbody.find_all('td',class_="table-body__cell u-center-text")[n].text
        m=m+2
        n=n+2
        rating = tbody.find_all('td', class_="table-body__cell u-text-right rating")[k].text
        writer.writerow([rank,team, match, points, rating])
        k=k+1
        rank=rank+1
        if k == 9:
            break
    csv_file.close()
    df = pd.read_csv('team_odi.csv', encoding = 'unicode-escape')
    print(df)
    #Part 2
    source = requests.get('https://www.icc-cricket.com/rankings/womens/player-rankings/odi/batting').text
    soup = bs(source, 'lxml')
    csv_file = open('women_batting.csv', 'w')
    writer = csv.writer(csv_file)
    writer.writerow(['Rank','Player','Team', 'Rating'])
    table = soup.find('table', class_ = 'table rankings-table')
    player = table.find('div', class_="rankings-block__banner--name-large").text
    team = table.find('div' ,class_="rankings-block__banner--nationality").text.strip()
    rating = table.find('div', class_="rankings-block__banner--rating").text
    rank = 1
    writer.writerow([rank, player, team, rating])
    for i, j, k  in zip(table.find_all('td', class_="table-body__cell rankings-table__name name"),
                    table.find_all('span', class_="table-body__logo-text"),
                    table.find_all('td' ,class_="table-body__cell rating")):
        player = i.text.strip()
        team = j.text
        rating = k.text
        rank = rank+1
        writer.writerow([rank, player, team, rating])
        if rank == 10:
            break
    csv_file.close()
    df=pd.read_csv('women_batting.csv', encoding = 'unicode-escape')
    print(df)
    #Part 3
    source = requests.get('https://www.icc-cricket.com/rankings/womens/player-rankings/odi/all-rounder').text
    soup = bs(source, 'lxml')
    table = soup.find('table', class_ = 'table rankings-table')
    csv_file = open('womens_allrounder.csv', 'w')
    writer = csv.writer(csv_file)
    writer.writerow(['Rank','Player','Team', 'Rating'])
    player = table.find('div', class_="rankings-block__banner--name-large").text
    team = table.find('div' ,class_="rankings-block__banner--nationality").text.strip()
    rating = table.find('div', class_="rankings-block__banner--rating").text
    rank = 1
    writer.writerow([rank, player, team, rating])
    for i, j, k  in zip(table.find_all('td', class_="table-body__cell rankings-table__name name"),
                    table.find_all('span', class_="table-body__logo-text"),
                    table.find_all('td' ,class_="table-body__cell rating")):
        player = i.text.strip()
        team = j.text
        rating = k.text
        rank = rank+1
        writer.writerow([rank, player, team, rating])
        if rank == 10:
            break
    csv_file.close()
    df = pd.read_csv('womens_allrounder.csv', encoding = 'unicode-escape')
    print(df)

In [13]:
icc_women('https://www.icc-cricket.com/rankings/womens/team-rankings/odi')

   Rank          Team  Matches Points  Rating
0     1     Australia       15  2,436     162
1     2         India       15  1,812     121
2     3       England       14  1,670     119
3     4  South Africa       16  1,713     107
4     5   New Zealand       15  1,384      92
5     6   West Indies       12  1,025      85
6     7      Pakistan       12    927      77
7     8    Bangladesh        5    306      61
8     9     Sri Lanka       11    519      47
9    10       Ireland        2     25      13
   Rank             Player Team  Rating
0     1        Meg Lanning  AUS     749
1     2    Stafanie Taylor   WI     746
2     3       Alyssa Healy  AUS     741
3     4    Smriti Mandhana  IND     732
4     5  Amy Satterthwaite   NZ     723
5     6     Tammy Beaumont  ENG     716
6     7       Ellyse Perry  AUS     691
7     8        Lizelle Lee   SA     690
8     9    Laura Wolvaardt   SA     689
9    10        Mithali Raj  IND     687
   Rank            Player Team  Rating
0     1      El

## 7) Mobile detail scraping - amazon.in

In [14]:
def amazon(url):
    csv_file = open('mobiles1.csv', 'w', encoding = 'utf-8')
    writer = csv.writer(csv_file)
    writer.writerow(["Phone", "Rating/5","Votes", "Price", 'Img_URL'])
    source = requests.get(url).text
    soup = bs(source, 'lxml')
    for body in soup.find_all('span', class_ = "celwidget slot=MAIN template=SEARCH_RESULTS widgetId=search-results"):
        name = body.find('span', class_="a-size-medium a-color-base a-text-normal").text
        rating = body.find('div', class_="a-row a-size-small").text.split()[0] + '/5'
        votes = body.find('div', class_="a-row a-size-small").text.split()[5]
        price = body.find('span', class_="a-offscreen").text
        img = body.find('img', class_="s-image")['src']
        writer.writerow([name, rating, votes, price, img])
    csv_file.close()
    df = pd.read_csv('mobiles1.csv', encoding = 'unicode-escape')
    print(df)

## 8) Weather scraping

In [15]:
def weather(url):
    source = requests.get(url).text
    soup = bs(source, 'lxml')
    csv_file = open('weather.csv', 'w')
    writer = csv.writer(csv_file)
    writer.writerow(['Period', 'Temperature', 'Short Description', 'Summary'])
    for i in soup.find_all('div', class_="row row-odd row-forecast"):
        period = i.b.text
        desc = i.find('div', class_ = 'col-sm-10 forecast-text').text
        short_desc = desc.split(',')[0]
        temp = re.findall('\d+', desc)[0]
        writer.writerow([period, temp, short_desc, desc])
    csv_file.close()
    df = pd.read_csv('weather.csv')
    print(df)

In [16]:
weather('https://forecast.weather.gov/MapClick.php?lat=37.777120000000025&lon=-122.41963999999996')

         Period  Temperature                            Short Description  \
0         Today            9  Patchy fog between 9am and 10am.  Otherwise   
1      Thursday           10           Patchy fog before 10am.  Otherwise   
2        Friday            8            Patchy fog before 8am.  Otherwise   
3      Saturday            8            Patchy fog before 8am.  Otherwise   
4        Sunday           65                                 Mostly sunny   
5  M.L.King Day           68                                        Sunny   
6       Tuesday           65                                        Sunny   

                                             Summary  
0  Patchy fog between 9am and 10am.  Otherwise, p...  
1  Patchy fog before 10am.  Otherwise, mostly sun...  
2  Patchy fog before 8am.  Otherwise, partly sunn...  
3  Patchy fog before 8am.  Otherwise, mostly sunn...  
4                 Mostly sunny, with a high near 65.  
5                        Sunny, with a high near 68. 