<a href="https://colab.research.google.com/github/Fenoemos/data-science-bootcamp-8/blob/main/Web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web scraping: collecting data and cleaning it to build a DataFrame
## Data from IMDb

In [None]:
# install new library on google colab
!pip install gazpacho

Collecting gazpacho
  Downloading gazpacho-1.1.tar.gz (7.9 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: gazpacho
  Building wheel for gazpacho (pyproject.toml) ... [?25l[?25hdone
  Created wheel for gazpacho: filename=gazpacho-1.1-py3-none-any.whl size=7461 sha256=cf143321d63ece180cb55001d2534f3e093db343f3d116d95ea094438a7dabb4
  Stored in directory: /root/.cache/pip/wheels/9b/bf/9f/8c8849499462415fa5cdf0d9edb1103c189bdbece90c51488e
Successfully built gazpacho
Installing collected packages: gazpacho
Successfully installed gazpacho-1.1


In [None]:
# import function (from gazpacho and requests)
from gazpacho import Soup
import requests

In [None]:
# IMDb's "top 100" title search, sorted by user rating (descending), is the example page
url = "https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc"

# Send a GET request to the url. Despite its name, `html` holds the whole
# Response object; the page markup itself is available as html.text.
# A timeout is passed because requests otherwise waits indefinitely when the
# server never answers, which would hang the notebook.
html = requests.get(url, timeout=30)

# Show the HTTP status code (200 means the request succeeded)
html.status_code

200

In [None]:
# Hand the downloaded page source to gazpacho so it can be searched by tag
imdb = Soup(html=html.text)  # imdb is now a Soup object

### Get movie titles

In [None]:
# Use the browser's "Inspect" tool to locate the wanted elements,
# then pull them out with .find: each movie heading is an <h3 class="lister-item-header">.
# mode="all" makes .find always return a list; the default "automatic" mode
# changes its return type with the number of matches, which would break the
# indexing and looping done in the following cells.
titles = imdb.find("h3", {"class": "lister-item-header"}, mode="all")

In [None]:
# Display the first matched element to check whether it still needs cleaning.
titles[0]


<h3 class="lister-item-header">
  <span class="lister-item-index unbold text-primary">1.</span>
  <a href="/title/tt0111161/">The Shawshank Redemption</a>
  <span class="lister-item-year text-muted unbold">(1994)</span>
</h3>

In [None]:
# Clean every matched heading in one pass:
# .strip() removes the HTML markup and keeps only the element's text.
clean_titles = [heading.strip() for heading in titles]

In [None]:
# Show the cleaned strings; each one still carries its rank prefix and "(year)" suffix.
clean_titles

['1. The Shawshank Redemption (1994)',
 '2. The Godfather (1972)',
 '3. The Dark Knight (2008)',
 "4. Schindler's List (1993)",
 '5. The Lord of the Rings: The Return of the King (2003)',
 '6. 12 Angry Men (1957)',
 '7. The Godfather Part II (1974)',
 '8. Pulp Fiction (1994)',
 '9. Fight Club (1999)',
 '10. The Lord of the Rings: The Fellowship of the Ring (2001)',
 '11. Inception (2010)',
 '12. Forrest Gump (1994)',
 '13. The Lord of the Rings: The Two Towers (2002)',
 '14. The Good, the Bad and the Ugly (1966)',
 '15. Spider-Man: Across the Spider-Verse (2023)',
 '16. Interstellar (2014)',
 '17. Goodfellas (1990)',
 '18. The Matrix (1999)',
 "19. One Flew Over the Cuckoo's Nest (1975)",
 '20. Star Wars: Episode V - The Empire Strikes Back (1980)',
 '21. Oppenheimer (2023)',
 '22. Se7en (1995)',
 '23. The Silence of the Lambs (1991)',
 '24. Saving Private Ryan (1998)',
 '25. Star Wars: Episode IV - A New Hope (1977)',
 '26. The Green Mile (1999)',
 '27. Spirited Away (2001)',
 '28. Te

### Get rating

In [None]:
# Ratings are stored in a "div" whose class contains "ratings-imdb-rating".
# mode="all" makes .find always return a list; the default "automatic" mode
# changes its return type with the number of matches, which would break the
# indexing below and the loop in the next cell.
ratings = imdb.find("div", {"class": "ratings-imdb-rating"}, mode="all")

# Print the first rating element to check whether it needs cleaning.
print(ratings[0])
# It still includes the HTML markup.

<div class="inline-block ratings-imdb-rating" name="ir" data-value="9.3">
  <span class="global-sprite rating-star imdb-rating"/>
  <strong>9.3</strong>
</div>


In [None]:
# Convert every rating element to a number in one pass:
# .strip() removes the HTML markup, and float() parses the remaining text.
clean_ratings = [float(element.strip()) for element in ratings]

In [None]:
# Show the ratings after conversion to floats.
clean_ratings

[9.3,
 9.2,
 9.0,
 9.0,
 9.0,
 9.0,
 9.0,
 8.9,
 8.8,
 8.8,
 8.8,
 8.8,
 8.8,
 8.8,
 8.7,
 8.7,
 8.7,
 8.7,
 8.7,
 8.7,
 8.6,
 8.6,
 8.6,
 8.6,
 8.6,
 8.6,
 8.6,
 8.6,
 8.6,
 8.6,
 8.6,
 8.6,
 8.6,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5,
 8.5]

### Get year

In [None]:
## The year is already present, embedded at the end of each entry in clean_titles
# First, slice it out: [-5:-1] keeps the four characters just before the
# closing ")" (index -1, the ")" itself, is excluded).
clean_years = [entry.strip()[-5:-1] for entry in clean_titles]

clean_years

['1994',
 '1972',
 '2008',
 '1993',
 '2003',
 '1957',
 '1974',
 '1994',
 '1999',
 '2001',
 '2010',
 '1994',
 '2002',
 '1966',
 '2023',
 '2014',
 '1990',
 '1999',
 '1975',
 '1980',
 '2023',
 '1995',
 '1991',
 '1998',
 '1977',
 '1999',
 '2001',
 '1991',
 '2002',
 '1997',
 '1946',
 '1954',
 '1962',
 '2000',
 '2006',
 '1979',
 '2019',
 '2006',
 '1985',
 '2014',
 '1960',
 '1994',
 '2012',
 '1995',
 '1994',
 '2002',
 '1998',
 '1942',
 '1988',
 '2011']

In [None]:
# Second, remove the rank prefix and the year suffix from clean_titles
def _strip_rank_and_year(raw_title):
    """Return the bare movie name from a string such as '10. Inception (2010)'.

    The leading rank is removed by splitting on the first ". " rather than by a
    fixed-width slice: ranks from 10 upward are one character wider, so a fixed
    [3:] slice left a leading space on those titles.
    """
    text = raw_title.strip()
    rank, separator, remainder = text.partition(". ")
    # Only drop the prefix when it really is a numeric rank followed by ". "
    if separator and rank.isdigit():
        text = remainder
    # The last 7 characters are the " (YYYY)" suffix: space, "(", four digits, ")"
    return text[:-7]


clean_titles2 = [_strip_rank_and_year(clean_title) for clean_title in clean_titles]

clean_titles2

['The Shawshank Redemption',
 'The Godfather',
 'The Dark Knight',
 "Schindler's List",
 'The Lord of the Rings: The Return of the King',
 '12 Angry Men',
 'The Godfather Part II',
 'Pulp Fiction',
 'Fight Club',
 ' The Lord of the Rings: The Fellowship of the Ring',
 ' Inception',
 ' Forrest Gump',
 ' The Lord of the Rings: The Two Towers',
 ' The Good, the Bad and the Ugly',
 ' Spider-Man: Across the Spider-Verse',
 ' Interstellar',
 ' Goodfellas',
 ' The Matrix',
 " One Flew Over the Cuckoo's Nest",
 ' Star Wars: Episode V - The Empire Strikes Back',
 ' Oppenheimer',
 ' Se7en',
 ' The Silence of the Lambs',
 ' Saving Private Ryan',
 ' Star Wars: Episode IV - A New Hope',
 ' The Green Mile',
 ' Spirited Away',
 ' Terminator 2: Judgment Day',
 ' City of God',
 ' Life Is Beautiful',
 " It's a Wonderful Life",
 ' Seven Samurai',
 ' Harakiri',
 ' Gladiator',
 ' The Departed',
 ' Alien',
 ' Parasite',
 ' The Prestige',
 ' Back to the Future',
 ' Whiplash',
 ' Psycho',
 ' Léon: The Profess

### Create dataframe

In [None]:
# import pandas
import pandas as pd

In [None]:
# Assemble the three cleaned lists into a single table, one column per list
movie_database = pd.DataFrame(
    dict(title=clean_titles2, rating=clean_ratings, years=clean_years)
)

In [None]:
# Preview the top five rows of the assembled table
movie_database.head(5)

Unnamed: 0,title,rating,years
0,The Shawshank Redemption,9.3,1994
1,The Godfather,9.2,1972
2,The Dark Knight,9.0,2008
3,Schindler's List,9.0,1993
4,The Lord of the Rings: The Return of the King,9.0,2003
