In [1]:
import lxml
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from requests import get

In [5]:
url1 = "https://www.imdb.com/search/title/?title_type=feature&release_date=1996-03-29,2018-09-24&sort=num_votes,desc"

In [13]:
class IMDB(object):
	"""docstring for IMDB"""
	def __init__(self, url):
		super(IMDB, self).__init__()
		page = get(url)

		self.soup = BeautifulSoup(page.content, 'lxml')

	def articleTitle(self):
		return self.soup.find("h1", class_="header").text.replace("\n","")

	def bodyContent(self):
		content = self.soup.find(id="main")
		return content.find_all("div", class_="lister-item mode-advanced")

	def movieData(self):
		movieFrame = self.bodyContent()
		movieTitle = []
		movieDate = []
		movieRunTime = []
		movieGenre = []
		movieRating = []
		movieScore = []
		movieDescription = []
		movieDirector = []
		movieStars = []
		movieVotes = []
		movieGross = []
		for movie in movieFrame:
			movieFirstLine = movie.find("h3", class_="lister-item-header")
			movieTitle.append(movieFirstLine.find("a").text)
			movieDate.append(re.sub(r"[()]","", movieFirstLine.find_all("span")[-1].text))
			try:
				movieRunTime.append(movie.find("span", class_="runtime").text[:-4])
			except:
				movieRunTime.append(np.nan)
			movieGenre.append(movie.find("span", class_="genre").text.rstrip().replace("\n","").split(","))
			try:
				movieRating.append(movie.find("strong").text)
			except:
				movieRating.append(np.nan)
			try:
				movieScore.append(movie.find("span", class_="metascore favorable").text.rstrip())
			except:
				movieScore.append(np.nan)
			movieDescription.append(movie.find_all("p", class_="text-muted")[-1].text.lstrip())
			movieCast = movie.find("p", class_="")

			try:
				casts = movieCast.text.replace("\n","").split('|')
				casts = [x.strip() for x in casts]
				casts = [casts[i].replace(j, "") for i,j in enumerate(["Director:", "Stars:"])]
				movieDirector.append(casts[0])
				movieStars.append([x.strip() for x in casts[1].split(",")])
			except:
				casts = movieCast.text.replace("\n","").strip()
				movieDirector.append(np.nan)
				movieStars.append([x.strip() for x in casts.split(",")])

			movieNumbers = movie.find_all("span", attrs={"name": "nv"})

			if len(movieNumbers) == 2:
				movieVotes.append(movieNumbers[0].text)
				movieGross.append(movieNumbers[1].text)
			elif len(movieNumbers) == 1:
				movieVotes.append(movieNumbers[0].text)
				movieGross.append(np.nan)
			else:
				movieVotes.append(np.nan)
				movieGross.append(np.nan)

		movieData = [movieTitle, movieDate, movieRunTime, movieGenre, movieRating, movieScore, movieDescription,
							movieDirector, movieStars, movieVotes, movieGross]
		return movieData

In [14]:
if __name__ == '__main__':
	site1 = IMDB(url1)
	print("Subject: ", site1.articleTitle())
	data = site1.movieData()
	for i in range(len(data)):
		print(data[i][:]) #Print the data

Subject:  Feature Film,Released between 1996-03-29 and 2018-09-24(Sorted by Number of Votes Descending) 
['The Dark Knight', 'Inception', 'Fight Club', 'The Matrix', 'The Lord of the Rings: The Fellowship of the Ring', 'The Lord of the Rings: The Return of the King', 'The Dark Knight Rises', 'The Lord of the Rings: The Two Towers', 'Interstellar', 'Django Unchained', 'Gladiator', 'Batman Begins', 'The Avengers', 'Inglourious Basterds', 'Saving Private Ryan', 'The Departed', 'The Prestige', 'The Wolf of Wall Street', 'The Green Mile', 'Avatar', 'Memento', 'Shutter Island', 'American Beauty', 'Guardians of the Galaxy', 'Titanic', 'American History X', 'V for Vendetta', 'Pirates of the Caribbean: The Curse of the Black Pearl', 'Kill Bill: Vol. 1', 'WALL·E', 'Finding Nemo', 'Iron Man', 'Up', 'The Truman Show', 'The Sixth Sense', 'Eternal Sunshine of the Spotless Mind', 'Deadpool', 'Mad Max: Fury Road', 'Star Wars: Episode VII - The Force Awakens', 'Good Will Hunting', 'The Hunger Games', '

In [8]:
data.head()

AttributeError: 'list' object has no attribute 'head'

In [15]:
data1 = pd.DataFrame(data)

In [16]:
data1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,The Dark Knight,Inception,Fight Club,The Matrix,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings: The Return of the King,The Dark Knight Rises,The Lord of the Rings: The Two Towers,Interstellar,Django Unchained,...,The Hunger Games,Gone Girl,No Country for Old Men,A Beautiful Mind,Catch Me If You Can,"Monsters, Inc.",Avengers: Infinity War,Slumdog Millionaire,Snatch,Gravity
1,2008,2010,1999,1999,2001,2003,2012,2002,2014,2012,...,2012,2014,2007,2001,2002,2001,2018,2008,2000,2013
2,152,148,139,136,178,201,164,179,169,165,...,142,149,122,135,141,92,149,120,104,91
3,"[Action, Crime, Drama]","[Action, Adventure, Sci-Fi]",[Drama],"[Action, Sci-Fi]","[Action, Adventure, Drama]","[Adventure, Drama, Fantasy]","[Action, Adventure]","[Adventure, Drama, Fantasy]","[Adventure, Drama, Sci-Fi]","[Drama, Western]",...,"[Action, Adventure, Sci-Fi]","[Drama, Mystery, Thriller]","[Crime, Drama, Thriller]","[Biography, Drama]","[Biography, Crime, Drama]","[Animation, Adventure, Comedy]","[Action, Adventure, Sci-Fi]","[Drama, Romance]","[Comedy, Crime]","[Drama, Sci-Fi, Thriller]"
4,9.0,8.8,8.8,8.7,8.8,8.9,8.4,8.7,8.6,8.4,...,7.2,8.1,8.1,8.2,8.1,8.0,8.4,8.0,8.3,7.7


In [17]:
data1 = data1.transpose()
data1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,The Dark Knight,2008,152,"[Action, Crime, Drama]",9.0,84,When the menace known as the Joker wreaks havo...,Christopher Nolan,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",2224414,$534.86M
1,Inception,2010,148,"[Action, Adventure, Sci-Fi]",8.8,74,A thief who steals corporate secrets through t...,Christopher Nolan,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...",1981548,$292.58M
2,Fight Club,1999,139,[Drama],8.8,66,An insomniac office worker and a devil-may-car...,David Fincher,"[Brad Pitt, Edward Norton, Meat Loaf, Zach Gre...",1794361,$37.03M
3,The Matrix,1999,136,"[Action, Sci-Fi]",8.7,73,A computer hacker learns from mysterious rebel...,"Directors:Lana Wachowski, Lilly Wachowski","[Keanu Reeves, Laurence Fishburne, Carrie-Anne...",1619665,$171.48M
4,The Lord of the Rings: The Fellowship of the Ring,2001,178,"[Action, Adventure, Drama]",8.8,92,A meek Hobbit from the Shire and eight compani...,Peter Jackson,"[Elijah Wood, Ian McKellen, Orlando Bloom, Sea...",1609082,$315.54M


In [18]:
data1.columns = ['Title', 'Year', 'Runtime', 'Genre', 'Rating', 'Score', 'Description', 'Director', 'Actors', 'Votes', 'Gross']
data1.head()

Unnamed: 0,Title,Year,Runtime,Genre,Rating,Score,Description,Director,Actors,Votes,Gross
0,The Dark Knight,2008,152,"[Action, Crime, Drama]",9.0,84,When the menace known as the Joker wreaks havo...,Christopher Nolan,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",2224414,$534.86M
1,Inception,2010,148,"[Action, Adventure, Sci-Fi]",8.8,74,A thief who steals corporate secrets through t...,Christopher Nolan,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...",1981548,$292.58M
2,Fight Club,1999,139,[Drama],8.8,66,An insomniac office worker and a devil-may-car...,David Fincher,"[Brad Pitt, Edward Norton, Meat Loaf, Zach Gre...",1794361,$37.03M
3,The Matrix,1999,136,"[Action, Sci-Fi]",8.7,73,A computer hacker learns from mysterious rebel...,"Directors:Lana Wachowski, Lilly Wachowski","[Keanu Reeves, Laurence Fishburne, Carrie-Anne...",1619665,$171.48M
4,The Lord of the Rings: The Fellowship of the Ring,2001,178,"[Action, Adventure, Drama]",8.8,92,A meek Hobbit from the Shire and eight compani...,Peter Jackson,"[Elijah Wood, Ian McKellen, Orlando Bloom, Sea...",1609082,$315.54M
