In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time

In [2]:
base_link = "https://www.themoviedb.org/tv"

In [3]:
needed_headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
response = requests.get("https://www.themoviedb.org/movie", headers = needed_headers )

In [4]:
response.status_code

200

In [5]:
dwn_content = response.text
len(dwn_content)

195421

In [6]:
test_doc = BeautifulSoup(response.text, 'html.parser')

In [7]:
test_doc.find('title')

<title>Popular Movies — The Movie Database (TMDB)</title>

In [8]:
test_doc.find('img')

<img alt="The Movie Database (TMDB)" height="20" src="/assets/2/v4/logos/v2/blue_short-8e7b30f73a4020692ccca9c88bafe5dcb6f8a62a4c6bc55cd9ba82bb2cd95f6c.svg" width="154"/>

In [9]:
def cols_dict():
    dic = {  
            'title': [],
            'rating': [], 
            'release_date':[], 
            'language': [],
            'current_season': [],
            'current_season_Episodes': [], 
            'runtime': [],
            'director': [],
            'genre': [],
            'cast': [],
            'Keywords': [],
            'type': []
          }
    return dic

In [10]:
def get_page_content(url):
    # In this case , we are going to give request.get function headers to avoid the Status code Error 403

    get_headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
    response_page = requests.get(url, headers = get_headers )
    # we are going to raise exception here if status code gives any value other than 200.
    if not response_page.ok:
        raise Exception ("Failed to request the data. Status Code:- {}".format(response_page.status_code))
    else:
        page_content = response_page.text
        doc_page = BeautifulSoup(page_content, "html.parser")
        return doc_page

In [11]:
def get_genres(doc2):
    tag_genre = doc2.find('span', {"class": "genres"})
    tag_genre_list = tag_genre.find_all('a')

    check_genre =[]
    for tag in tag_genre_list:
        check_genre.append(tag.text)

    return check_genre

In [12]:
def get_cast(doc2_page):
    cast_tags = doc2_page.find_all('li', {'class': 'card'})
    cast_lis = []
    
    for t in cast_tags:
         cast_lis.append(t.p.text)
    
    return cast_lis

In [13]:
def get_Keywords(doc3_page):
    Keywords_tags = doc3_page.find('section', {'class': 'keywords right_column'})
    Keywords_list = Keywords_tags.find_all('a')
    
    check_Keywords = []
    for t in Keywords_list:
         check_Keywords.append(t.text)
    
    return Keywords_list

In [14]:
%%time
url_address = base_link

dic =  cols_dict()
for j in range(1, 501):
    next_url = url_address + '?page={}'.format(j)
    get_headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
    response_page = requests.get(next_url, headers = get_headers )
    # we are going to raise exception here if status code gives any value other than 200.
    if not response_page.ok:
        print('page: ',j, "Failed to request the data. Status Code:- {}".format(response_page.status_code))
    else:
        page_content = response_page.text
        doc = BeautifulSoup(page_content, "html.parser")
        number_of_TVshows = len(doc.find_all('div', {'class': 'card style_1'}))
        
        for i in range(number_of_TVshows): 
            
            try:
                dic['title'].append(doc.find_all('div', {'class': 'card style_1'})[i].h2.text)
            except AttributeError:
                dic['title'].append('nan')

            try:
                dic['rating'].append(doc.find_all('div', {"user_score_chart"})[i]['data-percent'])
            except AttributeError:
                dic['rating'].append('nan')

            try:
                dic['release_date'].append(doc.find_all('div', {'class': 'card style_1'})[i].p.text)
            except AttributeError:
                dic['release_date'].append('nan')

            TVshow_url = base_link[:-3] + doc.find_all('div', {'class': 'card style_1'})[i].h2.a['href']
            TVshow_page = get_page_content(TVshow_url)

            dic['genre'].append(get_genres(TVshow_page))
            dic['cast'].append(get_cast(TVshow_page))
            dic['Keywords'].append(get_Keywords(TVshow_page))

            try:
                dic['runtime'].append(TVshow_page.find('span', {'class': 'runtime'}).text[16:-12])
            except AttributeError:
                dic['runtime'].append('nan')

            try:
                dic['current_season'].append(TVshow_page.find_all('div' , {'class': 'flex'})[1].h2.text)
            except (AttributeError, IndexError):
                dic['current_season'].append('nan')

            try:
                dic['current_season_Episodes'].append(TVshow_page.find_all('div' , {'class': 'flex'})[1].h4.text[7:])
            except (AttributeError, IndexError):
                dic['current_season_Episodes'].append('nan')

            try:
                dic['language'].append(TVshow_page.find('section', {"class": "facts left_column"}).select("p:nth-of-type(4)")[0].text)
            except AttributeError:
                dic['language'].append('nan')

            try:
                dic['director'].append(TVshow_page.find('li', {'class':'profile'}).a.get_text())
            except AttributeError:
                dic['director'].append('nan')
                
            try:
                dic['type'].append(TVshow_page.find('section', {"class": "facts left_column"}).select("p:nth-of-type(3)")[0].text)
            except AttributeError:
                dic['type'].append('nan')
            
            

Wall time: 2h 11min 22s


In [15]:
df = pd.DataFrame({k:pd.Series(v) for k,v in dic.items()})

In [26]:
j,i

(500, 19)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   title                    10000 non-null  object
 1   rating                   10000 non-null  object
 2   release_date             10000 non-null  object
 3   language                 10000 non-null  object
 4   current_season           10000 non-null  object
 5   current_season_Episodes  10000 non-null  object
 6   runtime                  10000 non-null  object
 7   director                 10000 non-null  object
 8   genre                    10000 non-null  object
 9   cast                     10000 non-null  object
 10  Keywords                 10000 non-null  object
 11  type                     10000 non-null  object
dtypes: object(12)
memory usage: 937.6+ KB


In [19]:
df.to_csv('tv_all.csv', index =False)

In [21]:
df_01 = pd.read_csv('tv_all.csv')
df_01.shape

(10000, 12)

In [25]:
df_01.head()

Unnamed: 0,title,rating,release_date,language,current_season,current_season_Episodes,runtime,director,genre,cast,Keywords,type
0,Peacemaker,86.0,"Jan 13, 2022",Original Language English,Season 1,8 Episodes,46m,James Gunn,"['Action & Adventure', 'Comedy', 'Sci-Fi & Fan...","['John Cena', 'Danielle Brooks', 'Freddie Stro...","[<a class=""rounded"" href=""/keyword/9715-superh...",Type Scripted
1,Euphoria,84.0,"Jun 16, 2019",Original Language English,Season 2,6 Episodes,1h,Sam Levinson,['Drama'],"['Zendaya', 'Hunter Schafer', 'Jacob Elordi', ...","[<a class=""rounded"" href=""/keyword/6270-high-s...",Type Scripted
2,The Book of Boba Fett,81.0,"Dec 29, 2021",Original Language English,Season 1,7 Episodes,39m,Jon Favreau,"['Action & Adventure', 'Sci-Fi & Fantasy']","['Temuera Morrison', 'Ming-Na Wen', 'Matt Berr...","[<a class=""rounded"" href=""/keyword/801-bounty-...",Type Scripted
3,Big Brother,95.0,"Jan 04, 2021",Original Language Dutch; Flemish,Season 2,24 Episodes,50m,,['Reality'],"['Peter Van De Veire', 'Geraldine Kemper']",[],Type Reality
4,Big Brother 7/7,58.0,"Jan 11, 2021",Original Language French,Season 2,21 Episodes,46m,,['Reality'],"['Marie-Mai', 'Emmanuel Auger', 'Rita Baga', '...","[<a class=""rounded"" href=""/keyword/6562-celebr...",Type Reality
