## Let's fetch and format the data

In [163]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [164]:
# Rotten Tomatoes celebrity pages that get scraped for filmography data.
# NOTE(review): the first slug does not match the variable name -- presumably
# an alias page for the same actor; verify before changing it.
jonny_url = "https://www.rottentomatoes.com/celebrity/oprah_noodlemantra"
amber_url = "https://www.rottentomatoes.com/celebrity/amber_heard"

# Root of Box Office Mojo; search and title paths are appended to it.
mojo_base_url = "https://www.boxofficemojo.com/"

In [165]:
def _empty_performance():
    """Return a fresh fallback result used when no grosses can be scraped."""
    return {'Domestic': '$0', 'International': '$0', 'Worldwide': '$0'}


def _fetch_mojo_soup(url, params, delay, timeout):
    """Download *url* and parse it; return None when the request fails."""
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        return None
    # Pause after every successful request so the site is not hammered.
    time.sleep(delay)
    return BeautifulSoup(response.content, 'html.parser')


def fetch_box_office_mojo(title, *, delay=20, timeout=30):
    """
    Look up a title on Box Office Mojo and scrape its performance summary.

    The function searches for ``title``, follows the link of the first search
    result and reads the "performance summary" table of that page.

    Param title:
        Title to search for. It is sent as the ``q`` query parameter, so
        special characters are URL-encoded by ``requests``.
    Param delay:
        Seconds to sleep after each successful HTTP request (default 20).
    Param timeout:
        Seconds to wait for each HTTP response before giving up (default 30).

    Result performance:
        dict mapping each label found in the summary table to its money
        string; a summary without a money value maps to '$0'. When a request
        fails, nothing is found, or the page lacks the expected markup, the
        fallback ``{'Domestic': '$0', 'International': '$0',
        'Worldwide': '$0'}`` is returned instead.
    """
    # Search for the title; the query string is built by requests.
    search_soup = _fetch_mojo_soup(
        urljoin(mojo_base_url, 'search/'), {'q': title}, delay, timeout
    )
    if search_soup is None:
        return _empty_performance()

    all_movies = search_soup.find_all("div", {"class": "a-fixed-left-grid-inner"})
    if not all_movies:
        return _empty_performance()

    # Only the first search hit is considered.
    first_option = all_movies[0].find("div", {"class": "a-fixed-left-grid-col"})
    link = first_option.find("a") if first_option is not None else None
    link_to_movie = link.get("href") if link is not None else None
    if not link_to_movie:
        return _empty_performance()

    # urljoin copes with the leading slash of the href (and absolute hrefs)
    # without producing a doubled slash.
    movie_soup = _fetch_mojo_soup(
        urljoin(mojo_base_url, link_to_movie), None, delay, timeout
    )
    if movie_soup is None:
        return _empty_performance()

    summary_table = movie_soup.find("div", {"class": "mojo-performance-summary-table"})
    if summary_table is None:
        return _empty_performance()

    performance = {}
    for summary in summary_table.find_all("div", {"class": "a-section"}):
        label = summary.find("span", {"class": "a-size-small"})
        if label is None:
            # Section without a label cannot be keyed; skip it.
            continue
        # Strip whitespace and drop any parenthesised suffix of the label.
        performance_type = (
            label.get_text().replace('\n', '').replace(' ', '').split('(')[0]
        )
        money = summary.find("span", {"class": "money"})
        performance[performance_type] = money.get_text() if money is not None else '$0'
    return performance

In [166]:
def _rows_of_table(tables, index):
    """Return the ``<tr>`` rows inside the ``<tbody>`` of ``tables[index]``, or []."""
    if index >= len(tables):
        return []
    body = tables[index].find('tbody')
    return body.find_all('tr') if body is not None else []


def get_and_transform(url):
    """
    This function sends a get request to rotten tomatoes server
    to get the page containing data about its actors.

    The first table of the page is read as movies and the second as TV
    shows. Rows whose tomatometer is 0 are dropped, and every remaining
    title is looked up with ``fetch_box_office_mojo``; the keys of the
    returned dicts become extra columns.

    Param url:
        Url to the page of the actor

    Result df:
        pandas dataframe containing data about movies related to the actor,
        with columns audiencescore, boxoffice, title, tomatometer, year,
        type, plus one column per key returned by ``fetch_box_office_mojo``.

    Raises:
        requests.RequestException if the page cannot be downloaded or the
        server answers with an HTTP error status.
    """
    # Collapse accidental double slashes, then restore the scheme separator.
    page_url = url.replace('//', '/').replace(':/', '://')
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    # Be polite to the server before any follow-up requests.
    time.sleep(20)
    soup = BeautifulSoup(response.content, 'html.parser')
    tables = soup.find_all('table')

    # getting movies (first table)
    data1 = [
        [row['data-audiencescore'], row['data-boxoffice'], row['data-title'],
         row['data-tomatometer'], row['data-year'], 'movie']
        for row in _rows_of_table(tables, 0)
    ]

    # getting tv shows (second table); they carry no box office value
    data2 = [
        [row['data-audiencescore'], '', row['data-title'],
         row['data-tomatometer'], row['data-appearance-year'], 'tv']
        for row in _rows_of_table(tables, 1)
    ]

    # combining the two datasets and creating a pandas dataframe
    columns = ['audiencescore', 'boxoffice', 'title', 'tomatometer', 'year', 'type']
    df = pd.DataFrame(data1 + data2, columns=columns)

    # some data cleaning: keep only the first year of a bracketed range
    df['year'] = df['year'].apply(
        lambda x: str(x).replace('[', '').replace(']', '').split('-')[0]
    )
    # Drop unrated rows; copy so the result is not a view of the unfiltered
    # frame when columns are added below.
    rated = (df['tomatometer'] != 0) & (df['tomatometer'] != '0')
    df = df[rated].copy()

    # get box office mojo data and expand the dicts into columns (done once)
    performance = df['title'].apply(fetch_box_office_mojo)
    expanded_df = pd.DataFrame(performance.tolist(), index=df.index)
    return pd.concat([df, expanded_df], axis=1)

In [167]:
# Scrape the page at jonny_url (this performs network requests) and save the
# resulting dataframe to jonny.csv without the index column.
jonny_df = get_and_transform(jonny_url)
jonny_df.to_csv('jonny.csv', index=False)

In [168]:
# Scrape the page at amber_url (this performs network requests) and save the
# resulting dataframe to amber.csv without the index column.
amber_df = get_and_transform(amber_url)
amber_df.to_csv('amber.csv', index=False)