# **Setting Requirements**

## Importing libraries

In [None]:
# import required libraries
from bs4 import BeautifulSoup as bs
import requests
import random
import pandas as pd
import numpy as np
from io import StringIO
import string
import csv
from google.colab import files
import cvxpy as cp
import matplotlib.pyplot as plt

## Simulate devices for scraping

In [None]:
# simulate devices for scraping
user_agents_list = [
    'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
]

## Define prerequisite data

In [None]:
# define categories and their types with number of actors in each as linked in url
categories = {
    'genres': {
        'adventure': 1601,
        'action': 1301,
        'comedy': 2801,
        'drama': 3701,
        'thriller-or-suspense': 1401,
        'horror': 1301,
        'romantic-comedy': 701,
        'musical': 201
    },
    'creative-types': {
        'contemporary-fiction': 6401,
        'science-fiction': 1201,
        'kids-fiction': 901,
        'historical-fiction': 1500,
        'fantasy': 901,
        'super-hero': 301,
        'dramatization': 1201,
        'factual': 101
    },
    'production-methods': {
        'live-action': 8701,
        'animation-and-live-action': 501,
        'digital-animation': 601,
        'hand-animation': 101,
        'stop-motion-animation': 1,
        'rotoscoping': 1,
        'multiple-production-methods': 1
    },
    'sources': {
        'original-screenplay': 6901,
        'based-on-fictional-book-or-short-story': 2101,
        'based-on-comic-or-graphic-novel': 501,
        'remake': 501,
        'based-on-tv': 401,
        'based-on-real-life-events': 1101,
        'based-on-factual-book-or-article': 401,
        'spin-off': 101
    },
    'mpaa-ratings': {
        'pg-13-(us)': 2801,
        'r-(us)': 3901,
        'pg-(us)': 1901,
        'g-(us)': 301,
        'not-rated-(us)': 2801,
        'gp-(us)': 1,
        'nc-17-(us)': 1,
        'm-pg': 1
    },
    'theatrical-distributors': {
        'walt-disney': 801,
        'warner-bros': 1101,
        'sony-pictures': 901,
        'universal': 1001,
        'paramount-pictures': 801,
        '20th-century-fox': 801,
        'lionsgate': 601,
        'new-line': 201,
        'dreamworks-skg': 1,
        'mgm': 301,
        'miramax': 201,
        'fox-searchlight': 201,
        'focus-features': 301,
        'weinstein-co': 201,
        'summit-entertainment': 1,
        '20th-century-studios': 1,
        'sony-pictures-classics': 401,
        'stx-entertainment': 101,
        'miramax-dimension': 1,
        'relativity': 101,
        'open-road': 101,
        'united-artists': 101,
        'a24': 201,
        'roadside-attractions': 201,
        'newmarket-films': 101
    }
}

distributors = list(categories['theatrical-distributors'].copy().keys())

sources = categories['sources'].copy()

production_methods = categories['production-methods'].copy()

creative_types = categories['creative-types'].copy()

# set time frame of movie releases
start_year = 1995
end_year = 2021


## Creating relevant URLs

In [None]:
# define initial url with placeholders
dates_url = 'https://www.the-numbers.com/movies/year/'

dist_url = 'https://www.the-numbers.com/market/{year}/distributor/{distributorName}'

budget_url = 'https://www.the-numbers.com/movie/budgets/all/'

source_url = 'https://www.the-numbers.com/movies/source/{sourceName}/year/{yearNumber}'

production_method_url = 'https://www.the-numbers.com/movies/production-method/{production_method_name}/year/{yearNumber}'

creative_type_url = 'https://www.the-numbers.com/movies/creative-type/{creative_type_name}/year/{yearNumber}'

actor_url = 'https://www.the-numbers.com/box-office-star-records/domestic/{category}/{category_type}-leading-stars/{rank}'

director_url = 'https://www.the-numbers.com/box-office-star-records/domestic/lifetime-specific-technical-role/director/{rank}'

movie_url = 'https://www.the-numbers.com/movie/{movie_name}#tab=cast-and-crew'


# **Scraping Release Date**

In [None]:

date_df = pd.DataFrame()

try:

    for i in range(start_year, end_year):

        current_url = f'{dates_url}{i}'

        page = requests.get(current_url, headers = {'User-Agent': random.choice(user_agents_list)})

        soup = bs(page.text, 'html.parser')

        df_yearly = pd.read_html(page.text)[0]
        date_df = pd.concat([date_df, df_yearly], ignore_index = True)


except Exception as e:
    print(e)

date_df = date_df.drop(date_df[date_df['Release Type'] == 'Video'].index)
date_df = date_df.drop(date_df[date_df['Release Type'].isnull()].index)
date_df = date_df.drop(columns = ['Trailer', 'Revenue to Date', 'Release Type','Genre'])
date_df = date_df.reset_index(drop = True)

variable = 0

for i in range(date_df.shape[0]):

  if date_df.iloc[i]['Release Date'] == date_df.iloc[i]['Movie']:
      variable = date_df.iloc[i]['Release Date']

  if date_df.iloc[i]['Release Date'] != date_df.iloc[i]['Movie']:
    date_df.at[i,'Release Date'] = variable

date_df = date_df[date_df['Release Date'] != date_df['Movie']]

date_df

Unnamed: 0,Release Date,Movie
1,"January, 1995",Houseguest
2,"January, 1995",Showgirls
3,"January, 1995",Higher Learning
4,"January, 1995",Far from Home: The Adventures of Yell…
5,"January, 1995",The Surgeon
...,...,...
33048,"December, 2020",Mult vs Kino Vypusk No. 121
33049,"December, 2020",Ramprasad Ki Tehrvi
33050,"December, 2020",Seonsyain paemilli
33051,"December, 2020",Song Ni Yi Duo Xiao Hong Hua


# **Scraping Distributor, Genre, and Rating**

In [None]:

dist_df = pd.DataFrame()
try:

  for i in range(start_year, end_year):
    for j in distributors:

        current_url = dist_url.format(year = i, distributorName =j)
        page = requests.get(current_url, headers = {'User-Agent': random.choice(user_agents_list)})
        soup = bs(StringIO(page.text), 'html.parser')
        df_yearly = pd.read_html(StringIO(page.text), attrs={"class":None})[0]
        dist_df = pd.concat([dist_df, df_yearly], ignore_index = True)
        dist_df = dist_df.drop(columns = ['{year} Gross'.format(year = i), 'Tickets Sold','Rank','Release Date'])

except Exception as e:
    print(e)

dist_df['Distributor'] = ""

index = 0

for j in range(dist_df.shape[0]):

    if dist_df.loc[j,'Movie']=="Total Tickets Sold" and dist_df.loc[j,'Genre']=="Total Tickets Sold":
      index += 1

    if dist_df.iloc[j]['Movie'] != dist_df.iloc[j]['Genre']:
      dist_df.loc[j,'Distributor'] = distributors[index]

    if index == len(distributors) - 1:
      index = 0

dist_df = dist_df[dist_df['Movie'] != dist_df['Genre']]
dist_df

Unnamed: 0,Movie,Genre,MPAA Rating,Distributor
0,Toy Story,Adventure,G,walt-disney
1,Pocahontas,Adventure,G,walt-disney
2,Crimson Tide,Action,R,walt-disney
3,Dangerous Minds,Drama,R,walt-disney
4,While You Were Sleeping,Romantic Comedy,PG,walt-disney
...,...,...,...,...
7912,The Last Full Measure,Drama,R,walt-disney
7913,Words on Bathroom Walls,Drama,PG-13,walt-disney
7914,Pinocchio,Adventure,PG-13,walt-disney
7915,Judy,Drama,PG-13,walt-disney


# **Scraping Budget**

In [None]:

budget_df = pd.DataFrame()
current_rank = 1
max_rank = 6401

try:

    while current_rank <= max_rank:

        current_url = f'{budget_url}{current_rank}'

        page = requests.get(current_url, headers = {'User-Agent': random.choice(user_agents_list)})

        soup = bs(page.text, 'html.parser')

        page_df = pd.read_html(page.text)[0]

        budget_df = pd.concat([budget_df, page_df], ignore_index = True)

        current_rank += 100

except Exception as e:
    print(e)

budget_df = budget_df.drop(budget_df[budget_df['Release Date'] == 'Unknown'].index)
budget_df = budget_df.drop(columns = ['Unnamed: 0', 'Worldwide Gross', 'Release Date'])
budget_df = budget_df.reset_index(drop = True)

budget_df


Unnamed: 0,Movie,Production Budget,Domestic Gross
0,Avatar: The Way of Water,"$460,000,000","$684,075,767"
1,Avengers: Endgame,"$400,000,000","$858,373,000"
2,Pirates of the Caribbean: On Stranger Tides,"$379,000,000","$241,071,802"
3,Avengers: Age of Ultron,"$365,000,000","$459,005,868"
4,Fast X,"$340,000,000","$145,960,660"
...,...,...,...
6328,Following,"$6,000","$48,482"
6329,Return to the Land of Wonders,"$5,000","$1,338"
6330,A Plague So Pleasant,"$1,400",$0
6331,My Date With Drew,"$1,100","$181,041"


# **Scraping Source**

In [None]:

def scrape_source(sourceName):

  source_df = pd.DataFrame()

  try:

    for i in range(start_year,end_year):

      current_url = source_url.format(yearNumber = i, sourceName =sourceName)
      page = requests.get(current_url, headers = {'User-Agent': random.choice(user_agents_list)})
      soup = bs(StringIO(page.text), 'html.parser')
      df_yearly = pd.read_html(StringIO(page.text))[0]
      source_df=pd.concat([source_df, df_yearly], ignore_index = True)

    source_df = source_df[source_df['Movie'] != source_df['Genre']]
    source_df = source_df.drop(source_df[source_df['Movie'].isnull()].index)
    source_df = source_df.drop(columns = ['Trailer', 'Revenue to Date', 'Release Type','Genre', 'Release Date'])
    source_df = source_df.reset_index(drop = True)

    source_df['Source'] = sourceName

    return source_df

  except Exception as e:
      print(e)

for j in sources.keys():
    sources[j] = scrape_source(j)

sources


{'original-screenplay':                                         Movie               Source
 0                                   Sorceress  original-screenplay
 1                                  Houseguest  original-screenplay
 2                                   Showgirls  original-screenplay
 3                             Higher Learning  original-screenplay
 4      Far from Home: The Adventures of Yell…  original-screenplay
 ...                                       ...                  ...
 20676                    The 100 Candles Game  original-screenplay
 20677       Horse Julius and Big Horse Racing  original-screenplay
 20678                     Ramprasad Ki Tehrvi  original-screenplay
 20679            Song Ni Yi Duo Xiao Hong Hua  original-screenplay
 20680                                   Alone  original-screenplay
 
 [20681 rows x 2 columns],
 'based-on-fictional-book-or-short-story':                                       Movie  \
 0                  The Secret of Roan Ini

## Combining Sources

In [None]:

source_all_df = pd.DataFrame()

for i in sources.keys():
  source_all_df = pd.concat([source_all_df, sources[i]], ignore_index = True)

source_all_df


Unnamed: 0,Movie,Source
0,Sorceress,original-screenplay
1,Houseguest,original-screenplay
2,Showgirls,original-screenplay
3,Higher Learning,original-screenplay
4,Far from Home: The Adventures of Yell…,original-screenplay
...,...,...
31839,Beloved Beast,spin-off
31840,The Jesus Rolls,spin-off
31841,American Pie Presents: Girls’ Rules,spin-off
31842,Dragons: Rescue Riders: Huttsgalor Ho…,spin-off


# **Scraping Production Method**

In [None]:

def scrape_production_method(production_method_name):

  production_method_df = pd.DataFrame()

  try:

    for i in range(start_year,end_year):

      current_url = production_method_url.format(yearNumber = i, production_method_name = production_method_name)
      page = requests.get(current_url, headers = {'User-Agent': random.choice(user_agents_list)})
      soup = bs(StringIO(page.text), 'html.parser')
      df_yearly = pd.read_html(StringIO(page.text))[0]
      production_method_df=pd.concat([production_method_df, df_yearly], ignore_index = True)

    production_method_df = production_method_df[production_method_df['Movie'] != production_method_df['Genre']]
    production_method_df = production_method_df.drop(production_method_df[production_method_df['Movie'].isnull()].index)
    production_method_df = production_method_df.drop(columns = ['Trailer', 'Revenue to Date', 'Release Type','Genre', 'Release Date'])
    production_method_df = production_method_df.reset_index(drop = True)

    production_method_df['Production Method'] = production_method_name

    return production_method_df

  except Exception as e:
      print(e)

for j in production_methods.keys():
    production_methods[j] = scrape_production_method(j)

production_methods


{'live-action':                                         Movie Production Method
 0                                   Sorceress       live-action
 1                                  Houseguest       live-action
 2                                   Showgirls       live-action
 3                             Higher Learning       live-action
 4      Far from Home: The Adventures of Yell…       live-action
 ...                                       ...               ...
 32847                      Seonsyain paemilli       live-action
 32848            Song Ni Yi Duo Xiao Hong Hua       live-action
 32849             Descubriendo a José Padilla       live-action
 32850                 The Forgotten Battalion       live-action
 32851                                   Alone       live-action
 
 [32852 rows x 2 columns],
 'animation-and-live-action':                                       Movie          Production Method
 0                               Dragonheart  animation-and-live-action
 1 

## Combining Production Methods

In [None]:

production_method_all_df = pd.DataFrame()

for i in production_methods.keys():
  production_method_all_df = pd.concat([production_method_all_df, production_methods[i]], ignore_index = True)

production_method_all_df


Unnamed: 0,Movie,Production Method
0,Sorceress,live-action
1,Houseguest,live-action
2,Showgirls,live-action
3,Higher Learning,live-action
4,Far from Home: The Adventures of Yell…,live-action
...,...,...
35132,The 7th Silk Road International Film …,multiple-production-methods
35133,Italian film masters 2020,multiple-production-methods
35134,Deathcember,multiple-production-methods
35135,The 4th Japan New Film Exhibition,multiple-production-methods


# **Scraping Creative Type**

In [None]:

def scrape_creative_type(creative_type_name):

  creative_type_df = pd.DataFrame()

  try:

    for i in range(start_year,end_year):

      current_url = creative_type_url.format(yearNumber = i, creative_type_name = creative_type_name)
      page = requests.get(current_url, headers = {'User-Agent': random.choice(user_agents_list)})
      soup = bs(StringIO(page.text), 'html.parser')
      df_yearly = pd.read_html(StringIO(page.text))[0]
      creative_type_df=pd.concat([creative_type_df, df_yearly], ignore_index = True)

    creative_type_df = creative_type_df[creative_type_df['Movie'] != creative_type_df['Genre']]
    creative_type_df = creative_type_df.drop(creative_type_df[creative_type_df['Movie'].isnull()].index)
    creative_type_df = creative_type_df.drop(columns = ['Trailer', 'Revenue to Date', 'Release Type','Genre', 'Release Date'])
    creative_type_df = creative_type_df.reset_index(drop = True)

    creative_type_df['Creative Type'] = creative_type_name

    return creative_type_df

  except Exception as e:
      print(e)

for j in creative_types.keys():
    creative_types[j] = scrape_creative_type(j)

creative_types


{'contemporary-fiction':                               Movie         Creative Type
 0                        Houseguest  contemporary-fiction
 1                         Showgirls  contemporary-fiction
 2                   Higher Learning  contemporary-fiction
 3                            Baasha  contemporary-fiction
 4      Amityville: A New Generation  contemporary-fiction
 ...                             ...                   ...
 17915          The 100 Candles Game  contemporary-fiction
 17916           Ramprasad Ki Tehrvi  contemporary-fiction
 17917            Seonsyain paemilli  contemporary-fiction
 17918  Song Ni Yi Duo Xiao Hong Hua  contemporary-fiction
 17919                         Alone  contemporary-fiction
 
 [17920 rows x 2 columns],
 'science-fiction':                                        Movie    Creative Type
 0                                  Tank Girl  science-fiction
 1                       Jinzô ningen Hakaidâ  science-fiction
 2                      Village

## Combining Creative Types

In [None]:

creative_type_all_df = pd.DataFrame()

for i in creative_types.keys():
  creative_type_all_df = pd.concat([creative_type_all_df, creative_types[i]], ignore_index = True)

creative_type_all_df


Unnamed: 0,Movie,Creative Type
0,Houseguest,contemporary-fiction
1,Showgirls,contemporary-fiction
2,Higher Learning,contemporary-fiction
3,Baasha,contemporary-fiction
4,Amityville: A New Generation,contemporary-fiction
...,...,...
33071,A Mind Like Still Water,factual
33072,Stevie Nicks: Broadcasting Live,factual
33073,Yearly Departed,factual
33074,Descubriendo a José Padilla,factual


# **Generating Actor and Director Datasets**

## Generating values

In [None]:
# add column of values for given dataframe
def generate_values(star_df, alpha):

  # create empty column to be filled
  star_df['Value'] = 0.0

  # iterate over dataframe indices
  for i in range(len(star_df)):

    # convert average earnings string into float
    star_df.loc[i, 'Average'] = float(star_df.loc[i, 'Average'].replace("$", "").replace(",", ""))

    average_box_office = star_df.loc[i, 'Average']
    num_movies = star_df.loc[i, 'Movies']

    # calculate values based on average earnings and percent cut
    star_df.loc[i, 'Value'] = np.log(average_box_office * (num_movies ** alpha))

  return star_df

## Scraping Actors

In [None]:
# extract actors from website given categories and types
def scrape_actors(category_dict, chosen_category, chosen_type):

  # define dataframe for storage
  actor_df = pd.DataFrame()

  # initialize incremental rank for pagination
  current_rank = 1

  try:

    # iterate through ranks
    while current_rank <= category_dict[chosen_type]:

      # update url for current iteration
      current_url = actor_url.format(category=chosen_category, category_type=chosen_type, rank=current_rank)

      # retrieve webpage
      page = requests.get(current_url, headers={'User-Agent': random.choice(user_agents_list)})

      # creating parsing entity
      soup = bs(page.text, 'html.parser')

      # extract main table of data
      page_df = pd.read_html(StringIO(page.text), attrs={"style": None})[0]

      # convert scraped table into dataframe
      actor_df = pd.concat([actor_df, page_df], ignore_index=True)

      # increment rank for next page
      current_rank += 100

  except Exception as e:
    print(e)

  # return dataframe with values
  return generate_values(actor_df, 2)

### Collecting all actors in one dataset

In [None]:
actor_data = categories.copy()

for i in categories.keys():
    for j in categories[i]:

      actor_data[i][j] = scrape_actors(categories[i], i, j)
      print(actor_data[i][j])

actor_data


No tables found
      Rank                       Name Domestic Box Office  Movies  \
0        1           Daniel Radcliffe      $2,498,074,626      10   
1        2                Johnny Depp      $2,456,795,398      13   
2        3              Harrison Ford      $2,446,475,667      11   
3        4               Rupert Grint      $2,391,615,589       9   
4        5                Emma Watson      $2,391,615,589       8   
...    ...                        ...                 ...     ...   
1549  1550           Pal Sverre Hagen              $1,369       1   
1550  1551  Julian Rasmussen Podolski              $1,369       1   
1551  1552        Nicolai Cleve Broca              $1,369       1   
1552  1553            Bjorn Sundquist              $1,369       1   
1553  1554            Julian Richings                $584       1   

          Average      Value  
0     249807463.0  23.941371  
1     188984261.0  24.187073  
2     222406879.0  24.015810  
3     265735065.0  23.792460  


{'genres': {'adventure':       Rank                       Name Domestic Box Office  Movies  \
  0        1           Daniel Radcliffe      $2,498,074,626      10   
  1        2                Johnny Depp      $2,456,795,398      13   
  2        3              Harrison Ford      $2,446,475,667      11   
  3        4               Rupert Grint      $2,391,615,589       9   
  4        5                Emma Watson      $2,391,615,589       8   
  ...    ...                        ...                 ...     ...   
  1549  1550           Pal Sverre Hagen              $1,369       1   
  1550  1551  Julian Rasmussen Podolski              $1,369       1   
  1551  1552        Nicolai Cleve Broca              $1,369       1   
  1552  1553            Bjorn Sundquist              $1,369       1   
  1553  1554            Julian Richings                $584       1   
  
            Average      Value  
  0     249807463.0  23.941371  
  1     188984261.0  24.187073  
  2     222406879.0  24

## Scraping Directors

In [None]:
# define dataframe for storage
director_df = pd.DataFrame()

# initialize incremental rank for pagination
current_rank = 1
max_rank = 7401

try:

  # iterate through ranks
  while current_rank <= max_rank:

    # update url for current iteration
    current_url = director_url.format(rank=current_rank)

    # retrieve webpage
    page = requests.get(current_url, headers={'User-Agent': random.choice(user_agents_list)})

    # creating parsing entity
    soup = bs(page.text, 'html.parser')

    # extract main table of data
    page_df = pd.read_html(StringIO(page.text), attrs={"style": None})[0]

    # convert scraped table into dataframe
    director_df = pd.concat([director_df, page_df], ignore_index=True)

    # increment rank for next page
    current_rank += 100

except Exception as e:
  print(e)

director_df=generate_values(director_df, 2)
director_df




KeyboardInterrupt: ignored

# **Creating final movie dataset**

In [None]:

def inner_join(df_1, df_2):
  return pd.merge(df_1, df_2, on='Movie', how='inner')

movies_df = inner_join(creative_type_all_df, inner_join(production_method_all_df, source_all_df))
movies_df = inner_join(dist_df, inner_join(budget_df, inner_join(date_df, movies_df)))

movies_df = movies_df.drop_duplicates(subset=['Movie'], keep='first')

movies_df = movies_df[movies_df.Genre != 'Reality']
movies_df = movies_df[movies_df.Genre != 'Western']
movies_df = movies_df[movies_df.Genre != 'Documentary']
movies_df = movies_df[movies_df.Genre != 'Concert/Perfor…']
movies_df = movies_df[movies_df.Genre != 'Black Comedy']

movies_df = movies_df.reset_index(drop = True)


#domestic=np.where(movies_df.loc[:,'Domestic Gross'])
#print(domestic)

movies_df

Unnamed: 0,Movie,Genre,MPAA Rating,Distributor,Production Budget,Domestic Gross,Release Date,Creative Type,Production Method,Source
0,Toy Story,Adventure,G,walt-disney,"$30,000,000","$192,523,233","November, 1995",kids-fiction,digital-animation,original-screenplay
1,Pocahontas,Adventure,G,walt-disney,"$55,000,000","$141,579,773","June, 1995",historical-fiction,hand-animation,based-on-real-life-events
2,Crimson Tide,Action,R,walt-disney,"$55,000,000","$91,387,195","May, 1995",contemporary-fiction,live-action,original-screenplay
3,Dangerous Minds,Drama,R,walt-disney,"$23,000,000","$84,919,401","August, 1995",dramatization,live-action,based-on-factual-book-or-article
4,Man of the House,Comedy,PG,walt-disney,"$50,000,000","$19,699,706","March, 1995",contemporary-fiction,live-action,original-screenplay
...,...,...,...,...,...,...,...,...,...,...
3049,Brahms: The Boy II,Horror,PG-13,miramax-dimension,"$10,000,000","$12,611,536","February, 2020",contemporary-fiction,live-action,original-screenplay
3050,Honest Thief,Thriller/Suspense,PG-13,united-artists,"$30,000,000","$14,163,574","October, 2020",contemporary-fiction,live-action,original-screenplay
3051,Bill & Ted Face the Music,Comedy,PG-13,a24,"$25,000,000","$3,439,660","August, 2020",science-fiction,live-action,original-screenplay
3052,The Last Full Measure,Drama,R,walt-disney,"$20,000,000","$2,949,212","January, 2020",dramatization,live-action,based-on-real-life-events


In [None]:

for i in range(movies_df.shape[0]):
  movies_df.loc[i,'Production Budget']=float(movies_df.loc[i,'Production Budget'].replace("$", "").replace(",", ""))
  movies_df.loc[i,'Domestic Gross']=float(movies_df.loc[i,'Domestic Gross'].replace("$", "").replace(",", ""))

movies_df=movies_df[movies_df.loc[:,'Domestic Gross']!="$0"]
movies_df=movies_df[movies_df.loc[:,'MPAA Rating']!="Not Rated"]
movies_df=movies_df[movies_df.loc[:,'Production Budget']>1600000]
movies_df=movies_df[movies_df.loc[:,'Domestic Gross']>1500000]

movies_df = movies_df.reset_index(drop = True)

movies_df

Unnamed: 0,Movie,Genre,MPAA Rating,Distributor,Production Budget,Domestic Gross,Release Date,Creative Type,Production Method,Source
0,Toy Story,Adventure,G,walt-disney,30000000.0,192523233.0,"November, 1995",kids-fiction,digital-animation,original-screenplay
1,Pocahontas,Adventure,G,walt-disney,55000000.0,141579773.0,"June, 1995",historical-fiction,hand-animation,based-on-real-life-events
2,Crimson Tide,Action,R,walt-disney,55000000.0,91387195.0,"May, 1995",contemporary-fiction,live-action,original-screenplay
3,Dangerous Minds,Drama,R,walt-disney,23000000.0,84919401.0,"August, 1995",dramatization,live-action,based-on-factual-book-or-article
4,Man of the House,Comedy,PG,walt-disney,50000000.0,19699706.0,"March, 1995",contemporary-fiction,live-action,original-screenplay
...,...,...,...,...,...,...,...,...,...,...
2804,Brahms: The Boy II,Horror,PG-13,miramax-dimension,10000000.0,12611536.0,"February, 2020",contemporary-fiction,live-action,original-screenplay
2805,Honest Thief,Thriller/Suspense,PG-13,united-artists,30000000.0,14163574.0,"October, 2020",contemporary-fiction,live-action,original-screenplay
2806,Bill & Ted Face the Music,Comedy,PG-13,a24,25000000.0,3439660.0,"August, 2020",science-fiction,live-action,original-screenplay
2807,The Last Full Measure,Drama,R,walt-disney,20000000.0,2949212.0,"January, 2020",dramatization,live-action,based-on-real-life-events


## Scraping existing movie casts and crews

In [None]:

def extract_cast_crew(cast_crew_df,is_cast):
  try:

    for i in range(movies_df.shape[0]):

      movie_name = movies_df.loc[i, 'Movie']
      movie_name = movie_name.translate(str.maketrans('', '', string.punctuation))

      if "The" in movie_name[0:4]:
        the = movie_name[0:4]
        no_the = movie_name[4:]
        movie_name = (no_the + " " + the)

      movie_name = movie_name.replace(' ', '-')
      year = movies_df.loc[i, 'Release Date']
      year = ''.join(i for i in year if i.isdigit())

      current_url = movie_url.format(movie_name = movie_name, year = year)

      page = requests.get(current_url, headers = {'User-Agent': random.choice(user_agents_list)})

      soup = bs(StringIO(page.text), 'html.parser')

      try:

        tables=pd.read_html(StringIO(page.text), attrs={"align":"center"})
        if is_cast:
          table_num=0
        else:
          table_num=len(tables)-1

        sub_df=tables[table_num]
        sub_df['Movie'] = ''
        print(year, movie_name, sub_df)

        for j in range(sub_df.shape[0]):
          sub_df.loc[j, 'Movie'] = movies_df.loc[i, 'Movie']
        cast_crew_df = pd.concat([cast_crew_df, sub_df], ignore_index = True)


      except ValueError as e:
        movies_df.drop([i], axis = 0, inplace = True)

  except Exception as e:
    print(e)

  return cast_crew_df


In [None]:
cast_df=pd.DataFrame()

cast_df=extract_cast_crew(cast_df,True)
movies_df = movies_df.reset_index(drop = True)
cast_df = cast_df.drop(columns = [1,2]).rename(columns = {0: "Name"})
cast_df['Value'] = 0.0

cast_df

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0  Donald Pleasance NaN  Dr. Samuel "Sam" Loomis      
1  Jamie Lee Curtis NaN            Laurie Strode      
2007 Mist-The-                    0   1              2 Movie
0        Thomas Jane NaN  David Drayton      
1  Marcia Gay Harden NaN   Mrs. Carmody      
2     Andre Braugher NaN   Brent Norton      
2007 Lars-and-the-Real-Girl               0   1     2 Movie
0  Ryan Gosling NaN  Lars      
2007 Whos-Your-Caddy                0   1         2 Movie
0  Jeffrey Jones NaN  Cummings      
1        Big Boi NaN    C-Note      
2007 Rescue-Dawn                 0   1               2 Movie
0  Christian Bale NaN  Dieter Dengler      
2007 Blood-and-Chocolate                 0   1       2 Movie
0  Agnes Bruckner NaN  Vivian      
2007 Factory-Girl                0   1              2 Movie
0  Sienna Miller NaN  Edie Sedgwick      
2007 No-Country-for-Old-Men                0   1              2 Movie
0  Javier Bardem NaN  Anton 

Unnamed: 0,Name,Movie,Value
0,Tom Hanks,Toy Story,0.0
1,Tim Allen,Toy Story,0.0
2,Mel Gibson,Pocahontas,0.0
3,Denzel Washington,Crimson Tide,0.0
4,Gene Hackman,Crimson Tide,0.0
...,...,...,...
8323,Samuel L. Jackson,The Last Full Measure,0.0
8324,Peter Fonda,The Last Full Measure,0.0
8325,Jeremy Irvine,The Last Full Measure,0.0
8326,Charlie Plummer,Words on Bathroom Walls,0.0


## Renaming entries for compatibility

In [None]:
movies_df.rename(columns = {
  "Genre":"genres",
  "MPAA Rating":"mpaa-ratings",
  "Creative Type":"creative-types",
  "Production Method":"production-methods",
  "Source":"sources",
  "Distributor":"theatrical-distributors"
}, inplace = True)

genres_rename = {
  "Adventure":"adventure",
  "Action":"action",
  "Comedy":"comedy",
  "Drama":"drama",
  "Thriller/Suspense":"thriller-or-suspense",
  "Horror":"horror",
  'Romantic Comedy': 'romantic-comedy',
  'Musical': 'musical'
}

ratings_rename = {
  'PG-13': 'pg-13-(us)',
  'R': 'r-(us)',
  'PG': 'pg-(us)',
  'G': 'g-(us)',
  'Not Rated': 'not-rated-(us)',
  'GP': 'gp-(us)',
  'NC-17': 'nc-17-(us)',
  'M/PG': 'm-pg'
}

movies_df['genres'].replace(genres_rename, inplace = True)
movies_df['mpaa-ratings'].replace(ratings_rename, inplace = True)

## Generating cast values

# New Section

In [None]:
cast_df['Value'] = 0.0

movie_categories = list(categories.keys())

actor_vals = []

for i in range(movies_df.shape[0]):
  current_movie = movies_df.loc[i, 'Movie']
  cast_size = 0

  while cast_size != len(np.where(cast_df == current_movie)[0]):

    current_name = cast_df.loc[np.where(cast_df == current_movie)[0][cast_size - 1], 'Name']
    cast_size += 1
    actor_sum = 0

    for j in movie_categories:
      current_category = movies_df.loc[i,j]


      for k in actor_data[j]:

        name = actor_data[j][k].Name
        value = actor_data[j][k].Value

        if current_category == k:

          a = np.where(name == current_name)[0]
          b = actor_data[j][k].loc[a, 'Value']

          if len(b.values) != 0:
            actor_sum += b.values[0]

          else:
            actor_sum += 0

    actor_vals.append([actor_sum / len(movie_categories), current_name])


for m in range(cast_df.shape[0]):
  cast_df.loc[m,'Value'] = actor_vals[m][0]

actor_value_df=cast_df.copy()

cast_df=cast_df.drop(columns=['Name'])
cast_df=cast_df.groupby(['Movie']).mean()

actor_value_df

  cast_df=cast_df.groupby(['Movie']).mean()


Unnamed: 0,Name,Movie,Value
0,Tom Hanks,Toy Story,23.428941
1,Tim Allen,Toy Story,23.483320
2,Mel Gibson,Pocahontas,20.320583
3,Denzel Washington,Crimson Tide,21.998387
4,Gene Hackman,Crimson Tide,23.785503
...,...,...,...
8323,Samuel L. Jackson,The Last Full Measure,20.582953
8324,Peter Fonda,The Last Full Measure,21.293618
8325,Jeremy Irvine,The Last Full Measure,14.722839
8326,Charlie Plummer,Words on Bathroom Walls,15.207038


In [None]:
cast_df.to_csv('cast_value_df.csv', encoding = 'utf-8-sig')
files.download('cast_value_df.csv')

In [None]:
uploaded = files.upload()
cast_df = pd.read_csv('cast_df.csv')

In [None]:
actor_value_df.to_csv('actor_value_df.csv', encoding = 'utf-8-sig')
files.download('actor_value_df.csv')
actor_value_df

## Generating actor salaries

In [None]:
uploaded = files.upload()
movies_df = pd.read_csv('movies_df.csv')

In [None]:
uploaded = files.upload()
actor_value_df = pd.read_csv('actor_value_df.csv')

In [None]:
for i in range(actor_value_df.shape[0]):
  if "*" in actor_value_df.loc[i,'Name']:
    actor_value_df.loc[i,'Name']=actor_value_df.loc[i,'Name'].replace("*","")
  if "," in actor_value_df.loc[i,'Name']:
    actor_value_df.loc[i,'Name']=actor_value_df.loc[i,'Name'].replace(",","")

actor_value_df

Unnamed: 0,Name,Movie,Value
0,Tom Hanks,Toy Story,23.428941
1,Tim Allen,Toy Story,23.483320
2,Mel Gibson,Pocahontas,20.320583
3,Denzel Washington,Crimson Tide,21.998387
4,Gene Hackman,Crimson Tide,23.785503
...,...,...,...
8323,Samuel L. Jackson,The Last Full Measure,20.582953
8324,Peter Fonda,The Last Full Measure,21.293618
8325,Jeremy Irvine,The Last Full Measure,14.722839
8326,Charlie Plummer,Words on Bathroom Walls,15.207038


In [None]:
actor_value_df = actor_value_df.groupby(['Name']).mean()
actor_value_df

  actor_value_df = actor_value_df.groupby(['Name']).mean()


Unnamed: 0_level_0,Value
Name,Unnamed: 1_level_1
A.J. Buckley,0.000000
A.J. Cook,17.318276
A.J. Langer,18.493972
A.J. Michalka,12.740982
Aaliyah,18.439222
...,...
Zoe Kazan,15.476550
Zoe Saldana,17.825433
Zoey Deutch,17.008436
Zooey Deschanel,11.691876


In [None]:
def generate_salaries(salary_df, percent_cut,is_cast):

  salary_df['Salary'] = 0.0

  salary_df = pd.merge(salary_df, movies_df, on='Movie', how='left')

  for i in range(salary_df.shape[0]):
    if salary_df.loc[i,'Value']>20:
      percent_cut
    salary_df.loc[i, 'Salary'] = salary_df.loc[i, 'Production Budget'] * percent_cut

  salary_df = salary_df[['Name', 'Movie', 'Production Budget', 'Salary']]

  return salary_df

In [None]:
def generate_salaries(salary_df,is_cast):

  percent_budget,percent_gross=0, 0
  salary_df['Salary'] = 0.0

  salary_df = pd.merge(salary_df, movies_df, on='Movie', how='left')

  for i in range(salary_df.shape[0]):

    value=salary_df.loc[i,'Value']
    gross=salary_df.loc[i,'Domestic Gross']
    budget=salary_df.loc[i,'Production Budget']

    if is_cast:

      if value>=22:
        percent_budget=0.0005
        percent_gross=0.00015

      if value<22 and value>20.5:
        percent_budget=0.000015
        percent_gross=0.00001

      if value<=20.5:
        percent_budget=0.00001
        percent_gross=0
    else:

      if value>=22:
        percent_budget=0.00075
        percent_gross=0

      if value<22 and value>20:
        percent_budget=0.00025
        percent_gross=0

      if value>17 and value<=20:
        percent_budget=0.00015
        percent_gross=0
      if value<20:
        percent_budget=0.0001
        percent_gross=0

    salary_df.loc[i, 'Salary'] = gross * percent_gross + budget * percent_budget

  if is_cast:
    salary_df = salary_df[['Name', 'Movie', 'Production Budget', 'Salary','Domestic Gross','Value']]
  else:
    salary_df = salary_df[['Name', 'Movie', 'Production Budget', 'Salary','Domestic Gross','Value']]

  return salary_df


In [None]:
actor_value_df = generate_salaries(actor_value_df, True)
actor_value_df = actor_value_df.drop(columns = ['Production Budget', 'Movie','Domestic Gross'])
actor_value_df = actor_value_df.groupby(['Name']).mean()
actor_value_df

Unnamed: 0_level_0,Salary,Value
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
A.J. Buckley,100.000000,0.000000
A.J. Cook,260.000000,17.318276
A.J. Johnson,,12.698267
A.J. Langer,240.000000,15.686364
A.J. Michalka,17.000000,12.740982
...,...,...
Zoe Saldana,1258.571429,17.963535
Zoey Deutch,176.666667,16.721547
Zooey Deschanel,293.351330,12.113573
Zoë Kravitz,,11.736327


In [None]:
actor_salary_df.to_csv('actor_salary_df.csv', encoding = 'utf-8-sig')
files.download('actor_salary_df.csv')

## Scraping existing movie directors

In [None]:

movie_director_df = pd.DataFrame()

try:

  for i in range(movies_df.shape[0]):

    movie_name = movies_df.loc[i, 'Movie']
    movie_name = movie_name.translate(str.maketrans('', '', string.punctuation))

    if "The" in movie_name[0:4]:
      the = movie_name[0:4]
      no_the = movie_name[4:]
      movie_name = (no_the + " " + the)

    movie_name = movie_name.replace(' ', '-')

    year = movies_df.loc[i, 'Release Date']
    year = ''.join(i for i in year if i.isdigit())

    current_url = movie_url.format(movie_name = movie_name, year = year)

    page = requests.get(current_url, headers = {'User-Agent': random.choice(user_agents_list)})


    soup = bs(StringIO(page.text), 'html.parser')

    try:
      tables=pd.read_html(StringIO(page.text), attrs={"align":"center"})

      df_leads = tables[len(tables)-1]
      df_leads['Movie'] = ''


      for j in range(df_leads.shape[0]):
        df_leads.loc[j, 'Movie'] = movies_df.loc[i, 'Movie']

      movie_director_df = pd.concat([movie_director_df, df_leads], ignore_index = True)

    except ValueError as e:
      movies_df.drop([i], axis = 0, inplace = True)

except Exception as e:
    print(e)

movie_director_df=movie_director_df.drop(columns=[1])

renaming={0:'Name',2:'Role'}
movie_director_df.rename(columns=renaming,inplace=True)
movie_director_df=movie_director_df[movie_director_df.loc[:,'Role']=='Director']
movie_director_df=movie_director_df.reset_index(drop = True)

movie_director_df

In [None]:
movie_director_df=pd.DataFrame()


movie_director_df=extract_cast_crew(movie_director_df,False)

In [None]:

renaming={0:'Name', 2:'Role'}
movie_director_df.rename(columns=renaming,inplace=True)
movie_director_df=movie_director_df[movie_director_df.loc[:,'Role']=='Director']
movie_director_df=movie_director_df.reset_index(drop = True)

movie_director_df

In [None]:
director_df=director_df.drop(columns=['Rank','Domestic Box Office','Average','Movies'])
movie_director_df=movie_director_df.drop(columns=['Role',1])
movie_director_df=pd.merge(movie_director_df, director_df, on='Name', how='inner')

movie_director_df

In [None]:
movie_director_df.to_csv('director_value_df.csv', encoding = 'utf-8-sig')
files.download('director_value_df.csv')

## Generating director salaries

In [None]:
uploaded = files.upload()
director_value_df = pd.read_csv('director_value_df.csv')

Saving director_value_df.csv to director_value_df (3).csv


In [None]:
director_value_df = generate_salaries(director_value_df, False)
#director_value_df = director_value_df.drop(columns = ['Domestic Box Office', 'Movies', 'Average', 'Rank'])

director_value_df=director_value_df.groupby(['Name']).mean()
director_value_df

  director_value_df=director_value_df.groupby(['Name']).mean()


Unnamed: 0_level_0,Salary,Value
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aaron Horvath,2500.000000,20.913431
Aaron Schneider,700.000000,17.130816
Aaron Seltzer,5555.555556,20.753410
Abbas Alibhai Burmawalla,500.000000,14.821294
Abby Kohn,3200.000000,18.396298
...,...,...
Zach Braff,425.000000,19.553406
Zack Snyder,112650.000000,23.565554
Zal Batmanglij,650.000000,15.495468
Zara Hayes,1000.000000,16.427866


In [None]:
uploaded = files.upload()
movie_director_df = pd.read_csv('director_value_df.csv')

Saving director_value_df.csv to director_value_df (2).csv


In [None]:
movies_df = inner_join(movies_df, cast_df).rename(columns = {'Value':'Cast Value'})

movie_director_df=movie_director_df.drop(columns=['Name']).rename(columns = {'Value':'Director Value'})


movies_df = inner_join(movies_df, movie_director_df)

movies_df


Unnamed: 0.1,Movie,genres,mpaa-ratings,theatrical-distributors,Production Budget,Domestic Gross,Release Date,creative-types,production-methods,sources,Cast Value,Unnamed: 0,Director Value
0,Toy Story,adventure,g-(us),walt-disney,30000000.0,192523233.0,"November, 1995",kids-fiction,digital-animation,original-screenplay,23.456131,0,22.565527
1,Crimson Tide,action,r-(us),walt-disney,55000000.0,91387195.0,"May, 1995",contemporary-fiction,live-action,original-screenplay,22.891945,4,23.583815
2,Judge Dredd,action,r-(us),walt-disney,85000000.0,34687912.0,"June, 1995",science-fiction,live-action,based-on-comic-or-graphic-novel,21.993195,11,18.171480
3,The Jungle Book,adventure,pg-(us),walt-disney,175000000.0,364001123.0,"April, 2016",fantasy,animation-and-live-action,based-on-fictional-book-or-short-story,6.342099,12,21.724337
4,The Lion King,adventure,g-(us),walt-disney,260000000.0,543638043.0,"July, 2019",kids-fiction,animation-and-live-action,remake,14.466526,13,21.143090
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2503,The New Mutants,horror,pg-13-(us),sony-pictures-classics,67000000.0,23855569.0,"August, 2020",super-hero,live-action,based-on-comic-or-graphic-novel,14.444625,2383,19.916772
2504,The Gentlemen,action,r-(us),miramax-dimension,22000000.0,36471796.0,"December, 2019",contemporary-fiction,live-action,original-screenplay,17.402846,1056,23.458987
2505,Honest Thief,thriller-or-suspense,pg-13-(us),united-artists,30000000.0,14163574.0,"October, 2020",contemporary-fiction,live-action,original-screenplay,17.565072,2641,18.081902
2506,The Last Full Measure,drama,r-(us),walt-disney,20000000.0,2949212.0,"January, 2020",dramatization,live-action,based-on-real-life-events,18.327338,2642,16.296359


In [None]:
movies_df['Crew Value']=movies_df['Cast Value'] + movies_df['Director Value']

movies_df=movies_df.drop(columns=['Cast Value','Director Value', 'Unnamed: 0'])

movies_df

Unnamed: 0,Movie,genres,mpaa-ratings,theatrical-distributors,Production Budget,Domestic Gross,Release Date,creative-types,production-methods,sources,Crew Value
0,Toy Story,adventure,g-(us),walt-disney,30000000.0,192523233.0,"November, 1995",kids-fiction,digital-animation,original-screenplay,46.021657
1,Crimson Tide,action,r-(us),walt-disney,55000000.0,91387195.0,"May, 1995",contemporary-fiction,live-action,original-screenplay,46.475760
2,Judge Dredd,action,r-(us),walt-disney,85000000.0,34687912.0,"June, 1995",science-fiction,live-action,based-on-comic-or-graphic-novel,40.164674
3,The Jungle Book,adventure,pg-(us),walt-disney,175000000.0,364001123.0,"April, 2016",fantasy,animation-and-live-action,based-on-fictional-book-or-short-story,28.066436
4,The Lion King,adventure,g-(us),walt-disney,260000000.0,543638043.0,"July, 2019",kids-fiction,animation-and-live-action,remake,35.609616
...,...,...,...,...,...,...,...,...,...,...,...
2503,The New Mutants,horror,pg-13-(us),sony-pictures-classics,67000000.0,23855569.0,"August, 2020",super-hero,live-action,based-on-comic-or-graphic-novel,34.361398
2504,The Gentlemen,action,r-(us),miramax-dimension,22000000.0,36471796.0,"December, 2019",contemporary-fiction,live-action,original-screenplay,40.861833
2505,Honest Thief,thriller-or-suspense,pg-13-(us),united-artists,30000000.0,14163574.0,"October, 2020",contemporary-fiction,live-action,original-screenplay,35.646974
2506,The Last Full Measure,drama,r-(us),walt-disney,20000000.0,2949212.0,"January, 2020",dramatization,live-action,based-on-real-life-events,34.623697


In [None]:
movies_df.to_csv('movies_df.csv', encoding = 'utf-8-sig')
files.download('movies_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#uploaded = files.upload()

movie_director_df = pd.read_csv("directors.csv")

movie_director_df

# **Setting up optimization**

## Getting movie specifications

In [None]:

def get_movie_info():

  chosen_categories = []

  budget=float(input(
        f"\nPlease enter a selected budget: "
    ))

  cast_size=int(input(
        f"\nDetermine the cast size: "
    ))

  for i in categories:

    chosen_type = input(
        f"\nFrom these types for the category of {i}:\n{categories[i].keys()}\nEnter your choice: "
    )

    if chosen_type in categories[i].keys():
      chosen_categories.append(actor_data[i][chosen_type])

  return chosen_categories,budget,cast_size



In [None]:
chosen_actor_data,budget,cast_size= get_movie_info()

print(chosen_actor_data)

print("\nbudget\n:",budget)
print("cast size:",cast_size)


Please enter a selected budget: 300000000

Determine the cast size: 10

From these types for the category of genres:
dict_keys(['adventure', 'action', 'comedy', 'drama', 'thriller-or-suspense', 'horror', 'romantic-comedy', 'musical'])
Enter your choice: action

From these types for the category of creative-types:
dict_keys(['contemporary-fiction', 'science-fiction', 'kids-fiction', 'historical-fiction', 'fantasy', 'super-hero', 'dramatization', 'factual'])
Enter your choice: super-hero

From these types for the category of production-methods:
dict_keys(['live-action', 'animation-and-live-action', 'digital-animation', 'hand-animation', 'stop-motion-animation', 'rotoscoping', 'multiple-production-methods'])
Enter your choice: live-action

From these types for the category of sources:
dict_keys(['original-screenplay', 'based-on-fictional-book-or-short-story', 'based-on-comic-or-graphic-novel', 'remake', 'based-on-tv', 'based-on-real-life-events', 'based-on-factual-book-or-article', 'spin

In [None]:
chosen_actor_df = pd.concat(chosen_actor_data, ignore_index = True)


for i in range(chosen_actor_df.shape[0]):
  if "," in chosen_actor_df.loc[i,'Name']:
    chosen_actor_df.loc[i,'Name']=chosen_actor_df.loc[i,'Name'].replace(",","")


chosen_actor_df = chosen_actor_df.drop(columns = ['Domestic Box Office', 'Movies', 'Average', 'Rank'])
chosen_actor_df = chosen_actor_df.groupby(['Name']).mean()



chosen_actor_df


Unnamed: 0_level_0,Value
Name,Unnamed: 1_level_1
A.D. Miles,12.584858
A.J. Benza,10.233618
A.J. Cook,17.663457
A.J. Johnson,18.429185
A.J. Michalka,15.427808
...,...
in-pyo Cha,10.526963
Édgar Flores,14.746361
Éric Bruneau,9.147933
Özgü Namal,10.605322


In [None]:
actor_value_df

In [None]:
actor_opt_input = pd.merge(actor_value_df,chosen_actor_df, on='Name', how='inner')
#actor_opt_input=generate_salaries(actor_value_df,True)

actor_opt_input

Unnamed: 0_level_0,Salary,Value_x,Value_y
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A.J. Cook,260.000000,17.318276,17.663457
A.J. Johnson,,12.698267,18.429185
A.J. Michalka,17.000000,12.740982,15.427808
Aaliyah,250.000000,16.677946,18.403327
Aaron Eckhart,540.119977,17.278333,20.517718
...,...,...,...
Zoe Kazan,50.000000,15.476550,17.595614
Zoe Saldana,1258.571429,17.963535,23.886123
Zoey Deutch,176.666667,16.721547,19.215887
Zooey Deschanel,293.351330,12.113573,18.204120


In [None]:
uploaded = files.upload()
director_value_df = pd.read_csv('director_value_df.csv')

In [None]:
director_value_df = generate_salaries(director_value_df, False)
director_value_df = director_value_df.drop(columns = ['Production Budget', 'Movie'])
director_value_df = director_value_df.groupby(['Name']).mean()
director_value_df

# **Optimization**

In [None]:
num_actors = actor_opt_input.shape[0]

x = cp.Variable(num_actors,boolean=True)

actor_util=actor_opt_input['Value']
salaries=actor_opt_input['Salary']

actor_util

obj=cp.Maximize((x@actor_util)/cast_size)

cons=[]

cons.append(cp.sum(x)<=cast_size)

cons.append(x@salaries<=(0.15*budget))

prob = cp.Problem(obj,cons)

prob.solve(verbose=False)  # verbose = True allows you to see the solution process
print('The objective value is {}.'.format(obj.value))

x_np_array = x.value.astype(int)  # extract the x values as a np array
x_values = pd.Series(x_np_array, index =x)  # convert the np array to a Dataframe
selected = x_values[x_values == 1].index.values  # get assignments

# Print outputs
print(np.where(x_values==1)[0])

print(actor_opt_input.iloc[np.where(x_values==1)[0]])


KeyError: ignored