In [1]:
import pandas as pd
import requests
import numpy as np

from bs4 import BeautifulSoup

def get_movie_details(title):
    # Use the OMDB API to get movie details
    omdb_api_key = "8c00594b"  # Get your free API key from http://www.omdbapi.com/
    omdb_url = f"http://www.omdbapi.com/?apikey={omdb_api_key}&t={title}&type=movie"
    
    response = requests.get(omdb_url)
    data_json = response.json()

    if data_json.get('Response') == 'True':
        return {
            'genres': data_json.get('Genre', '').split(', '),
            'director_name': data_json.get('Director'),
            'actor_1_name': data_json.get('Actors', '').split(',')[0],
            'actor_2_name': data_json.get('Actors', '').split(',')[1] if ',' in data_json.get('Actors', '') else None,
            'actor_3_name': data_json.get('Actors', '').split(',')[2] if ',' in data_json.get('Actors', '') else None
        }
    else:
        return None

# Extracting features of 2020 movies from Wikipedia
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2020"
source = requests.get(link).text
soup = BeautifulSoup(source, 'html.parser')
tables = soup.find_all('table', class_='wikitable sortable')
df_list = [pd.read_html(str(table))[0] for table in tables]
df_2020 = pd.concat(df_list, ignore_index=True)

df_2020 = df_2020[['Title', 'Cast and crew']].copy()

# Applying API function to get movie details
df_2020['movie_details'] = df_2020['Title'].apply(lambda x: get_movie_details(str(x)))

# Expanding the movie_details dictionary into columns
df_2020 = pd.concat([df_2020, df_2020['movie_details'].apply(pd.Series)], axis=1)

# Convert lists to strings
df_2020['actor_1_name'] = df_2020['actor_1_name'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
df_2020['actor_2_name'] = df_2020['actor_2_name'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
df_2020['actor_3_name'] = df_2020['actor_3_name'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
df_2020['director_name'] = df_2020['director_name'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
df_2020['genres'] = df_2020['genres'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

# Dropping unnecessary columns
df_2020.drop(['movie_details'], axis=1, inplace=True)

# Handling missing values
df_2020['actor_2_name'].fillna(value=np.NaN, inplace=True)
df_2020['actor_3_name'].fillna(value=np.NaN, inplace=True)

# Renaming columns
df_2020.rename(columns={'Title': 'movie_title', 'Cast and crew': 'comb'}, inplace=True)

# Lowercasing movie titles
df_2020['movie_title'] = df_2020['movie_title'].str.lower()

# Creating a combined column
df_2020['comb'] = df_2020['actor_1_name'] + ' ' + df_2020['actor_2_name'] + ' ' + df_2020['actor_3_name'] + ' ' + df_2020['director_name'] + ' ' + df_2020['genres']

# Dropping rows with any missing values
df_2020 = df_2020.dropna(how='any')

# Reading the old data
try:
    old_df = pd.read_csv('main_data.csv')
except FileNotFoundError:
    old_df = pd.DataFrame()

# Appending the new data
final_df = pd.concat([old_df, df_2020], ignore_index=True)

# Writing to CSV
final_df.to_csv('main_data.csv', index=False)



  df_list = [pd.read_html(str(table))[0] for table in tables]
  df_list = [pd.read_html(str(table))[0] for table in tables]
  df_list = [pd.read_html(str(table))[0] for table in tables]
  df_list = [pd.read_html(str(table))[0] for table in tables]


In [2]:
print(df_2020)

               movie_title                                               comb  \
0               the grudge  Sarah Michelle Gellar  Jason Behr  Clea DuVall...   
1               underwater  Kristen Stewart  Vincent Cassel  Mamoudou Athi...   
2              like a boss  Tiffany Haddish  Rose Byrne  Salma Hayek Migue...   
3            three christs  Richard Gere  Peter Dinklage  Bradley Whitford...   
4        inherit the viper  Tara Buck  Margarita Levieva  Lobo Sebastian A...   
..                     ...                                                ...   
270       we can be heroes  YaYa Gosselin  Pedro Pascal  Priyanka Chopra J...   
271      news of the world  Tom Hanks  Helena Zengel  Tom Astor Paul Green...   
272  one night in miami...  Kingsley Ben-Adir  Eli Goree  Aldis Hodge Regi...   
273  promising young woman  Carey Mulligan  Bo Burnham  Alison Brie Emeral...   
274      pieces of a woman  Vanessa Kirby  Shia LaBeouf  Ellen Burstyn Kor...   

                      genre

In [3]:
final_df


Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
6388,Robert Rodriguez,YaYa Gosselin,Pedro Pascal,Priyanka Chopra Jonas,Action Comedy Drama,we can be heroes,YaYa Gosselin Pedro Pascal Priyanka Chopra J...
6389,Paul Greengrass,Tom Hanks,Helena Zengel,Tom Astor,Action Adventure Drama,news of the world,Tom Hanks Helena Zengel Tom Astor Paul Green...
6390,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,Drama,one night in miami...,Kingsley Ben-Adir Eli Goree Aldis Hodge Regi...
6391,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,Crime Drama Mystery,promising young woman,Carey Mulligan Bo Burnham Alison Brie Emeral...
