In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
api_key = 'f422b58e'

def fetch_movie_data(movie_titles, api_key):
    movie_data = []
    
    for title in movie_titles:
        search_url = f'http://www.omdbapi.com/?t={title}&apikey={api_key}'
        response = requests.get(search_url)
        
        # Check for a successful response
        if response.status_code == 200:
            try:
                movie_info = response.json()
                if movie_info['Response'] == 'True':
                    awards_text = movie_info.get('Awards', 'N/A')
                    wins, nominations = "0", "0"
                    if "wins" in awards_text and "nominations" in awards_text:
                        wins = awards_text.split("wins")[0].strip()
                        nominations = awards_text.split("&")[-1].split("nominations")[0].strip()

                    movie_data.append({
                        "Title": movie_info.get('Title', 'N/A'),
                        "Year": movie_info.get('Year', 'N/A'),
                        "Duration": movie_info.get('Runtime', 'N/A'),
                        "IMDB Rating": movie_info.get('imdbRating', 'N/A'),
                        "Genre": movie_info.get('Genre', 'N/A'),
                        "Actors": movie_info.get('Actors', 'N/A'),
                        "Nominations": nominations,
                        "Wins": wins,
                        "Popularity": movie_info.get('imdbVotes', 'N/A')
                    })
                else:
                    print(f"Could not find results for {title}")
            except ValueError:
                print(f"Error decoding JSON for {title}: {response.text}")
        else:
            print(f"Failed to fetch data for {title}: {response.status_code}")
    
    return pd.DataFrame(movie_data)


In [3]:
movie_titles = [
    "Inception", "The Dark Knight", "Interstellar", "The Matrix", "Pulp Fiction",
    "The Lord of the Rings: The Fellowship of the Ring", "The Godfather", "The Shawshank Redemption",
    "Fight Club", "Forrest Gump", "The Empire Strikes Back", "The Dark Knight Rises", "Gladiator",
    "The Silence of the Lambs", "Saving Private Ryan", "Braveheart", "Schindler's List", "The Lion King",
    "Jurassic Park", "The Avengers", "Titanic", "The Departed", "The Wolf of Wall Street", "Django Unchained",
    "The Terminator", "Alien", "Blade Runner", "Goodfellas", "The Usual Suspects", "The Big Lebowski",
    "The Sixth Sense", "Se7en", "Avatar", "Avengers: Endgame", "Back to the Future", "Indiana Jones and the Last Crusade",
    "Harry Potter and the Philosopher's Stone", "Pirates of the Caribbean: The Curse of the Black Pearl",
    "Toy Story", "Finding Nemo", "The Incredibles", "Inside Out", "The Exorcist", "Jaws", "Rocky",
    "A Clockwork Orange", "The Shining", "E.T. the Extra-Terrestrial", "The Breakfast Club", "Ferris Bueller's Day Off",
    "The Truman Show", "The Princess Bride", "Monty Python and the Holy Grail", "Groundhog Day", "The Grand Budapest Hotel",
    "La La Land", "Get Out", "Mad Max: Fury Road", "The Godfather: Part II", "Apocalypse Now", "Blade Runner 2049",
    "Casino Royale", "Doctor Strange", "Guardians of the Galaxy", "Logan", "The Revenant", "Spider-Man: Homecoming",
    "The Hateful Eight", "Once Upon a Time in Hollywood", "Parasite", "The Irishman", "1917",
    "Jojo Rabbit", "Joker", "Frozen", "The Shape of Water", "Three Billboards Outside Ebbing, Missouri",
    "Lady Bird", "The King's Speech", "The Social Network", "The Pursuit of Happyness", "A Beautiful Mind",
    "Shutter Island", "Inglourious Basterds", "Catch Me If You Can", "The Curious Case of Benjamin Button",
    "Black Swan", "Slumdog Millionaire", "The Green Mile", "The Notebook", "A Star Is Born",
    "The Help", "Gravity", "The Martian", "Gone Girl", "Whiplash", "The Revenant",
    "Shawshank Redemption", "Schindler's List", "Raging Bull", "Casablanca", "Citizen Kane",
    "Gone with the Wind", "Lawrence of Arabia", "The Godfather: Part II", "One Flew Over the Cuckoo's Nest", "Star Wars",
    "12 Angry Men", "Psycho", "Rear Window", "The Good, the Bad and the Ugly", "Sunset Boulevard",
    "Silence of the Lambs", "Raiders of the Lost Ark", "It's a Wonderful Life", "American Beauty", "Jaws", "The Exorcist",
    "The Silence of the Lambs", "Saving Private Ryan", "Braveheart", "The Lion King", "Titanic",
    "The Departed", "Gladiator", "Rocky", "E.T. the Extra-Terrestrial", "The Breakfast Club",
    "The Truman Show", "Fight Club", "Harry Potter and the Chamber of Secrets", "The Hunger Games", "The Last Samurai",
    "Pirates of the Caribbean: Dead Man's Chest", "The Prestige", "The Pursuit of Happyness", "Star Trek", "The Hobbit: An Unexpected Journey"
]

In [4]:
# Fetch movie data
df_movies = fetch_movie_data(movie_titles, api_key)

# Display movie data
print(df_movies.head(5))
print(df_movies.tail(5))

             Title  Year Duration IMDB Rating                      Genre  \
0        Inception  2010  148 min         8.8  Action, Adventure, Sci-Fi   
1  The Dark Knight  2008  152 min         9.0       Action, Crime, Drama   
2     Interstellar  2014  169 min         8.7   Adventure, Drama, Sci-Fi   
3       The Matrix  1999  136 min         8.7             Action, Sci-Fi   
4     Pulp Fiction  1994  154 min         8.9               Crime, Drama   

                                              Actors Nominations  \
0  Leonardo DiCaprio, Joseph Gordon-Levitt, Ellio...         220   
1        Christian Bale, Heath Ledger, Aaron Eckhart         165   
2  Matthew McConaughey, Anne Hathaway, Jessica Ch...         148   
3  Keanu Reeves, Laurence Fishburne, Carrie-Anne ...          52   
4      John Travolta, Uma Thurman, Samuel L. Jackson          72   

                Wins Popularity  
0  Won 4 Oscars. 159  2,645,456  
1  Won 2 Oscars. 164  2,974,670  
2    Won 1 Oscar. 44  2,259,078 

In [5]:
# Replace 'N/A' with NaN
df_movies['IMDB Rating'] = df_movies['IMDB Rating'].replace('N/A', pd.NA)

# Convert 'IMDB Rating' to float, coercing errors to NaN
df_movies['IMDB Rating'] = pd.to_numeric(df_movies['IMDB Rating'], errors='coerce')

# Drop rows with NaN values in 'IMDB Rating'
df_movies = df_movies.dropna(subset=['IMDB Rating'])

# Drop duplicates
df_movies = df_movies.drop_duplicates()

# Sort by 'IMDB Rating' and keep the top 100 films
df_movies = df_movies.nlargest(100, 'IMDB Rating').reset_index(drop=True)

# Display the final DataFrame with top 100 movies
print(df_movies)

                       Title  Year Duration  IMDB Rating  \
0   The Shawshank Redemption  1994  142 min          9.3   
1              The Godfather  1972  175 min          9.2   
2            The Dark Knight  2008  152 min          9.0   
3           Schindler's List  1993  195 min          9.0   
4      The Godfather Part II  1974  202 min          9.0   
..                       ...   ...      ...          ...   
95               Jojo Rabbit  2019  108 min          7.9   
96                 Star Trek  2009  127 min          7.9   
97        The Breakfast Club  1985   97 min          7.8   
98  Ferris Bueller's Day Off  1986  103 min          7.8   
99                   Get Out  2017  104 min          7.8   

                        Genre  \
0                       Drama   
1                Crime, Drama   
2        Action, Crime, Drama   
3   Biography, Drama, History   
4                Crime, Drama   
..                        ...   
95         Comedy, Drama, War   
96  Action, Adv

In [6]:
# Create the initial DataFrame
df_actors = pd.DataFrame(df_movies)

# Split the 'Actors' column into lists
df_actors['Actors'] = df_actors['Actors'].str.split(', ')

# Explode the DataFrame to separate the actors into individual rows
df_actors = df_actors.explode('Actors')[['Actors']].reset_index(drop=True)

# Display the new DataFrame with individual actors
print(df_actors)

               Actors
0         Tim Robbins
1      Morgan Freeman
2          Bob Gunton
3       Marlon Brando
4           Al Pacino
..                ...
293         Alan Ruck
294          Mia Sara
295    Daniel Kaluuya
296  Allison Williams
297  Bradley Whitford

[298 rows x 1 columns]


In [7]:
# Alphabetize the DataFrame by 'Actors'
df_actors = df_actors.sort_values(by='Actors').reset_index(drop=True)

# Count duplicates for each actor
actor_counts = df_actors['Actors'].value_counts().reset_index()
actor_counts.columns = ['Actors', 'Count']

# Merge the counts back into the exploded DataFrame
df_actors = df_actors.merge(actor_counts, on='Actors')

# Drop duplicates
df_actors = df_actors.drop_duplicates().reset_index(drop=True)

# Display the final DataFrame
print(df_actors)

              Actors  Count
0      Aaron Eckhart      1
1          Al Pacino      2
2          Alan Ruck      1
3      Albert Brooks      1
4      Alec Guinness      1
..               ...    ...
234   William Holden      1
235  Woody Harrelson      1
236   Zachary Quinto      1
237      Zazie Beetz      1
238      Zoe Saldana      1

[239 rows x 2 columns]


In [8]:
# Sort by count in descending order and keep top 100 actors
top_actors = actor_counts.nlargest(100, 'Count').reset_index(drop=True)

# Display the final DataFrame with top 100 actors
print(top_actors)

               Actors  Count
0   Leonardo DiCaprio      8
1       Harrison Ford      6
2           Tom Hanks      5
3      Robert De Niro      4
4        Kevin Spacey      3
..                ...    ...
95         Will Smith      1
96     William Holden      1
97    Woody Harrelson      1
98     Zachary Quinto      1
99        Zazie Beetz      1

[100 rows x 2 columns]
