# Data Collection

In [1]:
from sklearn.preprocessing import MultiLabelBinarizer
import requests
import pandas as pd
import time

In [2]:
mlb = MultiLabelBinarizer()

In [3]:
# Define your API key and base URL
api_key = "e77a6585dca5e685d6366e51538ed27d"  # replace with your API key
base_url = "https://api.themoviedb.org/3/movie/popular"  # endpoint for popular movies

In [4]:
# Set up the initial parameters for the API request
params = {
    'api_key': api_key,
    'language': 'en-US',
    'page': 1  # starting from the first page
}

In [5]:
all_movies = []  # list to store all movies data

In [6]:
# Initial request to get the total number of pages
response = requests.get(base_url, params=params)
data = response.json()

In [None]:
# Check if initial data fetch is successful
if "results" in data:
    total_pages = min(data['total_pages'], 500)  # limit to 500 pages as per TMDb API restriction
    print(f"Total pages to fetch: {total_pages}")
    
    # Iterate over each page to collect all movie data
    for page in range(1, total_pages + 1):
        params['page'] = page
        response = requests.get(base_url, params=params)
        data = response.json()
        
        # Add results to all_movies list if successful
        if "results" in data:
            all_movies.extend(data["results"])
            print(f"Fetched page {page} of {total_pages}")
        else:
            print(f"Error fetching data for page {page}: {data}")
            break  # stop if there’s an error fetching a page
        
        time.sleep(0.2)  # slight delay to avoid hitting rate limits

In [128]:
# Convert to DataFrame
df = pd.DataFrame(all_movies)

In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   adult              10000 non-null  bool   
 1   backdrop_path      9683 non-null   object 
 2   genre_ids          10000 non-null  object 
 3   id                 10000 non-null  int64  
 4   original_language  10000 non-null  object 
 5   original_title     10000 non-null  object 
 6   overview           10000 non-null  object 
 7   popularity         10000 non-null  float64
 8   poster_path        9948 non-null   object 
 9   release_date       10000 non-null  object 
 10  title              10000 non-null  object 
 11  video              10000 non-null  bool   
 12  vote_average       10000 non-null  float64
 13  vote_count         10000 non-null  int64  
dtypes: bool(2), float64(2), int64(2), object(8)
memory usage: 957.2+ KB


In [132]:
df = df.drop(columns=['backdrop_path'])
df = df.dropna()

In [3]:
df = df[df['genre_ids'] not in [10749, 10751]]

NameError: name 'df' is not defined

In [134]:
df.isna().sum()

adult                0
genre_ids            0
id                   0
original_language    0
original_title       0
overview             0
popularity           0
poster_path          0
release_date         0
title                0
video                0
vote_average         0
vote_count           0
dtype: int64

In [136]:
encoded = mlb.fit_transform(df['genre_ids'])

In [138]:
encoded_df = pd.DataFrame(encoded, columns=mlb.classes_)
df = pd.concat([df, encoded_df], axis=1)

In [140]:
df = df.drop(columns=['genre_ids'])

In [142]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9997 entries, 0 to 9467
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   adult              9948 non-null   object 
 1   id                 9948 non-null   float64
 2   original_language  9948 non-null   object 
 3   original_title     9948 non-null   object 
 4   overview           9948 non-null   object 
 5   popularity         9948 non-null   float64
 6   poster_path        9948 non-null   object 
 7   release_date       9948 non-null   object 
 8   title              9948 non-null   object 
 9   video              9948 non-null   object 
 10  vote_average       9948 non-null   float64
 11  vote_count         9948 non-null   float64
 12  12                 9948 non-null   float64
 13  14                 9948 non-null   float64
 14  16                 9948 non-null   float64
 15  18                 9948 non-null   float64
 16  27                 9948 non-n

In [144]:
# Define the mapping dictionary with integer keys
genre_map = {
    'Adventure': 12,
    'Fantasy': 14,
    'Animation': 16,
    'Drama': 18,
    'Horror': 27,
    'Action': 28,
    'Comedy': 35,
    'History': 36,
    'Western': 37,
    'Thriller': 53,
    'Crime': 80,
    'Documentary': 99,
    'Science_Fiction': 878,
    'Mystery': 9648,
    'Music': 10402,
    'Romance': 10749,
    'Family': 10751,
    'War': 10752,
    'TV_Movie': 10770
}

In [146]:
# Apply the mapping to create new columns
for genre, col_number in genre_map.items():
    if str(col_number) in df.columns:
        df[f'Genre.{genre}'] = df[str(col_number)]
        df.drop(columns = str(col_number), inplace = True)
    elif col_number in df.columns:
        df[f'Genre.{genre}'] = df[col_number]
        df.drop(columns = col_number, inplace = True)
    else:
        print(f"Column '{col_number}' not found in the DataFrame.")

In [148]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9997 entries, 0 to 9467
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  9948 non-null   object 
 1   id                     9948 non-null   float64
 2   original_language      9948 non-null   object 
 3   original_title         9948 non-null   object 
 4   overview               9948 non-null   object 
 5   popularity             9948 non-null   float64
 6   poster_path            9948 non-null   object 
 7   release_date           9948 non-null   object 
 8   title                  9948 non-null   object 
 9   video                  9948 non-null   object 
 10  vote_average           9948 non-null   float64
 11  vote_count             9948 non-null   float64
 12  Genre.Adventure        9948 non-null   float64
 13  Genre.Fantasy          9948 non-null   float64
 14  Genre.Animation        9948 non-null   float64
 15  Genre.Dra

In [162]:
df['original_title'] = df['title']
df.drop(columns='original_title', inplace=True)

In [168]:
# Save the DataFrame to a CSV file
df.to_csv("updated_genre_data.csv",index=False)

In [166]:
df

Unnamed: 0,adult,id,original_language,overview,popularity,poster_path,release_date,title,video,vote_average,...,Genre.Thriller,Genre.Crime,Genre.Documentary,Genre.Science_Fiction,Genre.Mystery,Genre.Music,Genre.Romance,Genre.Family,Genre.War,Genre.TV_Movie
0,False,1034541.0,en,Five years after surviving Art the Clown's Hal...,6883.159,/63xYQj1BwRFielxsBDXvHIJyXVm.jpg,2024-10-09,Terrifier 3,False,7.300,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,912649.0,en,Eddie and Venom are on the run. Hunted by both...,5590.757,/k42Owka8v91trK1qMYwCQCNwJKr.jpg,2024-10-22,Venom: The Last Dance,False,6.700,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,False,1184918.0,en,"After a shipwreck, an intelligent robot called...",4321.421,/wTnV3PCVW5O92JMrFvvrRcV39RU.jpg,2024-09-12,The Wild Robot,False,8.543,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,False,698687.0,en,The untold origin story of Optimus Prime and M...,2550.704,/iHPIBzrjJHbXeY9y7VVbEVNt7LW.jpg,2024-09-11,Transformers One,False,8.167,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,False,933260.0,en,Have you ever dreamt of a better version of yo...,2881.789,/lqoMzCcZYEFK729d6qzt349fB4o.jpg,2024-09-07,The Substance,False,7.300,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9463,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9464,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9465,,,,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9466,,,,,,,,,,,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [170]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9997 entries, 0 to 9467
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  9948 non-null   object 
 1   id                     9948 non-null   float64
 2   original_language      9948 non-null   object 
 3   overview               9948 non-null   object 
 4   popularity             9948 non-null   float64
 5   poster_path            9948 non-null   object 
 6   release_date           9948 non-null   object 
 7   title                  9948 non-null   object 
 8   video                  9948 non-null   object 
 9   vote_average           9948 non-null   float64
 10  vote_count             9948 non-null   float64
 11  Genre.Adventure        9948 non-null   float64
 12  Genre.Fantasy          9948 non-null   float64
 13  Genre.Animation        9948 non-null   float64
 14  Genre.Drama            9948 non-null   float64
 15  Genre.Hor

# Analysis of Data

In [172]:
df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count,Genre.Adventure,Genre.Fantasy,Genre.Animation,Genre.Drama,Genre.Horror,Genre.Action,...,Genre.Thriller,Genre.Crime,Genre.Documentary,Genre.Science_Fiction,Genre.Mystery,Genre.Music,Genre.Romance,Genre.Family,Genre.War,Genre.TV_Movie
count,9948.0,9948.0,9948.0,9948.0,9948.0,9948.0,9948.0,9948.0,9948.0,9948.0,...,9948.0,9948.0,9948.0,9948.0,9948.0,9948.0,9948.0,9948.0,9948.0,9948.0
mean,397700.6,40.099992,6.311956,1708.568456,0.176518,0.127965,0.126759,0.388118,0.162646,0.264475,...,0.249598,0.125352,0.022618,0.120728,0.081122,0.02312,0.166968,0.124045,0.028649,0.021813
std,427477.0,133.422275,1.508475,3200.289091,0.381279,0.334068,0.33272,0.487346,0.369061,0.441076,...,0.432802,0.331134,0.148689,0.325827,0.273036,0.150293,0.372966,0.329649,0.166826,0.146081
min,5.0,7.907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11807.5,16.07175,5.9,86.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,241845.5,21.716,6.566,516.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,758899.8,34.83875,7.164,1815.25,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1378444.0,6883.159,10.0,36478.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [174]:
df["Genre.Adventure"].sum()

1756.0