# NOTEBOOK PROJET FINAL

## IMPORT LIBRARIES

In [5]:
import pandas as pd
from datetime import datetime, timedelta
import requests
import gzip
from io import BytesIO
import json

## DAILY EXPORT MOVIES

In [6]:
# Get yesterday's date
today = datetime.now()
yesterday = today - timedelta(days=1)
formatted_date = yesterday.strftime("%m_%d_%Y")

# Get the URL file
url_begin = 'http://files.tmdb.org/p/exports/'
url_end = f'movie_ids_{formatted_date}.json.gz'
full_url = url_begin + url_end

In [7]:
# Send a GET request to the URL
response = requests.get(full_url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Read the gzipped content into a BytesIO object
    compressed_data = BytesIO(response.content)

    # Use gzip to decompress the data
    with gzip.GzipFile(fileobj=compressed_data, mode='rb') as f:
        # Read the decompressed JSON data
        json_data = f.read().decode('utf-8')

    # Load JSON data into a Pandas DataFrame
    df = pd.read_json(json_data, lines = True)

    # Now, you have the data in the 'df' DataFrame
    print(df.head())

else:
    # Print an error message if the request was not successful
    print(f"Failed to download the file. Status code: {response.status_code}")

  df = pd.read_json(json_data, lines = True)


   adult     id                      original_title  popularity  video
0  False   3924                             Blondie       1.742  False
1  False   6124                 Der Mann ohne Namen       0.600  False
2  False   8773                 L'Amour à vingt ans       3.555  False
3  False  25449  New World Disorder 9: Never Enough       0.682  False
4  False  31975      Sesame Street: Elmo Loves You!       0.600   True


In [8]:
# Define a regular expression to match English characters
english_pattern = r'^[a-zA-Z0-9\s]+$'

# Use str.contains to filter rows based on the regular expression
df = df[df['original_title'].str.contains(english_pattern, na=False)]

# Filter columns
df = df.loc[:,['id', 'original_title', 'popularity']]

# Sort by popularity and filter 1000 first rows
data = df.sort_values(by=['popularity'], ascending=False).head(1000)

# Create CSV files
df.to_csv('src/daily_export_movies.csv', index=False) # all movies
data.to_csv('src/daily_export_movies_top_1000.csv', index=False) # first 1000 rows

## API CONNEXION

In [9]:
# Define API key as headers
headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJhMTFkY2JjYzE4MTFlNWIxOGI3MDg1MTIyOWRiOGYzZSIsInN1YiI6IjY1OTQzNWVlY2U0ZGRjNmQ5MDdlYWQxNSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.5VYKD-qYGgixOfyjsDIR5We_wmJklWml5waulWzQVTA"
}

### 1) MOVIE DETAILS (JSON)

In [10]:
# URL + Headers
url_start = "https://api.themoviedb.org/3/movie/"
url_end = '?language=en-US'

# Boucle for pour récupérer les infos de toutes les données
movie_data_list = []

for movie_id in data.iloc[:,0]:
    url = url_start + str(movie_id) + url_end
    response = requests.get(url, headers=headers)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Assuming the response content is JSON, you may need to adjust this accordingly
        movie_details = response.json()
        
        # Extract only the desired fields
        movie_data = {
            "id": movie_details.get("id", None),
            "original_title": movie_details.get("original_title", None),
            "title": movie_details.get("title", None),
            "genres": movie_details.get("genres", []),
            "release_date": movie_details.get("release_date", None),
            "vote_average": movie_details.get("vote_average", None),
            "poster_path": movie_details.get("poster_path", None),
        }

        # Append the selected movie data to the list
        movie_data_list.append(movie_data)
    else:
        print(f"Error fetching details for movie_id: {movie_id}")

In [11]:
# Specify the path to the JSON file
json_file_path = "src/movie_details.json"

# Write the list of dictionaries to the JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(movie_data_list, json_file, indent=2)

print(f"Movie data has been stored in {json_file_path}")

Movie data has been stored in src/movie_details.json


### 2) MOVIE KEYWORDS (JSON)

In [12]:
# URL + Headers
url_start = "https://api.themoviedb.org/3/movie/"
url_end = '/keywords'

# Boucle for pour récupérer les infos de toutes les données
movie_keywords_list = []

for movie_id in data.iloc[:,0]:
    url = url_start + str(movie_id) + url_end
    response = requests.get(url, headers=headers)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Assuming the response content is JSON, you may need to adjust this accordingly
        keywords = response.json()
        
        # Append the selected movie data to the list
        movie_keywords_list.append(keywords)
    else:
        print(f"Error fetching details for movie_id: {movie_id}")

In [13]:
# Specify the path to the JSON file
json_file_path = "src/movie_keywords.json"

# Write the list of dictionaries to the JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(movie_keywords_list, json_file, indent=2)

print(f"Movie keywords has been stored in {json_file_path}")

Movie keywords has been stored in src/movie_keywords.json


### 3) MOVIE CREDITS (JSON)

In [14]:
# URL + Headers
url_start = "https://api.themoviedb.org/3/movie/"
url_end = '/credits'

# Boucle for pour récupérer les infos de toutes les données
movie_credits_list = []

for movie_id in data.iloc[:,0]:
    url = url_start + str(movie_id) + url_end
    response = requests.get(url, headers=headers)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Assuming the response content is JSON, you may need to adjust this accordingly
        credits = response.json()
        
        # Extract only the desired fields
        credits_data = {
            "id": credits.get("id", None),
            "cast": []
        }
        
        # Extract actor information
        cast_data = credits.get('cast', [])
        for actor in cast_data:
            actor_info = {
                "id_actor": actor.get("id", None),
                "name": actor.get("name", None)
            }
            credits_data["cast"].append(actor_info)

        # Append the selected movie data to the list
        movie_credits_list.append(credits_data)
    else:
        print(f"Error fetching details for movie_id: {movie_id}")

In [15]:
# Specify the path to the JSON file
json_file_path = "src/movie_credits.json"

# Write the list of dictionaries to the JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(movie_credits_list, json_file, indent=2)

print(f"Movie keywords has been stored in {json_file_path}")

Movie keywords has been stored in src/movie_credits.json


## COMPILATION INFOS CSV

### movie_details.json

In [16]:
# Load JSON data
json_file_path = 'src/movie_details.json'

with open(json_file_path, 'r') as json_file:
    json_data = json.load(json_file)

# Create dataframe from imported json
df_movie = pd.DataFrame(json_data)

# Convert "genres" column from dict to string
df_movie['genres'] = df_movie['genres'].apply(lambda x: [genre['name'] for genre in x]).apply(lambda x: ', '.join(x))

# Convert "release_date" column and remove unreleased movies
df_movie['release_date'] = pd.to_datetime(df_movie['release_date'])
df_movie = df_movie.dropna(subset=['release_date'])

In [17]:
df_movie

Unnamed: 0,id,original_title,title,genres,release_date,vote_average,poster_path
0,1029575,The Family Plan,The Family Plan,"Action, Comedy",2023-12-14,7.398,/a6syn9qcU4a54Lmi3JoIr1XvhFU.jpg
1,891699,Silent Night,Silent Night,"Action, Crime",2023-11-30,5.862,/tlcuhdNMKNGEVpGqBZrAaOOf1A6.jpg
2,572802,Aquaman and the Lost Kingdom,Aquaman and the Lost Kingdom,"Action, Adventure, Fantasy",2023-12-20,6.487,/8xV47NDrjdZDpkVcCFqkdHa3T0C.jpg
3,1071215,Thanksgiving,Thanksgiving,"Horror, Mystery, Thriller",2023-11-16,6.700,/f5f3TEVst1nHHyqgn7Z3tlwnBIH.jpg
4,1131755,A Creature Was Stirring,A Creature Was Stirring,Horror,2023-12-08,5.000,/ikQG3byEFyfwcnF0zmyNtXTmr5v.jpg
...,...,...,...,...,...,...,...
995,632727,Texas Chainsaw Massacre,Texas Chainsaw Massacre,"Horror, Thriller",2022-02-18,5.208,/7sKiGNWFM15WNyY7LYd5vmb3brO.jpg
996,9336,Police Academy,Police Academy,"Comedy, Crime",1984-03-22,6.666,/m5a1U549gokC1kxsqgQoRb6XpFg.jpg
997,761851,His Only Son,His Only Son,"Drama, Fantasy",2023-03-30,7.509,/e85UDGmD9HmKcS0BOgoha029bkK.jpg
998,9702,Bound by Honor,Bound by Honor,"Crime, Action, Drama, Thriller",1993-02-05,8.111,/gvP6R6juhe2IpCG7QGDgjyUvm0g.jpg


### movie_keywords.json

In [18]:
# Load JSON data
json_file_path = 'src/movie_keywords.json'

with open(json_file_path, 'r') as json_file:
    json_data = json.load(json_file)

# Create dataframe from imported json
df_keywords = pd.DataFrame(json_data)

# Convert "genres" column from dict to string
df_keywords['keywords'] = df_keywords['keywords'].apply(lambda x: [genre['name'] for genre in x]).apply(lambda x: ', '.join(x))

In [19]:
df_keywords

Unnamed: 0,id,keywords
0,1029575,"assassin, las vegas, family, hidden identity, ..."
1,891699,"holiday, revenge, death of son, christmas, no ..."
2,572802,"superhero, secret society, half-brother, seque..."
3,1071215,"small town, sadism, holiday, massachusetts, th..."
4,1131755,"monster, nurse, mythical creature, creature, b..."
...,...,...
995,632727,"mass murder, texas, gore, sequel, murder, deca..."
996,9336,"trainer, recruit, shenanigan, police academy"
997,761851,"sacrifice, faith, son, bible, father, god, abr..."
998,9702,"street gang, ghetto, juvenile prison, artist, ..."


### movie_credits.json

In [20]:
# Load JSON data
json_file_path = 'src/movie_credits.json'

with open(json_file_path, 'r') as json_file:
    json_data = json.load(json_file)

# Create dataframe from imported json
df_credits = pd.DataFrame(json_data)

# Convert "genres" column from dict to string
df_credits['cast'] = df_credits['cast'].apply(lambda x: [genre['name'] for genre in x]).apply(lambda x: ', '.join(x))

In [21]:
df_credits

Unnamed: 0,id,cast
0,1029575,"Mark Wahlberg, Michelle Monaghan, Maggie Q, Zo..."
1,891699,"Joel Kinnaman, Kid Cudi, Harold Torres, Catali..."
2,572802,"Jason Momoa, Patrick Wilson, Yahya Abdul-Matee..."
3,1071215,"Patrick Dempsey, Nell Verlaque, Addison Rae, J..."
4,1131755,"Chrissy Metz, Annalise Basso, Connor Paolo, Sc..."
...,...,...
995,632727,"Sarah Yarkin, Elsie Fisher, Mark Burnham, Jaco..."
996,9336,"Steve Guttenberg, Kim Cattrall, G.W. Bailey, B..."
997,761851,"Nicola Mouawad, Sara Seyed, Scot Cooper, Luis ..."
998,9702,"Damian Chapa, Jesse Borrego, Benjamin Bratt, E..."


### Merge in one dataframe

In [22]:
# Merge all dataframe in one : df_merged
merged_df = pd.merge(df_movie, df_keywords, on='id', how='left')

merged_df = pd.merge(merged_df, df_credits, on='id', how='left')

# Convert to CSV
merged_df.to_csv('src/movies.csv', index=False)
print("merged_df has been stored in src/movies.csv")

merged_df has been stored in src/movies.csv


In [23]:
merged_df

Unnamed: 0,id,original_title,title,genres,release_date,vote_average,poster_path,keywords,cast
0,1029575,The Family Plan,The Family Plan,"Action, Comedy",2023-12-14,7.398,/a6syn9qcU4a54Lmi3JoIr1XvhFU.jpg,"assassin, las vegas, family, hidden identity, ...","Mark Wahlberg, Michelle Monaghan, Maggie Q, Zo..."
1,891699,Silent Night,Silent Night,"Action, Crime",2023-11-30,5.862,/tlcuhdNMKNGEVpGqBZrAaOOf1A6.jpg,"holiday, revenge, death of son, christmas, no ...","Joel Kinnaman, Kid Cudi, Harold Torres, Catali..."
2,572802,Aquaman and the Lost Kingdom,Aquaman and the Lost Kingdom,"Action, Adventure, Fantasy",2023-12-20,6.487,/8xV47NDrjdZDpkVcCFqkdHa3T0C.jpg,"superhero, secret society, half-brother, seque...","Jason Momoa, Patrick Wilson, Yahya Abdul-Matee..."
3,1071215,Thanksgiving,Thanksgiving,"Horror, Mystery, Thriller",2023-11-16,6.700,/f5f3TEVst1nHHyqgn7Z3tlwnBIH.jpg,"small town, sadism, holiday, massachusetts, th...","Patrick Dempsey, Nell Verlaque, Addison Rae, J..."
4,1131755,A Creature Was Stirring,A Creature Was Stirring,Horror,2023-12-08,5.000,/ikQG3byEFyfwcnF0zmyNtXTmr5v.jpg,"monster, nurse, mythical creature, creature, b...","Chrissy Metz, Annalise Basso, Connor Paolo, Sc..."
...,...,...,...,...,...,...,...,...,...
994,632727,Texas Chainsaw Massacre,Texas Chainsaw Massacre,"Horror, Thriller",2022-02-18,5.208,/7sKiGNWFM15WNyY7LYd5vmb3brO.jpg,"mass murder, texas, gore, sequel, murder, deca...","Sarah Yarkin, Elsie Fisher, Mark Burnham, Jaco..."
995,9336,Police Academy,Police Academy,"Comedy, Crime",1984-03-22,6.666,/m5a1U549gokC1kxsqgQoRb6XpFg.jpg,"trainer, recruit, shenanigan, police academy","Steve Guttenberg, Kim Cattrall, G.W. Bailey, B..."
996,761851,His Only Son,His Only Son,"Drama, Fantasy",2023-03-30,7.509,/e85UDGmD9HmKcS0BOgoha029bkK.jpg,"sacrifice, faith, son, bible, father, god, abr...","Nicola Mouawad, Sara Seyed, Scot Cooper, Luis ..."
997,9702,Bound by Honor,Bound by Honor,"Crime, Action, Drama, Thriller",1993-02-05,8.111,/gvP6R6juhe2IpCG7QGDgjyUvm0g.jpg,"street gang, ghetto, juvenile prison, artist, ...","Damian Chapa, Jesse Borrego, Benjamin Bratt, E..."
