In [3]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [11]:

# Path: data_exploration.ipynb
# Load the dataset rotten_tomatoes_movies.csv in Data folder
df = pd.read_csv("Data/RT/rotten_tomatoes_movies.csv")

# Replace empty strings with NaN
df.replace("", pd.NA, inplace=True)

# Count the number of NaN values in each column
null_counts = df.isna().sum()

# give a percentage of NaN values in each column
null_counts_percent = null_counts / len(df) * 100

# Display the count of NaN values per column
print(null_counts)
print(null_counts_percent)


#create a dataframe that has only the titles for english movies (based on the language column)
df_english = df[df.originalLanguage == "English"]

# Display the first 5 rows of the dataframe
df_english.head()

# Display the number of rows and columns in the dataframe
df_english.shape

#sort the dataframe by the year column

# add a new coloumn called year and extract the year from the release date column that looks like this format: 2018-08-25
df_english["year"] = df_english.releaseDateStreaming.str[:4]

# sort the dataframe by the year column in descending order
df_english.sort_values(by="year", ascending=False, inplace=True)

# remove the null values in the year column
df_english = df_english[df_english.year.notnull()]

# remove the values that older then 2000
df_english = df_english[df_english.year.astype(int) >= 2000]

# dump the dataframe to a csv file
df_english.to_csv("Data/RT/rotten_tomatoes_movies_english_names.csv", index=False)

# give me a list of all the movie titles that are in the dataframe
df_english.title.tolist()

# store it in a text file
with open("Data/Final/rotten_tomatoes_movies_english_title.txt", "w") as f:
    # if its a flaot print it as a float, otherwise print it as a string
    for item in df_english.title.tolist():
        if isinstance(item, float):
            f.write(f"{item}\n")
        else:
            f.write(f"{item}\n")
            




id                           0
title                      367
audienceScore            70010
tomatoMeter             109381
rating                  129267
ratingContents          129267
releaseDateTheaters     112485
releaseDateStreaming     63838
runtimeMinutes           13827
genre                    11083
originalLanguage         13858
director                  4194
writer                   53142
boxOffice               128515
distributor             120253
soundMix                127341
dtype: int64
id                       0.000000
title                    0.256181
audienceScore           48.869871
tomatoMeter             76.352455
rating                  90.233704
ratingContents          90.233704
releaseDateTheaters     78.519175
releaseDateStreaming    44.561560
runtimeMinutes           9.651817
genre                    7.736392
originalLanguage         9.673456
director                 2.927585
writer                  37.095311
boxOffice               89.708777
distributor    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_english["year"] = df_english.releaseDateStreaming.str[:4]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_english.sort_values(by="year", ascending=False, inplace=True)


In [18]:
# lets try this: https://rotten-tomatoes-api.ue.r.appspot.com/movie/bad_boys 
# !pip3 install requests

import requests
import json
import pandas as pd
import time

movies = []

#open the text file and read it line by line and store it in a list for the first 100 movies
with open("Data/Final/rotten_tomatoes_movies_english_title.txt", "r") as f:
    movies = [line.strip() for line in f.readlines()][4300:4400]
f.close()


#loop through the list and make each of them lowercase and also replace space in between words with _
movies = [movie.lower().replace(" ", "_") for movie in movies]


print(movies)

# Create an empty list to store movie data
movie_data = []

# Loop through each movie title
for movie in movies:

    print(f"Scraping {movie}")
    # Create the API url
    url = f'https://rotten-tomatoes-api.ue.r.appspot.com/movie/{movie}'

    # sleep for 1 secs
    time.sleep(1)

    # Make a GET request to the API
    response = requests.get(url)

    print(response.text)

    # if it returns {"detail":"Not Found"} then skip it or if it says "Internal Server Error" then skip it
    if response.text == '{"detail":"Not Found"}' or response.text == "Internal Server Error":
        continue

    # Convert the response JSON into a Python dictionary
    data = json.loads(response.text)

    # print movie hAS BEEN APPENDED
    print(f"{movie} has been appended")

    # Append the movie data to the list
    movie_data.append(data)

# Create a dataframe from the list of dictionaries
df = pd.DataFrame(movie_data)

# Display the dataframe
df.head()

#append in a csv file if it doesnt exist
df.to_csv("Data/Final/4k-5k.csv", mode="a", header=False)

['handcart', "witchcraft_'70", 'rocky_road_to_dublin', 'confessions_of_a_time_traveler_-_the_man_from_3036', 'in_deep_with_ryan_lochte', 'yearly_departed', 'nowhere_to_land', 'slaughterhouse_of_the_rising_sun', 'paparazzi_eye_in_the_dark', 'unprescribed', 'roped', 'from_the_mixed-up_files_of_mrs._basil_e._frankweiler', 'little_richard', 'henri_dauman:_looking_up', 'dangerous_game', 'the_good_journey', 'zombiefied', 'the_medicine', 'time_warp:_the_greatest_cult_films_of_all-time_volume_1:_midnight_madness', 'a_woman_in_winter', 'death_of_me', 'sweet_parents', 'higher_love', 'cruel_jaws', 'turnover', "a_soldier's_revenge", 'the_human_virus', 'cold_light_of_day', 'dreams_of_gold:_the_mel_fisher_story', 'unlikely_family', 'the_siren', 'eagle_and_the_albatross', 'five_weddings_and_a_felony', 'the_24th', "it_ain't_easy", 'la_leyenda_negra', 'a_close_eye', 'proud_citizen', 'secondhand_hearts', 'dead_reckoning', 'cellular_aftershocks', "roald_dahl's_the_witches", 'the_surrogate', 'cowboys:_a_d

In [13]:
# Load the dataset output_with_movie_data.csv in Data folder
df = pd.read_csv("Data/Final/output_with_movie_data_NAN.csv")

# Replace 0's with NaN
df.replace(0, pd.NA, inplace=True)

# Count the number of NaN values in each column
null_counts = df.isna().sum()

# give a percentage of NaN values in each column
null_counts_percent = null_counts / len(df) * 100

# Display the count of NaN values per column
print(null_counts)

# Display the count of NaN values per column
print(null_counts_percent)

0                          10
1                           0
2                          43
3                           8
4                           1
5                           0
6                        1095
7                           0
8                           0
9                           0
10                          0
domestic_revenue          470
international_revenue    1387
worldwide_revenue         470
budget                   1974
dtype: int64
0                         0.453926
1                         0.000000
2                         1.951884
3                         0.363141
4                         0.045393
5                         0.000000
6                        49.704948
7                         0.000000
8                         0.000000
9                         0.000000
10                        0.000000
domestic_revenue         21.334544
international_revenue    62.959601
worldwide_revenue        21.334544
budget                   89.605084
dtype: float

In [21]:
import requests
import json
import pandas as pd
import time
from tqdm import tqdm
import os

start_movie_index = 5000  # Modify this as needed
end_movie_index = 5100  # Modify this as needed

# Define the checkpoint interval
checkpoint_interval = 100

# Define the folder to store checkpoint files
checkpoint_folder = "Data/Final/Checkpoints"

# Ensure the checkpoint folder exists
os.makedirs(checkpoint_folder, exist_ok=True)

# Function to load movies from file
def load_movies(filepath, start_index, end_index):
    with open(filepath, "r") as f:
        movies = [line.strip() for line in f.readlines()][start_index:end_index]
    return movies

# Function to save data to CSV in a folder
def save_to_csv(data, folder, filename):
    if not data:
        return  # If no data, don't save
    df = pd.DataFrame(data)
    filepath = os.path.join(folder, filename)
    df.to_csv(filepath, mode="a", header=not os.path.exists(filepath), index=False)

# Load movies and prepare the list
movies = load_movies("Data/Final/rotten_tomatoes_movies_english_title.txt", start_movie_index, end_movie_index)
movies = [movie.lower().replace(" ", "_") for movie in movies]

# Create an empty list to store movie data
movie_data = []

# Initialize progress bar
pbar = tqdm(total=end_movie_index-start_movie_index, desc="Scraping movies", unit="movie")

# Loop through each movie title
for i, movie in enumerate(movies, start=start_movie_index):
    try:
        url = f'https://rotten-tomatoes-api.ue.r.appspot.com/movie/{movie}'
        time.sleep(1)
        response = requests.get(url)

        if response.text == '{"detail":"Not Found"}' or "Internal Server Error" in response.text:
            pbar.update(1)
            continue
        data = json.loads(response.text)

        # Append the movie data to the list
        movie_data.append(data)

        # Save a checkpoint every 'checkpoint_interval' movies
        if (i+1) % checkpoint_interval == 0 or i+1 == end_movie_index:
            checkpoint_filename = f"movies_{i+1-checkpoint_interval}-{i+1}.csv"
            save_to_csv(movie_data, checkpoint_folder, checkpoint_filename)
            movie_data = []  # Reset the list after saving

    except Exception as e:
        print(f"An error occurred while scraping {movie}: {e}")
        continue  # Skip to the next movie

    finally:
        pbar.update(1)

# Save any remaining movie data to CSV
if movie_data:
    checkpoint_filename = f"movies_{end_movie_index-len(movie_data)+1}-{end_movie_index}.csv"
    save_to_csv(movie_data, checkpoint_folder, checkpoint_filename)

pbar.close()
print("Scraping complete. Data saved.")


Scraping movies:   0%|          | 0/100 [00:00<?, ?movie/s]

Scraping movies: 152movie [05:50,  2.31s/movie]                      

Scraping complete. Data saved.



