In [4]:
# Import packages here
import pandas as pd
import os
import opendatasets as od
from datetime import datetime

# Download NLTK tokenizer data (if not already downloaded)
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AliPe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Kaggle URL of the TMDB movies dataset. od.download() fetches the archive;
# per the captured cell output it lands in a folder named
# 'tmdb-movies-dataset-2023-930k-movies' under the current working directory.
# NOTE(review): presumably needs Kaggle API credentials on first run - confirm.
dataset = 'https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies/data'
od.download(dataset)

Dataset URL: https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies
Downloading tmdb-movies-dataset-2023-930k-movies.zip to .\tmdb-movies-dataset-2023-930k-movies


100%|██████████| 189M/189M [00:13<00:00, 15.0MB/s] 





In [6]:
# Function to assign data types to DataFrame columns
def assign_data_types(df):
    '''
    Assign specified data types to DataFrame columns.

    The frame is modified in place and the same object is returned. Columns
    from the mapping that are not present in the frame are skipped, and the
    same applies to 'release_date'.

    Missing values in text columns stay missing (they are NOT turned into the
    literal strings 'nan' / 'None'), so isna() / dropna() keep working on the
    result.
    
    Parameters:
        df (DataFrame): Input DataFrame.
        
    Returns:
        DataFrame: DataFrame with assigned data types (the object passed in).

    Raises:
        ValueError: If a column mapped to int contains missing values, or a
            'release_date' value does not match the '%Y-%m-%d' format.
    '''
    # Define mapping of column names to desired data types
    column_types = {
        'id': int,
        'title': str,
        'vote_average': float,
        'vote_count': int,
        'status': str,
        'revenue': float,
        'runtime': int,
        'adult': bool,
        'backdrop_path': str,
        'budget': float,
        'homepage': str,
        'imdb_id': str,
        'original_language': str,
        'original_title': str,
        'overview': str,
        'popularity': float,
        'poster_path': str,
        'tagline': str,
        'genres': str,
        'production_companies': str,
        'production_countries': str,
        'spoken_languages': str,
        'release_year': int
    }
    
    # Apply specified data types to DataFrame columns
    for column, data_type in column_types.items():
        if column not in df.columns:
            continue

        if data_type is str:
            # A plain astype(str) converts NaN/None into the text 'nan'/'None',
            # which hides missing values from every later null check. Cast the
            # column, then mask the originally-missing positions back to NaN.
            missing = df[column].isna()
            df[column] = df[column].astype(str).mask(missing)
        else:
            df[column] = df[column].astype(data_type)
    
    # release_date is handled separately because it needs to_datetime rather
    # than astype; guard it like every other column so a frame without it
    # does not raise KeyError.
    if 'release_date' in df.columns:
        df['release_date'] = pd.to_datetime(df['release_date'], format='%Y-%m-%d')

    return df

# Function to tokenize overview into sentences
def tokenize_into_sentences(overview):
    '''
    Split an overview into its sentences with NLTK's sent_tokenize.

    Parameters:
        overview (str): Overview text to split. Any non-string value
            (for example a missing value) is accepted as well.

    Returns:
        list: The sentences found in the overview, or an empty list when
            the input is not a string.
    '''
    # Guard clause: anything that is not text has no sentences.
    if not isinstance(overview, str):
        return []

    return sent_tokenize(overview)

# Define the main processing pipeline function
def process_movie_data(df, drop_na=False):
    '''
    Process movie data by applying data type assignment, filtering, tokenization, and date filtering.

    Steps, in order:
        1. Copy the input and assign column data types.
        2. Keep only rows whose 'status' is 'Released'.
        3. Drop the 'backdrop_path', 'homepage' and 'poster_path' columns.
        4. Optionally drop rows that still contain missing values.
        5. Tokenize 'overview' into a list of sentences.
        6. Split the comma-separated text columns into lists.
        7. Keep rows released between 1900-01-01 and today (inclusive).
        8. Sort by release date, oldest first.
    
    Parameters:
        df (DataFrame): Input DataFrame containing movie data. It is not modified.
        drop_na (bool): When True, rows with a missing value in any of the
            retained columns are removed. Defaults to False.
        
    Returns:
        DataFrame: Processed DataFrame with cleaned and filtered movie data.
    '''
    # Work on a copy so the caller's frame is never touched, and use the
    # returned frame rather than relying on in-place mutation.
    typed_df = assign_data_types(df.copy())

    # Filter for movies with 'status' == 'Released' and discard the columns
    # that are not needed downstream. errors='ignore' tolerates inputs that
    # already lack some of them.
    released_movies_df = (
        typed_df[typed_df['status'] == 'Released']
        .drop(columns=['backdrop_path', 'homepage', 'poster_path'], errors='ignore')
    )

    # Filter for NA values. This runs AFTER the column drop on purpose: a row
    # must not be thrown away because of a missing value in a column that is
    # discarded anyway.
    if drop_na:
        released_movies_df = released_movies_df.dropna()

    # Apply sentence tokenization to 'overview' column
    released_movies_df['overview'] = released_movies_df['overview'].apply(tokenize_into_sentences)

    # Process 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages' columns
    columns_to_split = ['tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages']
    for col in columns_to_split:
        released_movies_df[col] = released_movies_df[col].apply(
            lambda x: x.split(', ') if isinstance(x, str) else []
        )

    # Define start and end dates for filtering
    start_date = '1900-01-01'
    end_date = datetime.now().strftime('%Y-%m-%d')

    # Filter DataFrame between start_date and end_date (both inclusive).
    # Rows without a release date compare False on both sides and are dropped.
    release_dates = released_movies_df['release_date']
    in_range = (release_dates >= start_date) & (release_dates <= end_date)
    date_filtered_released_df = released_movies_df[in_range]

    # Return data sorted by release date in ascending order
    return date_filtered_released_df.sort_values(by=['release_date'], ascending=[True])

# Function to write the processed DataFrame to CSV
def write_processed_movie_data(df, processed_csv_path):
    '''
    Save the processed DataFrame as a UTF-8 CSV file without the index.

    Parameters:
        df (DataFrame): Processed DataFrame to save.
        processed_csv_path (str): Destination of the CSV file.
    '''
    # CSV is plain text: column dtypes are not stored, and list-valued cells
    # are written as their string representation.
    csv_options = {'index': False, 'encoding': 'utf-8'}
    df.to_csv(processed_csv_path, **csv_options)

# Function to read the processed DataFrame from CSV
def read_clean_movie_data(processed_csv_path, parse_list_columns=False):
    '''
    Read processed DataFrame from CSV file with specified data types.

    The processing pipeline stores 'overview', 'tagline', 'genres',
    'production_companies', 'production_countries' and 'spoken_languages' as
    Python lists, which a CSV file can only hold as text such as
    "['Action', 'Drama']". By default those columns are returned as that text;
    pass parse_list_columns=True to turn them back into lists.
    
    Parameters:
        processed_csv_path (str): Path to the processed CSV file.
        parse_list_columns (bool): When True, convert the list-valued columns
            from their text form back into Python lists. Missing cells become
            empty lists; cells that are not a valid list literal are left
            unchanged. Defaults to False.
        
    Returns:
        DataFrame: Processed DataFrame read from CSV with assigned data types
            and 'release_date' parsed as datetime.
    '''
    # Local import: only this function needs it.
    import ast

    # Define the data types mapping based on the processing pipeline.
    # 'release_date' is deliberately absent: it is handled by parse_dates
    # below, and also declaring it as str would contradict that.
    column_types = {
        'id': int,
        'title': str,
        'vote_average': float,
        'vote_count': int,
        'status': str,
        'revenue': float,
        'runtime': int,
        'adult': bool,
        'budget': float,
        'imdb_id': str,
        'original_language': str,
        'original_title': str,
        'overview': str,
        'popularity': float,
        'tagline': str,
        'genres': str,
        'production_companies': str,
        'production_countries': str,
        'spoken_languages': str,
        'release_year': int
    }

    # Columns the pipeline converts to lists before writing.
    list_columns = [
        'overview', 'tagline', 'genres',
        'production_companies', 'production_countries', 'spoken_languages'
    ]

    def _to_list(cell):
        # Rebuild a list from its text form; never raises on odd input.
        if not isinstance(cell, str):
            return []
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            return cell
        return parsed if isinstance(parsed, list) else cell
    
    # Read the CSV file back into a DataFrame with specified data types and parse dates
    movies = pd.read_csv(
        processed_csv_path,
        dtype=column_types,
        parse_dates=['release_date'],
        encoding='utf-8'
    )

    if parse_list_columns:
        for column in list_columns:
            if column in movies.columns:
                movies[column] = movies[column].apply(_to_list)

    return movies


In [7]:
# Read the CSV movie data into a new DataFrame.
# The path is built with os.path.join instead of a hard-coded backslash: the
# original literal contained '\T', which is an invalid escape sequence in a
# normal string, and a backslash separator only works on Windows.
movie_df = pd.read_csv(
    os.path.join('tmdb-movies-dataset-2023-930k-movies', 'TMDB_movie_dataset_v11.csv')
)

# Apply the main processing pipeline to your movie dataframe
processed_movie_df = process_movie_data(movie_df, drop_na=True)

# Write the processed movie data frame into a .csv file
write_processed_movie_data(processed_movie_df, 'processed_movie_data.csv')

In [11]:
# Read the processed movie dataframe back from disk.
# NOTE(review): this rebinds processed_movie_df. The frame produced by
# process_movie_data holds lists in 'overview', 'tagline', 'genres',
# 'production_companies', 'production_countries' and 'spoken_languages',
# whereas read_clean_movie_data declares those columns as str, so after this
# cell they hold text.
# NOTE(review): the execution count jumps from 7 to 11 - confirm the notebook
# still runs top to bottom on a fresh kernel.
processed_movie_df = read_clean_movie_data('processed_movie_data.csv')