# imdb_2.csv

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Absolute path to the source CSV (imdb_2.csv).
# NOTE(review): this is a machine-specific location, so the cell only runs
# where this exact folder exists.
file_path = '/Users/chiaraferrara/Desktop/Unipi/Magistrale/DM 2/dm2_project/dm2_dataset_2425_imdb/imdb_2.csv'

# Read the whole CSV into a pandas DataFrame bound to `df`.
df = pd.read_csv(file_path)

In [18]:
df.columns

Index(['startYear', 'runtimeMinutes', 'totalCredits', 'canHaveEpisodes',
       'numRegions', 'ratingCount', 'castNumber', 'companiesNumber',
       'averageRating', 'writerCredits', 'directorsCredits',
       'totalNominations', 'totalMedia', 'totalReviews', 'Asia', 'Africa',
       'Europe', 'North America', 'South America', 'Australia',
       'Continent Unknown', 'genre1', 'genre2', 'genre3', 'movie', 'short',
       'tvEpisode', 'tvMiniSeries', 'tvMovie', 'tvSeries', 'tvShort',
       'tvSpecial', 'video', 'videoGame'],
      dtype='object')

In [19]:
# deltaCredits: the part of totalCredits that is not accounted for by cast,
# writer and director credits. Missing values propagate, as with plain `+`/`-`.
named_credits = df['castNumber'].add(df['writerCredits']).add(df['directorsCredits'])
df['deltaCredits'] = df['totalCredits'].sub(named_credits)

In [20]:
# Columns scheduled for removal: the credit counts that were used to build
# 'deltaCredits'.
columns_to_drop = [
    'totalCredits',
    'castNumber',
    'writerCredits',
    'directorsCredits',
]

In [21]:
# Remove the columns listed in `columns_to_drop` and rebind `df` to the result.
df = df.drop(columns_to_drop, axis=1)

In [22]:
df.columns

Index(['startYear', 'runtimeMinutes', 'canHaveEpisodes', 'numRegions',
       'ratingCount', 'companiesNumber', 'averageRating', 'totalNominations',
       'totalMedia', 'totalReviews', 'Asia', 'Africa', 'Europe',
       'North America', 'South America', 'Australia', 'Continent Unknown',
       'genre1', 'genre2', 'genre3', 'movie', 'short', 'tvEpisode',
       'tvMiniSeries', 'tvMovie', 'tvSeries', 'tvShort', 'tvSpecial', 'video',
       'videoGame', 'deltaCredits'],
      dtype='object')

In [23]:
# One-hot indicator columns, one per title type.
onehot_cols = [
    'movie', 'short', 'tvEpisode', 'tvMiniSeries', 'tvMovie',
    'tvSeries', 'tvShort', 'tvSpecial', 'video', 'videoGame',
]

# Collapse the indicators into a single label: for every row, take the name of
# the column holding the largest value (i.e. the flag that is set to 1).
# NOTE(review): idxmax returns the first column on ties, so a row with no flag
# set would be labelled 'movie' - confirm every row has exactly one flag.
df['titleType'] = df.loc[:, onehot_cols].idxmax(axis='columns')

# The indicator columns are no longer needed once 'titleType' exists.
df = df.drop(onehot_cols, axis=1)

In [24]:
# Hold out 30% of the rows as a test set. The split is stratified on
# 'titleType' and uses a fixed seed so that it can be reproduced.
title_type_labels = df['titleType']
train_df, test_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=title_type_labels
)

In [28]:
# The splits are written to the folder that contains the input file.
output_dir = os.path.split(file_path)[0]

# Destination files for the two splits.
train_path, test_path = (
    os.path.join(output_dir, file_name)
    for file_name in ('train70_clf.csv', 'test30_clf.csv')
)

# Persist both splits as CSV, without the DataFrame index.
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

# Report where the files were written.
print(
    f"Train set saved to: {train_path}",
    f"Test set saved to: {test_path}",
    sep="\n",
)

Train set saved to: /Users/chiaraferrara/Desktop/Unipi/Magistrale/DM 2/dm2_project/dm2_dataset_2425_imdb/train70_clf.csv
Test set saved to: /Users/chiaraferrara/Desktop/Unipi/Magistrale/DM 2/dm2_project/dm2_dataset_2425_imdb/test30_clf.csv


# Preprocessed_full

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Absolute path to the source CSV (preprocessed_full.csv).
# NOTE(review): this is a machine-specific location, so the cell only runs
# where this exact folder exists.
file_path = '/Users/chiaraferrara/Desktop/Unipi/Magistrale/DM 2/dm2_project/dm2_dataset_2425_imdb/preprocessed_full.csv'

# Read the whole CSV into a pandas DataFrame bound to `df`.
df = pd.read_csv(file_path)




In [10]:
df.columns

Index(['originalTitle', 'rating', 'startYear', 'endYear', 'runtimeMinutes',
       'awardWins', 'numVotes', 'worstRating', 'bestRating', 'totalImages',
       'totalVideos', 'totalCredits', 'criticReviewsTotal', 'titleType',
       'awardNominationsExcludeWins', 'canHaveEpisodes', 'isRatable',
       'isAdult', 'numRegions', 'userReviewsTotal', 'ratingCount',
       'countryOfOrigin', 'genres', 'castNumber', 'companiesNumber',
       'averageRating', 'regions', 'externalLinks', 'writerCredits',
       'directorsCredits', 'soundMixes', 'quotesTotal', 'totalMedia',
       'totalNominations', 'deltaCredits', 'regions_freq_enc', 'regions_EU',
       'regions_NA', 'regions_AS', 'regions_AF', 'regions_OC', 'regions_SA',
       'regions_UNK', 'countryOfOrigin_freq_enc', 'countryOfOrigin_NA',
       'countryOfOrigin_AF', 'countryOfOrigin_AS', 'countryOfOrigin_EU',
       'countryOfOrigin_OC', 'countryOfOrigin_SA', 'countryOfOrigin_UNK',
       'reviewsTotal'],
      dtype='object')

Feature Engineering

In [11]:
# totalMedia: row-wise sum of the image, video, quote and external-link counts
# (DataFrame.sum skips missing values by default).
media_columns = ['totalImages', 'totalVideos', 'quotesTotal', 'externalLinks']
df['totalMedia'] = df[media_columns].sum(axis=1)

# deltaCredits: the part of totalCredits that is not accounted for by cast,
# writer and director credits.
named_credits = df['castNumber'].add(df['writerCredits']).add(df['directorsCredits'])
df['deltaCredits'] = df['totalCredits'].sub(named_credits)

# totalNominations: award wins plus nominations that did not win.
df['totalNominations'] = df['awardWins'].add(df['awardNominationsExcludeWins'])

# reviewsTotal: user reviews plus critic reviews.
df['reviewsTotal'] = df['userReviewsTotal'].add(df['criticReviewsTotal'])

# Columns scheduled for removal: the inputs of the aggregates above, plus
# 'bestRating', 'worstRating' and 'isRatable'.
columns_to_drop = [
    *media_columns,
    'totalCredits', 'castNumber', 'writerCredits', 'directorsCredits',
    'awardWins', 'awardNominationsExcludeWins',
    'userReviewsTotal', 'criticReviewsTotal',
    'bestRating', 'worstRating', 'isRatable',
]

In [12]:
# Remove the columns listed in `columns_to_drop` and rebind `df` to the result.
df = df.drop(columns_to_drop, axis=1)

In [13]:
df[['countryOfOrigin_freq_enc', 'countryOfOrigin_NA', 'countryOfOrigin_AF',
    'countryOfOrigin_AS', 'countryOfOrigin_EU', 'countryOfOrigin_OC', 'countryOfOrigin_SA', 'countryOfOrigin_UNK' ]]

Unnamed: 0,countryOfOrigin_freq_enc,countryOfOrigin_NA,countryOfOrigin_AF,countryOfOrigin_AS,countryOfOrigin_EU,countryOfOrigin_OC,countryOfOrigin_SA,countryOfOrigin_UNK
0,0.399732,1,0,0,0,0,0,0
1,0.046115,0,0,0,1,0,0,0
2,0.399732,1,0,0,0,0,0,0
3,0.399732,1,0,0,0,0,0,0
4,0.046115,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...
149526,0.001417,0,0,0,0,0,1,0
149527,0.017834,0,0,0,1,0,0,0
149528,0.000000,0,0,0,0,0,0,0
149529,0.000000,0,0,0,0,0,0,0


In [14]:
# 70/30 train/test partition, stratified on the 'titleType' column and seeded
# so that the same rows land in each partition on every run.
split_options = dict(test_size=0.3, stratify=df['titleType'], random_state=42)
train_df, test_df = train_test_split(df, **split_options)



In [17]:
#Imputation of runtimeMinutes missing values 

from typing import Callable
def impute_runtime_minutes(df: pd.DataFrame, perc: float | None=0.99) -> Callable[[pd.DataFrame], pd.Series]:
    """
    Impute missing values in the 'runtimeMinutes' column of the given DataFrame.
    Assigns to missing values randomly sampled data out of the central perc% range.
    Also imputes the values for rows outside the perc range if not None.
    Imputation is done separately for each 'titleType' category.

    Parameters:
        df (pd.DataFrame): The DataFrame to impute.
        
        perc (float | None): The central percentile range for imputing values. Default is 0.9.

    Returns:
        Callable[[pd.DataFrame], pd.Series]: A function that takes a DataFrame and returns the imputed 'runtimeMinutes' column.
    """
    # If perc is not None, calculate the lower and upper bounds for the central perc% range
    if perc is not None:
        lower_bound = (1 - perc) / 2
        upper_bound = 1 - lower_bound
        perc_threshold = df.groupby('titleType')['runtimeMinutes'].quantile([lower_bound, upper_bound]).unstack()

    # Define the percentiles for each titleType category, while cutting off outliers
    percentiles = df.groupby('titleType')['runtimeMinutes'].quantile([0.3, 0.7]).unstack()

    def impute_rt_mins(df: pd.DataFrame) -> pd.Series:
        """
        Impute missing values in the 'runtimeMinutes' column of the given DataFrame.
        Assigns to missing values randomly sampled data out of the central perc% range.
        Imputation is done separately for each 'titleType' category.
        Parameters:
            df (pd.DataFrame): The DataFrame to impute.
        Returns:
            pd.Series: The imputed 'runtimeMinutes' column.
        """
        # Group the data by 'titleType'
        groups = df.groupby('titleType')['runtimeMinutes']

        # Create a copy of the original column to preserve order
        imputed_runtime = df['runtimeMinutes'].copy()

        # Iterate over each group and impute missing values
        for title_type, group in groups:
            lower = percentiles.loc[title_type, 0.3]
            upper = percentiles.loc[title_type, 0.7]

            # Get valid values within the 30-70 percentile range
            valid_values = group[(group >= lower) & (group <= upper)].dropna()

            # Filter the group to include only rows within the central perc% range
            if perc is not None:
                central_lower = perc_threshold.loc[title_type, lower_bound]
                central_upper = perc_threshold.loc[title_type, upper_bound]
                valid_values = valid_values[(valid_values >= central_lower) & (valid_values <= central_upper)]

            for index in group.index:
                # If the value is outside the central perc% range, assign a random sample from the valid values
                if group[index] < lower or group[index] > upper:
                    imputed_runtime.loc[index] = valid_values.sample(n=1, replace=True, random_state=42).values[0]

            # Sample values for missing entries
            missing_count = group.isna().sum()
            if missing_count > 0:
                sampled_values = valid_values.sample(n=missing_count, replace=True, random_state=42)
                # Assign sampled values to the missing positions
                imputed_runtime.loc[group.index[group.isna()]] = sampled_values.values

        return imputed_runtime
    
    return impute_rt_mins

In [18]:
# Step 1: Fit the imputer on training data only, so that no information from
# the test split leaks into the thresholds
runtime_imputer = impute_runtime_minutes(train_df)

# Step 2: Apply to train
# `assign` builds a new DataFrame instead of writing a column into the frame
# returned by train_test_split, which avoids pandas' SettingWithCopyWarning
train_df = train_df.assign(runtimeMinutes=runtime_imputer(train_df))

# Step 3: Apply to test (using the thresholds fitted on train)
test_df = test_df.assign(runtimeMinutes=runtime_imputer(test_df))

In [None]:
# # Determine the output directory (same as input file)
# output_dir = os.path.dirname(file_path)

# # Define output file paths
# train_path = os.path.join(output_dir, 'train70_clf.csv')
# test_path = os.path.join(output_dir, 'test30_clf.csv')

# # Save the train and test splits as CSV files
# train_df.to_csv(train_path, index=False)
# test_df.to_csv(test_path, index=False)

# # Confirm save paths
# print(f"Train set saved to: {train_path}")
# print(f"Test set saved to: {test_path}")

Train set saved to: /Users/chiaraferrara/Desktop/Unipi/Magistrale/DM 2/dm2_project/dm2_dataset_2425_imdb/train70_clf.csv
Test set saved to: /Users/chiaraferrara/Desktop/Unipi/Magistrale/DM 2/dm2_project/dm2_dataset_2425_imdb/test30_clf.csv
