In [1]:
import os

import pandas as pd

from const import *

# Load the genres table; the first CSV column becomes the row index.
genre_df = pd.read_csv(
    os.path.join(DATA_DIR, "genres.csv"),
    index_col=0,
)

In [2]:
# Build a 3-sigma band around the mean number of genre rows per ID; the band
# serves as the outlier threshold.
genre_counts = genre_df.groupby(ID).count()
per_id_counts = genre_counts[GENRE]
count_mean = per_id_counts.mean()
count_std = per_id_counts.std()

three_sigma = 3 * count_std
count_range = (count_mean - three_sigma, count_mean + three_sigma)
count_range

(-0.9036903575613164, 4.00647881789528)

In [3]:
# Keep the IDs whose genre count lies strictly below the upper bound of the
# band, then display how many IDs were discarded as outliers.
below_upper_bound = genre_counts[GENRE] < count_range[1]
filtered_movies = genre_counts[below_upper_bound]
genre_counts.shape[0] - filtered_movies.shape[0]

3827

In [4]:
# Restrict the genre table to IDs that appear both in valid_movies.csv and in
# the outlier-filtered counts.
valid_movies = pd.read_csv("valid_movies.csv", index_col=0)
shared_ids = set(valid_movies[MOVIE_ID]) & set(filtered_movies.index)
cleaned_movies = pd.Series(sorted(shared_ids))
cleaned_genre_df = genre_df[genre_df.index.isin(cleaned_movies)]

# One indicator column per genre, collapsed to a single row per ID.
# NOTE(review): with prefix="" and the default separator the indicator columns
# presumably come out as "_<genre>" — confirm downstream readers expect that.
genre_dummies = pd.get_dummies(cleaned_genre_df, columns=[GENRE], prefix="")
encoded_df = genre_dummies.groupby(ID).any()

# Row-wise sum of the indicator columns = number of genres for that ID.
encoded_df["genre_count"] = encoded_df.sum(axis=1)
encoded_df.index.name = MOVIE_ID
encoded_df.to_csv("encoded_genres.csv")

In [19]:
import plotly.express as px

# Number of rows per genre among the cleaned movies.
valid_genre_counts = cleaned_genre_df[GENRE].value_counts()

# Pie chart of the genre shares.
fig_pie = px.pie(
    valid_genre_counts,
    values=valid_genre_counts.values,
    names=valid_genre_counts.index,
    title="Genre Distribution of valid movies",
)

# Centre the title and size the figure so the horizontal legend underneath
# the pie is shown in full.
fig_pie.update_layout(
    title_x=0.5,
    legend={"orientation": "h", "yanchor": "bottom", "x": 0.09, "y": -0.2},
    width=600,
    height=800,
)

fig_pie.show()

In [39]:
# Group by the number of genres and count rows per group. count() reports a
# per-column non-null count, so only the key and the first counted column are
# kept, and that column is renamed to "count".
per_genre_count = encoded_df.groupby("genre_count", as_index=False).count()
summed_df = per_genre_count.iloc[:, :2]
first_counted_column = summed_df.columns[1]
summed_df = summed_df.rename(columns={first_counted_column: "count"})

# Pie chart: one slice per genre_count value, sized by the number of movies.
valid_genre_counts_count_grouped_pie = px.pie(
    summed_df,
    values=summed_df["count"],
    names=summed_df["genre_count"],
    title="Groups of no. of genres for valid movies",
)

# Centre the title and place the horizontal legend below the chart.
valid_genre_counts_count_grouped_pie.update_layout(
    title_x=0.5,
    legend={"orientation": "h", "yanchor": "bottom", "x": 0.212525, "y": -0.5},
    width=600,
    height=600,
)
valid_genre_counts_count_grouped_pie.show()

In [59]:
import plotly.express as px

# Draw one pie chart per genre-count group, showing how often each genre
# occurs among the movies that have exactly that many genres.
for n_genres in [1, 2, 3, 4]:
    # Movies with exactly `n_genres` genres.
    movies_with_n = encoded_df[encoded_df["genre_count"] == n_genres]

    # Per-genre totals for this group.
    # Stored under its own name so the per-movie `genre_counts` frame used by
    # the outlier cells is not overwritten (re-running those cells after this
    # one would otherwise fail), and the helper column is dropped by name
    # rather than by its position in the frame.
    genre_totals = movies_with_n.drop(columns="genre_count").sum()

    # Plot the pie chart
    fig = px.pie(
        genre_totals,
        values=genre_totals.values,
        names=genre_totals.index,
        title=f"Genre Distribution for {n_genres} Genres",
    )

    # Size the figure so the horizontal legend below the pie is fully visible
    fig.update_layout(
        legend=dict(orientation="h", yanchor="bottom", x=0.09, y=-0.2),
        width=600,
        height=800,
    )

    fig.show()

# Movies by decade

In [48]:
import pandas as pd

VM = pd.read_csv("valid_movies.csv", index_col=0)

# Parse movies.csv a single time and derive both lookups from the same frame
# instead of reading the file twice.
movies_raw = pd.read_csv("data/movies.csv", index_col=0)
# Third column of the CSV (position 1 once the first column is the index);
# the query below refers to it as `date`.
date = movies_raw.iloc[:, [1]]
ratings = movies_raw["rating"]

countries = pd.read_csv("data/countries.csv", index_col=0)

# Attach the date to every valid movie, drop rows with any missing value and
# keep only dates between 1950 and 2024 (inclusive).
VM_dated = (
    VM.merge(date, left_on="movie_id", right_on="id", how="left")
    .dropna()
    .query("1950 <= date <= 2024")
)

# Attach the country rows and again drop rows with any missing value.
VM_dated_countries = VM_dated.merge(
    countries, left_on="movie_id", right_on="id", how="left"
).dropna()

VM_final = VM_dated_countries

# Same table with the rating attached; rows without a rating are dropped.
VM_final_ratings = VM_dated_countries.merge(
    ratings, left_on="movie_id", right_on="id", how="left"
).dropna()


In [74]:
import pandas as pd
import numpy as np
import shutil

# Hong Kong movies, with and without the rating filter (rating >= 3.0).
hk_movies = VM_final.query("country == 'Hong Kong'")
hk_movies_ratings = VM_final_ratings.query("country == 'Hong Kong'").query("rating >= 3.0")

# NOTE(review): the 1950s-1970s frames come from the unfiltered table while
# later decades use the rating-filtered one — confirm this is intentional.
hk_movies_1950s = hk_movies.query("1950 <= date <= 1959")
hk_movies_1960s = hk_movies.query("1960 <= date <= 1969")
hk_movies_1970s = hk_movies.query("1970 <= date <= 1979")
hk_movies_1980s = hk_movies_ratings.query("1980 <= date <= 1989")
hk_movies_1990s = hk_movies_ratings.query("1990 <= date <= 1999")
hk_movies_2000s = hk_movies_ratings.query("2000 <= date <= 2009")
hk_movies_2010s = hk_movies_ratings.query("2010 <= date <= 2019")
hk_movies_2020s = hk_movies_ratings.query("2020 <= date <= 2024")

# Explicit decade -> frame lookup; replaces resolving variable names through
# globals(), so a typo in `decades` fails with a clear KeyError on this dict.
hk_movies_by_decade = {
    "1950s": hk_movies_1950s,
    "1960s": hk_movies_1960s,
    "1970s": hk_movies_1970s,
    "1980s": hk_movies_1980s,
    "1990s": hk_movies_1990s,
    "2000s": hk_movies_2000s,
    "2010s": hk_movies_2010s,
    "2020s": hk_movies_2020s,
}

# Set the random seed for reproducibility (pandas' sample() draws from the
# global NumPy state when no random_state is given)
np.random.seed(4563)

# Sampled movie_ids for each processed decade
sampled_movie_ids = {}

# Decades to process
decades = ["2010s"]

# Upper limit on the number of posters copied per decade
samples_per_decade = 5

for decade in decades:
    decade_dir = os.path.join("by_decades_movies", decade)
    os.makedirs(decade_dir, exist_ok=True)

    # Get the corresponding dataframe for the decade
    df = hk_movies_by_decade[decade]

    # Randomly sample movie_ids; cap the sample size at the number of rows so
    # a sparsely populated decade does not raise a ValueError.
    n_to_sample = min(samples_per_decade, len(df))
    sampled_ids = df["movie_id"].sample(n=n_to_sample).tolist()

    # Store the sampled movie_ids in the dictionary
    sampled_movie_ids[decade] = sampled_ids

    # Copy the posters of the sampled movies into the decade's directory
    for movie_id in sampled_ids:
        shutil.copy(
            f"data/posters/{movie_id}.jpg",
            os.path.join(decade_dir, f"{movie_id}.jpg"),
        )

    



In [110]:
# Build the indicator-encoded genre table for the movies that appear in
# VM_dated_countries and write it to VM_encoded_genres.csv.
genre_df = pd.read_csv("data/genres.csv", index_col=0)
in_vm = genre_df.index.isin(VM_dated_countries["movie_id"])
VM_genre = genre_df[in_vm]

# One indicator column per genre, collapsed to a single row per id.
VM_genre_dummies = pd.get_dummies(VM_genre, columns=["genre"], prefix="")
VM_encoded_df = VM_genre_dummies.groupby("id").any()

# Row-wise sum of the indicator columns = number of genres for that id.
VM_encoded_df["genre_count"] = VM_encoded_df.sum(axis=1)
VM_encoded_df.index.name = "id"
VM_encoded_df.to_csv("VM_encoded_genres.csv")