In [2]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('letterbxd_preprocessed.csv')

(All of the following data frame analyses apply to the more well known movies on the letterbxd app and may not apply to the film industry at large)

In [4]:
df.head(10)

Unnamed: 0,Movie Name,Genre,Studio,Country,Primary Language,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
0,Barbie,Comedy,LuckyChap Entertainment,UK,English,2023,3.86,2980018,26000,3658001,448304,1586062,114
1,Parasite,Comedy,Barunson E&A,South Korea,Korean,2019,4.56,2844133,90000,3801915,521089,2176138,133
2,Everything Everywhere All at Once,Science Fiction,IAC Films,USA,English,2022,4.3,2345715,130000,2908165,494949,1473174,140
3,Fight Club,Drama,Fox 2000 Pictures,Germany,English,1999,4.27,2568765,114000,3726744,415981,1719180,139
4,La La Land,Drama,Summit Entertainment,Hong Kong,English,2016,4.09,2285536,204000,3199145,462742,1422472,129
5,Oppenheimer,Drama,Syncopy,UK,English,2023,4.23,2163845,31000,2611340,472979,1046979,181
6,Interstellar,Science Fiction,Legendary Pictures,UK,English,2014,4.35,2445086,194000,3493990,445197,1621993,169
7,Joker,Crime,Warner Bros. Pictures,Canada,English,2019,3.84,2646583,23000,3802316,290806,1552543,122
8,Dune,Science Fiction,Legendary Pictures,USA,English,2021,3.9,2198100,22000,2847537,335566,1059535,155
9,Spider-Man: Into the Spider-Verse,Adventure,Columbia Pictures,USA,English,2018,4.42,2445436,76000,3415337,444758,1757390,117


These are the top 10 films of our data set. Our data set has been sorted based on popularity so these are our top 10 most popular films. We can see that all of them are American films except for Parasite, which is Korean. We can see that nearly all of them have been released in the 21st century except for Fight Club which is still close to it. All of them have higher than average ratings, with the non-English movie having the highest one. All of the studios are famous studios with plenty of other movies. The durations are close to the average and only oppenheimer here is an outlier. We have an extremely high number of people who've watched, liked, and rated the film. The films are on many different lists and have a high number of fans because they are popular shows.

(Sadly the countries in the website are alphabetically ordered therefore the countries may have incorrect data as we can see here. The same does **not** apply to any of the other categorical columns)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Movie Name          5782 non-null   object 
 1   Genre               5782 non-null   object 
 2   Studio              5782 non-null   object 
 3   Country             5782 non-null   object 
 4   Primary Language    5782 non-null   object 
 5   Release Year        5782 non-null   int64  
 6   Avg Rating          5782 non-null   float64
 7   Raters              5782 non-null   int64  
 8   Fans                5782 non-null   int64  
 9   Watched             5782 non-null   int64  
 10  Lists               5782 non-null   int64  
 11  Likes               5782 non-null   int64  
 12  Duration (minutes)  5782 non-null   int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 587.4+ KB


As we can see we've correctly loaded the data set. We have around 5.7K movies with no null columns, 5 categorical columns, 1 float column and 6 integer columns. Only average rating is ordinal

In [6]:
df.describe()

Unnamed: 0,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
count,5782.0,5782.0,5782.0,5782.0,5782.0,5782.0,5782.0,5782.0
mean,2001.329644,3.370865,152057.8,2011.408682,247301.7,38644.395192,61446.24,109.38862
std,20.439581,0.588133,277326.0,8155.814601,440356.5,54184.983922,148833.1,24.411204
min,1911.0,0.88,1408.0,0.0,1903.0,2585.0,554.0,41.0
25%,1992.0,3.01,27391.25,64.0,42380.0,10916.25,7722.0,95.0
50%,2007.0,3.46,55420.0,221.0,89759.5,19220.5,16043.0,105.0
75%,2017.0,3.81,137116.2,852.75,229417.8,40741.75,44206.25,120.0
max,2024.0,4.64,2980018.0,204000.0,3802316.0,521089.0,2176138.0,743.0


We can see the mean and standard deviation for many different films, which can help us see if a movie has above or below average statistics and whether or not they are an outlier.

In [7]:
def plot_features(df, title_test_ = 'Feature Plots', top_n=10, nrows=4, ncols=4):
    fig = make_subplots(rows=nrows, cols=ncols, subplot_titles=[col for col in df.columns]) #, horizontal_spacing=0.1, vertical_spacing=0.15
    row, col = 1, 1

    for column in df.columns:
        if df[column].dtype == 'object' or df[column].dtype == 'category':
            top_10 = df[column].value_counts().nlargest(top_n)
            fig.add_trace(
                go.Bar(x=top_10.values, y=top_10.index, orientation='h', text=top_10.values, textposition='auto'),
                row=row, col=col
            )
            fig.update_xaxes(title_text='Frequency', row=row, col=col)
            fig.update_yaxes(title_text=column, row=row, col=col)
        else:
            fig.add_trace(
                go.Histogram(x=df[column], nbinsx=50, name=column),
                row=row, col=col
            )
            fig.update_xaxes(title_text=column, row=row, col=col)
            fig.update_yaxes(title_text='Frequency', row=row, col=col)

        if col == ncols:
            row += 1
            col = 1
        else:
            col += 1

    fig.update_layout(
        height=1250, width=2000, title_text=title_test_,
    )
    fig.show()


In [8]:
plot_features(df)

We'll go over each feature in a list:

1. **Movie Name:** We can see that some movies have the same name. They seem to be very famous movies that have been redone multiple times and/or popular (European) folktales. But even then we barely have any duplicates of the same name. Note that they are different movies as we'll see below they only have the same name.

1. **Genre:** We can see that the most popular film genre in Letterbxd is Drama by a landslide. Then, comedy is the second most popular genre by a landslide yet again. Other genres have less steep differences. 
1. **Studio:** Famous studios have made many of the movies we see on the app today. A single studio only has 0.5% of the movies here at most though.

1. **Country:** The vast majority of movies in our data set are American movies, which makes sense given how this is an English App. We most likely have even more American movies in our data frame due to the alphabet ordering problem we described earlier, as "U" and "S" are the later letters of the alphabet. Even our top 5 has two other English speaking countries: Canada and the UK. The top 3 non-English speaking countries are France, Japan, and Germany.

1. **Primary Language:** We can once again see the English domination in our data set. Other languages do not even come close to the English language. Interestingly we have no spoken language films which we will analyze more later

1. **Release Year:** Our films are mostly new releases, most even being from the 2010s and 2020s. Older movies are less prevalent in our data set. Our release year is thus left skewed.

1. **Average Rating:** our Average rating is slightly right skewed, meaning that people give relatively high ratings to our movies, most likely because we have gathered the more popular films and so they most likely also have higher ratings. Although that isn't always the case as we'll see later, as some people like hate watching movies, meaning they watch movies they know are bad for entertainment. Or it may be a highly divisive movie that some love and some loathe.

1. **Duration:** We have a right skewed data. If we remove the outliers we may see a pseudo normal plot. Our outliers are very large3 in value though.

1. **The rest of the numerical categories:** They all follow a similar pattern. They are all left skewed, they all have lots of data in their first bin, meaning they don't have high statistics. As we see later, this is because they have an extremely high correlation together. Fans is somewhat of an outlier among them, as it has more values in the first bin and much less in other bins.

In [9]:
shaft_movies = df[df['Movie Name'] == 'Shaft']
shaft_movies.head()

Unnamed: 0,Movie Name,Genre,Studio,Country,Primary Language,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
1939,Shaft,Action,Shaft Productions,USA,English,1971,3.44,25698,66,42600,20999,9253,100
3047,Shaft,Crime,Davis Entertainment,USA,English,2019,2.7,32340,35,49688,6731,5886,111
5708,Shaft,Action,Paramount,Germany,English,2000,2.88,27183,9,50391,6562,4920,99


In [10]:
def plot_pairplot(df, color=None):
    numerical_df = df.select_dtypes(include=['number'])
    fig = px.scatter_matrix(numerical_df, color=color)
    fig.update_layout(title="Pair Plot of Numerical Features", height=800, width=800)
    fig.show()

In [11]:
plot_pairplot(df)

Here we see more proof that Watched, Lists, Likes, and Raters are extremely correlated and linear. Fans does have some correlation with the but less so, mainly because of it has moore smaller values. The rest are uncorrelated and don't have interesting shapes we can decipher. Duration seems to have the least correlation as its plots with other numerical values are ver similar to its histogram except with the average rating, where it looks like someone has plotted its histogram upside down.

In [12]:
def plot_numerical_distributions(df, color=None, nrows=3, ncols=3):
    numerical_df = df.select_dtypes(include=['number'])
    num_cols = len(numerical_df.columns)
    rows = nrows if num_cols > nrows * ncols else (num_cols // ncols) + 1
    fig = make_subplots(rows=rows, cols=ncols, subplot_titles=[col for col in numerical_df.columns])
    row, col = 1, 1

    for column in numerical_df.columns:
        violinPlot(df, column, color, fig, row, col)

        if col == ncols:
            row += 1
            col = 1
        else:
            col += 1

    fig.update_layout(height=800, width=1200, title_text='Numerical Feature Distributions')
    fig.show()

def violinPlot(df, column, color, fig, row, col):
    fig.add_trace(
        go.Violin(y=df[column], name=column, box_visible=True, meanline_visible=True, line_color=color),
        row=row, col=col
    )
    fig.update_yaxes(title_text=column, row=row, col=col)

def boxPlot(df, column, color=None):
    box_fig = px.box(df, y=column, color=color)
    box_fig.update_layout(title=f'Box Plot of {column}', yaxis_title=column)
    box_fig.show()


In [13]:
plot_numerical_distributions(df)

We will go over the violin and box plots:

1. **Release Year:** We can see more clearly that the vase majority of our films are from the 21st century, with the outliers starting from 1954. The oldest movie is from 1911.  

1. **Average Rating:** Most movies have high ratings as stated before, with the outliers starting from 1.8. 

1. **Duration:** Similar analyses to before. Duration's somewhat compact and the longest movie's 743 minutes which is a 12 hour movie (!!!) and the outliers start from 158 minutes which is 2 hours and a half

1. **The rest of the numerical categories:** All of them are very compact and thus have a lot of outliers as we can see

In [14]:

def plot_correlation_heatmap(df):
    numeric_df = df.select_dtypes(include=['number'])
    corr_matrix = numeric_df.corr()
    annotations = []
    for x in range(len(corr_matrix.columns)):
        for y in range(len(corr_matrix.columns)):
            annotations.append(
                dict(
                    x=corr_matrix.columns[y],
                    y=corr_matrix.columns[x],
                    text=f"{corr_matrix.values[x, y]:.2f}",
                    showarrow=False,
                    font=dict(color='black')
                )
            )

    fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        colorscale='Viridis',
        colorbar=dict(title='Correlation')
    ))

    # Add annotations
    fig.update_layout(
        title='Correlation Heatmap',
        width=800,
        height=800,
        annotations=annotations
    )

    fig.show()

In [15]:
plot_correlation_heatmap(df)

Watched, Raters, Lists, and Links all are highly correlated as seen before with one another which makes sense, given how a film that has a high number of raters, likes, and is in a lot of lists is a highly watched movie as well. The more curious one correlations is between the average rating and release year, which shows that as the years have gone by, the average rating for a movie has decreased. An interesting lack of correlation is between average rating and other columns, which means that as stated before, a highly rated film isn't necessarily the most well liked or highly watched film and vice versa. We can also see that the release year and duration of the film don't impact the statistics of the movie as much as well (or at least don't do so linearly)

In [16]:
unique_values = {}
for column in df.columns:
    if df[column].dtype != "float64" and df[column].dtype != "int64":
        unique_values[column] = df[column].unique()
        print(f"Unique values in column '{column}':")
        print(unique_values[column])
        print()
    

Unique values in column 'Movie Name':
['Barbie' 'Parasite' 'Everything Everywhere All at\xa0Once' ...
 'The People’s\xa0Joker' 'The Naked\xa0City' 'Where Eagles\xa0Dare']

Unique values in column 'Genre':
['Comedy' 'Science Fiction' 'Drama' 'Crime' 'Adventure' 'Mystery' 'Horror'
 'Action' 'Animation' 'Romance' 'Family' 'Thriller' 'War' 'Fantasy'
 'History' 'Western' 'Music' 'TV Movie']

Unique values in column 'Studio':
['LuckyChap Entertainment' 'Barunson E&A' 'IAC Films' ...
 'Pure Flix Entertainment' 'Haunted Gay Ride Productions'
 'Jerry Gershwin Productions']

Unique values in column 'Country':
['UK' 'South Korea' 'USA' 'Germany' 'Hong Kong' 'Canada' 'Sweden'
 'Ireland' 'Japan' 'China' 'France' 'Brazil' 'Czechia' 'New Zealand'
 'Australia' 'India' 'Italy' 'Austria' 'Poland' 'Spain' 'Denmark'
 'Colombia' 'Belgium' 'Netherlands' 'Czechoslovakia' 'Bahamas' 'Turkey'
 'United Arab Emirates' 'Mexico' 'USSR' 'Taiwan' 'South Africa' 'Hungary'
 'Chile' 'Finland' 'Serbia' 'Bulgaria' 'Argent

We can see that we have a lot of different countries, languages, studios, film names, and etc. Our genres are the only limited category here as we only have 18 genres. 

In [17]:
Q1 = df['Duration (minutes)'].quantile(0.25)
Q3 = df['Duration (minutes)'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

long_movies = df[(df['Duration (sss)'] > upper_bound)]
plot_features(long_movies, "Long Movies Feature Plots")

KeyError: 'Duration (sss)'

We have found the duration outliers that are long movies longer than around 2 hours and a half. The main differences seem to be that longer movies are much fewer comedies but more action, fantasy, history, and action. They are slightly more non western (less european languages), they have more fans, likes, ratings and are on more lists and tend to be as watched as regular movies. They tend to be more prominent in the 60s-80s compared to regular movies

In [None]:
long_movies.sort_values(by='Duration (minutes)', ascending=False).head(10)

Unnamed: 0,Movie Name,Genre,Studio,Country,Primary Language,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
3688,Out 1,Drama,Les Films du Losange,France,French,1971,4.06,3762,139,6117,11376,2174,743
2032,War and Peace,Drama,Mosfilm,USSR,Russian,1967,4.3,9671,352,14695,17006,5032,422
2552,The Best of Youth,History,BiBi Film,Italy,Italian,2003,4.28,10352,843,15850,13386,5785,366
1591,Napoleon,Drama,Société Westi,France,No spoken language,1927,4.23,11438,302,17978,25967,5666,333
2712,The Kingdom,Horror,Greco,Germany,Danish,1994,4.03,12902,95,21085,10421,6257,277
5731,"Dr. Mabuse, the Gambler",Crime,Uco-Film GmbH,Germany,No spoken language,1922,3.95,8880,52,14738,16783,3502,271
2391,Cleopatra,History,MCL Films S.A.,UK,English,1963,3.48,20379,135,34871,15268,5806,248
1221,Kill Bill: The Whole Bloody Affair,Thriller,Super Cool ManChu,Japan,English,2006,4.33,54677,2600,106705,15113,28679,247
3530,Zack Snyder’s Justice League,Action,Warner Bros. Pictures,USA,English,2021,3.41,420870,2400,579935,129900,145156,242
2356,Hamlet,Drama,Castle Rock Entertainment,UK,English,1996,3.68,25294,141,48242,11647,8618,242


We can see that the longest movie on our list is a french movie from the 1970s called Out 1 which had been divided into 100-90 minute parts according toi wikipedia. We can see that the movies tend to be less English, higher rated, and older. They're also not comedies and have middling popularity.

In [18]:
mute_movies = df[df['Primary Language'] == 'No spoken language']
plot_features(mute_movies)

We can see that nearly all mute movies are older, but not all. They are more diverse in genre and are similar to the general genres, and have lower durations compared to the average. Their ratings vary a bit more and they are less popular well liked and watched than other movies. 

In [None]:
mute_movies.head(10)

Unnamed: 0,Movie Name,Genre,Studio,Country,Primary Language,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
407,The Red Turtle,Family,Wild Bunch,Belgium,No spoken language,2016,3.87,60262,435,88615,29180,23314,80
495,The Gold Rush,Drama,Charles Chaplin Productions,USA,No spoken language,1925,4.14,64458,208,109093,45669,27401,95
1288,The Phantom Carriage,Horror,SF Studios,Sweden,No spoken language,1921,4.07,23879,164,35985,31400,10495,106
1576,Faust,Fantasy,UFA,Germany,No spoken language,1926,4.09,19504,186,30107,24333,8902,116
1591,Napoleon,Drama,Société Westi,France,No spoken language,1927,4.23,11438,302,17978,25967,5666,333
1955,The Phantom of the Opera,Drama,Universal Pictures,USA,No spoken language,1925,3.72,23211,123,38084,26544,9380,107
2076,Silent Night,Action,Thunder Road,USA,No spoken language,2023,2.38,31400,4,37100,12336,4975,104
2161,Begotten,Fantasy,Theatre Of Material,USA,No spoken language,1989,2.97,20033,223,33200,13114,6462,72
2189,The Circus,Romance,Charles Chaplin Productions,USA,No spoken language,1928,4.03,29845,86,47382,20741,12195,72
2237,Strike,Drama,Proletkult,USSR,No spoken language,1925,3.88,16372,63,27940,18129,6395,89


Here we can see the top 10 most popular no spoken movies. We can see that metro golden mayer is an old studio as well because they have older movies

In [62]:
def plot_trends_by_year_and_category(df, year_column, numerical_column, categorical_column, top_n=5):
    top_categories = df[categorical_column].value_counts().nlargest(top_n).index
    filtered_df = df[df[categorical_column].isin(top_categories)]
    grouped_df = filtered_df.groupby([year_column, categorical_column]).agg(
        total_value=(numerical_column, 'sum'),
        count=(categorical_column, 'count')
    ).reset_index()
    fig = px.line(grouped_df, x=year_column, y='total_value', color=categorical_column,
                  title=f'{numerical_column} Trends by Year for Top {top_n} {categorical_column}',
                  labels={year_column: 'Year', 'total_value': numerical_column, categorical_column: categorical_column})
    fig.show()


In [43]:
def plot_trends_by_year_cumulative(df, year_column, numerical_categories, nrows=1, ncols=1):
    fig = make_subplots(rows=nrows, cols=ncols, subplot_titles=[f'Cumulative Trends by Year'])

    for col_num, numerical_column in enumerate(numerical_categories, start=1):
        grouped_df = df.groupby(year_column).agg(
            total_value=(numerical_column, 'sum'),
            count=(year_column, 'size')
        ).reset_index()

        fig.add_trace(
            go.Scatter(x=grouped_df[year_column], y=grouped_df['total_value'], name=numerical_column, mode='lines'),
            row=1, col=1
        )

    fig.update_xaxes(title_text='Year', row=nrows, col=ncols)
    fig.update_yaxes(title_text='Value', row=nrows, col=ncols)
    fig.update_layout(title_text='Cumulative Trends by Year', height=600, width=800)

    fig.show()

In [46]:
numerical_categories = ["Raters", "Likes", "Lists", "Watched", "Fans"]
plot_trends_by_year_cumulative(df, year_column='Release Year', numerical_categories = numerical_categories)

As we can see the sum of all watched, liked, listed, and rated movies has been rising exponentially over the years until 2020, when covid hit and massively reduced all of these statistics. To see if the reason for it was the lack of new movies or the covid movies being much less popular, we have to see.

In [67]:
numerical_categories = ["Watched", "Fans"]
for num_cat in numerical_categories:
    plot_trends_by_year_and_category(df, year_column='Release Year', numerical_column=num_cat, categorical_column="Genre")

We can see that Comedy and Drama films are constantly battling for the number one place, with lately comedy being the winner.Drama however has a lot more fans as we can see and only lately has it been taken over by comedy. Romance movies have quite a few fans as well compared to their watch statistics

In [70]:
def plot_trends_by_year_average(df, year_column, numerical_column):
    grouped_df = df.groupby(year_column).agg(
        average_value=(numerical_column, 'mean'),
        count=(year_column, 'size')
    ).reset_index()
    fig_value = px.line(grouped_df, x=year_column, y='average_value',
                        title=f'Average {numerical_column} Trends by Year',
                        labels={year_column: 'Year', 'average_value': f'Average {numerical_column}'})
    
    fig_value.show()

In [71]:
def plot_trends_by_year_and_category_average(df, year_column, numerical_column, categorical_column, top_n=5):
    top_categories = df[categorical_column].value_counts().nlargest(top_n).index
    filtered_df = df[df[categorical_column].isin(top_categories)]
    grouped_df = filtered_df.groupby([year_column, categorical_column]).agg(
        total_value=(numerical_column, 'mean'),
        count=(categorical_column, 'count')
    ).reset_index()
    fig = px.line(grouped_df, x=year_column, y='total_value', color=categorical_column,
                  title=f'{numerical_column} Trends by Year for Top {top_n} {categorical_column}',
                  labels={year_column: 'Year', 'total_value': numerical_column, categorical_column: categorical_column})
    fig.show()


In [72]:
numerical_categories = ["Watched", "Fans", "Duration (minutes)", "Avg Rating"]
for num_cat in numerical_categories:
    plot_trends_by_year_average(df, year_column='Release Year', numerical_column=num_cat)

We can see that the average rating has been consistently decreasing. This may be because the quality of modern films has been decreasing, or that we don't have popular older movies that the people disliked and gave a low rating to since bad old movies are just not watched anymore and don't really gain traction compared to a new bad movie that everyone talks about because of how bad it is. Movie durations have stayed the same throuought time, and indeed it seems like movies made during covid were much less popular. The most fan favored movies seem to be from the late 90s onwards and mid 1970s to early 1980s. 

In [73]:
numerical_categories = ["Watched", "Fans", "Duration (minutes)", "Avg Rating"]
for num_cat in numerical_categories:
    plot_trends_by_year_and_category_average(df, year_column='Release Year', numerical_column=num_cat, categorical_column="Genre")

The genre trends seem to be similar, with only slight differences due to the popularity of the genre itself. There are some weird peaks and wells as well.

In [112]:

def plot_top_genre_trends_by_year(df, year_column, categorical_column, top_n=5):
    top_genres = df[categorical_column].value_counts().nlargest(top_n).index
    filtered_df = df[df[categorical_column].isin(top_genres)]
    grouped_df = filtered_df.groupby([year_column, categorical_column]).size().reset_index(name='count')
    fig = px.line(grouped_df, x=year_column, y='count', color=categorical_column,
                  title=f'Top {top_n} {categorical_column} Counts by Year',
                  labels={year_column: 'Year', 'count': 'Count', categorical_column: categorical_column})
    
    fig.show()

In [113]:
plot_top_genre_trends_by_year(df, "Release Year", "Genre")

The most popular movies seem to be dramas Except during the early 1980s where horror was the most popular genre and brief moments where comedy took over. note that this is the *number* of movies and not the total watch count of movies in a genre.

In [114]:
plot_top_genre_trends_by_year(df, "Release Year", "Studio")

The data's too low to do any proper analyses on it

In [115]:
plot_top_genre_trends_by_year(df, "Release Year", "Primary Language")

English dominates Letterbxd, with foreign movies such as spanish or french only gaining a bit traction after covid

In [116]:
plot_top_genre_trends_by_year(df, "Release Year", "Country")

The Letterbxd data set is very dominated by America with little change over the years for movies set in other countries

In [76]:
def plot_top_genre_trends_by_year_excluding_most_popular(df, year_column, categorical_column, top_n=5):
    top_genres = df[categorical_column].value_counts().nlargest(top_n + 1).index
    top_genres = top_genres[1:]
    filtered_df = df[df[categorical_column].isin(top_genres)]
    grouped_df = filtered_df.groupby([year_column, categorical_column]).size().reset_index(name='count')
    fig = px.line(grouped_df, x=year_column, y='count', color=categorical_column,
                  title=f'Top {top_n} {categorical_column} Counts by Year (Excluding Most Popular {categorical_column})',
                  labels={year_column: 'Year', 'count': 'Count', categorical_column: categorical_column})
    
    fig.show()

In [77]:
plot_top_genre_trends_by_year_excluding_most_popular(df, year_column='Release Year', categorical_column='Country')

If we exclude America we can see the rise of other countries over time although the UK is usually the first country. Canada has seen a rise in the number of its films and Germany has seen a fall after its peak during the 2000s. Japan and France have remained consistent

In [119]:
plot_top_genre_trends_by_year_excluding_most_popular(df, year_column='Release Year', categorical_column='Primary Language')

French movies have once again taken over Japanese movies, with Spanish and Italian movies rising.

Next lets take a look at the top 3 studios, genres, countries, and primary languages and their features

In [90]:
top_studios = df["Studio"].value_counts().nlargest(3).index
for studio in top_studios:
    studio_df = df[df["Studio"] == studio]
    plot_features(studio_df, studio, 10)

In [94]:
top_countries = df["Country"].value_counts().nlargest(4).index
for country in top_countries:
    country_df = df[df["Country"] == country]
    plot_features(country_df, country, 10)

In [93]:
top_languages = df["Primary Language"].value_counts().nlargest(4).index
for language in top_languages:
    language_df = df[df["Primary Language"] == language]
    plot_features(language_df, language, 10)

In [95]:
top_genres = df["Genre"].value_counts().nlargest(3).index
for genre in top_genres:
    genre_df = df[df["Genre"] == genre]
    plot_features(genre_df, genre, 10)

In [83]:
temp_df = df.sort_values(by=['Watched'], ascending = False )
temp_df.head(10)

Unnamed: 0,Movie Name,Genre,Studio,Country,Primary Language,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
7,Joker,Crime,Warner Bros. Pictures,Canada,English,2019,3.84,2646583,23000,3802316,290806,1552543,122
1,Parasite,Comedy,Barunson E&A,South Korea,Korean,2019,4.56,2844133,90000,3801915,521089,2176138,133
3,Fight Club,Drama,Fox 2000 Pictures,Germany,English,1999,4.27,2568765,114000,3726744,415981,1719180,139
0,Barbie,Comedy,LuckyChap Entertainment,UK,English,2023,3.86,2980018,26000,3658001,448304,1586062,114
6,Interstellar,Science Fiction,Legendary Pictures,UK,English,2014,4.35,2445086,194000,3493990,445197,1621993,169
16,The Dark Knight,Action,DC Comics,UK,English,2008,4.47,2262631,82000,3466661,461171,1557080,152
9,Spider-Man: Into the Spider-Verse,Adventure,Columbia Pictures,USA,English,2018,4.42,2445436,76000,3415337,444758,1757390,117
17,Inception,Action,Legendary Pictures,UK,English,2010,4.19,2157509,59000,3380615,378632,1374049,148
10,Pulp Fiction,Crime,Miramax,USA,English,1994,4.26,2195213,72000,3337922,411328,1476451,154
14,Get Out,Horror,Monkeypaw Productions,USA,English,2017,4.16,2317735,24000,3305206,361479,1461192,104


In [84]:
temp_df = df.sort_values(by=['Likes'], ascending = False)
temp_df.head(10)

Unnamed: 0,Movie Name,Genre,Studio,Country,Primary Language,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
1,Parasite,Comedy,Barunson E&A,South Korea,Korean,2019,4.56,2844133,90000,3801915,521089,2176138,133
9,Spider-Man: Into the Spider-Verse,Adventure,Columbia Pictures,USA,English,2018,4.42,2445436,76000,3415337,444758,1757390,117
3,Fight Club,Drama,Fox 2000 Pictures,Germany,English,1999,4.27,2568765,114000,3726744,415981,1719180,139
6,Interstellar,Science Fiction,Legendary Pictures,UK,English,2014,4.35,2445086,194000,3493990,445197,1621993,169
0,Barbie,Comedy,LuckyChap Entertainment,UK,English,2023,3.86,2980018,26000,3658001,448304,1586062,114
16,The Dark Knight,Action,DC Comics,UK,English,2008,4.47,2262631,82000,3466661,461171,1557080,152
7,Joker,Crime,Warner Bros. Pictures,Canada,English,2019,3.84,2646583,23000,3802316,290806,1552543,122
10,Pulp Fiction,Crime,Miramax,USA,English,1994,4.26,2195213,72000,3337922,411328,1476451,154
2,Everything Everywhere All at Once,Science Fiction,IAC Films,USA,English,2022,4.3,2345715,130000,2908165,494949,1473174,140
14,Get Out,Horror,Monkeypaw Productions,USA,English,2017,4.16,2317735,24000,3305206,361479,1461192,104


The four top 10s once agian prove that these 4 categories, Raters Watched Lists and likes are highly correlated as even the top 10 movies for each of them are very similar compared to say the top 10 high fan count movies below

In [85]:
temp_df = df.sort_values(by=['Fans'], ascending = False)
temp_df.head(10)

Unnamed: 0,Movie Name,Genre,Studio,Country,Primary Language,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
4,La La Land,Drama,Summit Entertainment,Hong Kong,English,2016,4.09,2285536,204000,3199145,462742,1422472,129
6,Interstellar,Science Fiction,Legendary Pictures,UK,English,2014,4.35,2445086,194000,3493990,445197,1621993,169
2,Everything Everywhere All at Once,Science Fiction,IAC Films,USA,English,2022,4.3,2345715,130000,2908165,494949,1473174,140
3,Fight Club,Drama,Fox 2000 Pictures,Germany,English,1999,4.27,2568765,114000,3726744,415981,1719180,139
11,Whiplash,Drama,Bold Films,USA,English,2014,4.43,2140912,110000,2896085,376472,1371917,107
27,Eternal Sunshine of the Spotless Mind,Science Fiction,Focus Features,USA,English,2004,4.25,1577981,106000,2343738,356353,1013236,108
36,Little Women,Romance,Columbia Pictures,USA,English,2019,4.14,1445980,98000,2036087,352741,918540,135
59,Dead Poets Society,Drama,Touchstone Pictures,USA,English,1989,4.26,1165064,95000,1835170,262277,762244,128
1,Parasite,Comedy,Barunson E&A,South Korea,Korean,2019,4.56,2844133,90000,3801915,521089,2176138,133
16,The Dark Knight,Action,DC Comics,UK,English,2008,4.47,2262631,82000,3466661,461171,1557080,152


In [86]:
temp_df = df.sort_values(by=['Avg Rating'], ascending = False)
temp_df.head(10)

Unnamed: 0,Movie Name,Genre,Studio,Country,Primary Language,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
3408,Come and See,War,Belarusfilm,USSR,Russian,1985,4.64,243027,12000,309716,133644,128776,142
236,12 Angry Men,Drama,United Artists,USA,English,1957,4.62,634397,32000,929120,197288,365956,97
4540,Twin Peaks,TV Movie,Lynch/Frost Productions,USA,English,1989,4.6,128682,3100,221497,37511,91460,116
3441,Seven Samurai,Drama,TOHO,Japan,Japanese,1954,4.6,240244,9600,361879,151302,140259,207
5229,National Theatre Live: Fleabag,Comedy,DryWrite,UK,English,2019,4.6,98654,2200,145529,16598,63627,80
229,The Godfather: Part II,Crime,Paramount,USA,English,1974,4.59,703609,25000,1097384,218087,397401,202
1,Parasite,Comedy,Barunson E&A,South Korea,Korean,2019,4.56,2844133,90000,3801915,521089,2176138,133
946,The Human Condition III: A Soldier’s Prayer,History,Ninjin Club,Japan,Japanese,1961,4.56,15332,545,21593,22593,8966,190
66,The Shawshank Redemption,Crime,Castle Rock Entertainment,USA,English,1994,4.55,1329558,65000,2051940,252557,796083,142
45,The Godfather,Crime,Paramount,USA,English,1972,4.55,1317613,62000,2020443,331347,806768,175


In [87]:
temp_df = df.sort_values(by=['Avg Rating'])
temp_df.head(10)

Unnamed: 0,Movie Name,Genre,Studio,Country,Primary Language,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
2142,Dragonball Evolution,Thriller,World Film Magic,Hong Kong,English,2009,0.88,58311,86,89159,8076,2249,85
534,365 Days,Drama,Ekipa,Poland,Polish,2020,1.04,103343,123,159838,12556,5541,116
5117,The Last Airbender,Adventure,Paramount,USA,English,2010,1.07,184665,161,315695,25658,8710,103
5679,365 Days: This Day,Drama,Ekipa,Poland,Polish,2022,1.07,28808,15,41500,5491,1363,111
380,Winnie the Pooh: Blood and Honey,Horror,Jagged Edge Productions,UK,English,2023,1.1,75025,55,87257,24575,6134,84
1073,Slender Man,Horror,Madhouse Entertainment,USA,English,2018,1.18,75628,38,106970,15637,4029,93
5444,He’s All That,Comedy,Miramax,USA,English,2021,1.21,117501,73,168960,16418,7565,91
5345,The Emoji Movie,Animation,Columbia Pictures,USA,English,2017,1.22,153553,444,247965,21465,10191,86
589,The Kissing Booth 3,Romance,Komixx Entertainment,South Africa,English,2021,1.24,98285,57,159526,13628,7289,112
2307,Epic Movie,Comedy,Regency Enterprises,USA,English,2007,1.24,48716,48,92143,7812,3657,85


As we can see some the worst rated movies are the more "popular" movies because people have watched it but ended up not liking it and thus they have high watch counts but abysmal ratings

In [88]:
df[df['Country'] == 'Iran'].head(20).sort_index()

Unnamed: 0,Movie Name,Genre,Studio,Country,Primary Language,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
1324,"Life, and Nothing More…",Drama,Kanoon,Iran,Persian (Farsi),1992,4.24,22662,389,32201,20702,12781,95
1686,A Moment of Innocence,Drama,Pakhshiran,Iran,Persian (Farsi),1996,4.21,14028,489,20939,14813,8224,78
1830,Children of Heaven,Family,Kanoon,Iran,Persian (Farsi),1997,4.11,27088,553,41933,14444,12648,89
1974,The Wind Will Carry Us,Drama,MK2 Films,Iran,Persian (Farsi),1999,4.03,20736,340,31784,14222,9700,118
2197,Hit the Road,Comedy,JP Production,Iran,Persian (Farsi),2021,3.9,22291,316,28451,11276,9614,94
2614,Taxi,Comedy,Jafar Panahi Film Productions,Iran,Persian (Farsi),2015,3.8,20792,81,31723,10221,7685,82
4028,Ten,Drama,Abbas Kiarostami Productions,Iran,Persian (Farsi),2002,3.84,11119,93,17585,8500,4670,94
4078,Leila’s Brothers,Drama,Iris Film,Iran,Persian (Farsi),2022,3.42,16694,210,21335,4749,6372,170
4682,Close-Up,Drama,Kanoon,Iran,Persian (Farsi),1990,4.41,79355,3700,113884,59900,47133,98
4739,A Separation,Drama,Asghar Farhadi Productions,Iran,Persian (Farsi),2011,4.36,114826,2100,171752,58582,54697,123


In [89]:
df[df['Primary Language'] == 'Japanese'].head(10)

Unnamed: 0,Movie Name,Genre,Studio,Country,Primary Language,Release Year,Avg Rating,Raters,Fans,Watched,Lists,Likes,Duration (minutes)
29,Spirited Away,Family,Studio Ghibli,Japan,Japanese,2001,4.45,1808752,76000,2738830,375921,1288809,125
92,Howl’s Moving Castle,Fantasy,Studio Ghibli,Japan,Japanese,2004,4.32,1148541,72000,1727094,257402,766309,119
132,My Neighbor Totoro,Fantasy,Studio Ghibli,Japan,Japanese,1988,4.19,969736,16000,1566225,226523,597524,86
187,Princess Mononoke,Adventure,Studio Ghibli,Japan,Japanese,1997,4.37,785444,37000,1206093,217646,485612,134
190,The Boy and the Heron,Animation,Studio Ghibli,Japan,Japanese,2023,3.96,590577,3200,705674,169706,270033,124
207,Perfect Blue,Thriller,Madhouse,Japan,Japanese,1997,4.4,556200,31000,745396,192567,342202,82
259,Your Name.,Romance,CoMix Wave Films,Japan,Japanese,2016,4.21,747602,26000,1086497,157140,413824,106
338,Good Morning,Family,Shochiku,Japan,Japanese,1959,4.18,41080,714,57407,29772,23042,94
357,"Night Is Short, Walk on Girl",Romance,Science SARU,Japan,Japanese,2017,4.17,45743,2000,62339,25231,26632,93
413,Lady Snowblood,Action,Tokyo Eiga,Japan,Japanese,1973,4.06,44688,428,61183,30462,23062,97
