In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-movie-dataset-dec-2023/imdb_movie_data_2023.csv


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "darkgrid" style to every plot drawn after this cell.
sns.set(style="darkgrid")

In [None]:
def draw_bars(data_frame, xlabel, ylabel, title):
    """Draw an annotated vertical bar chart from the first two columns of a frame.

    The first column of ``data_frame`` supplies the x values and the second
    column the bar heights. Each bar is labelled with its height, formatted
    with thousands separators and no decimals.

    Args:
        data_frame: DataFrame with at least two columns.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Title of the chart.

    Returns:
        None. The chart is drawn on a newly created 14x5 inch figure.
    """
    fig, ax = plt.subplots(figsize=(14, 5))
    sns.barplot(data=data_frame, x=data_frame.columns[0], y=data_frame.columns[1], ax=ax)

    for bar in ax.patches:
        height = bar.get_height()
        if np.isnan(height):
            # A bar with no value has nowhere sensible to put a label.
            continue
        # Shift the label 5 points away from the axis: up for positive bars,
        # down for negative ones. Comparing against zero (instead of dividing
        # abs(height) by height) keeps zero-height bars from dividing by zero.
        offset = -5 if height < 0 else 5
        ax.annotate(f'{height:,.0f}', (bar.get_x() + bar.get_width() / 2., height),
                    ha='center', va='center', fontsize=10, color='black',
                    xytext=(0, offset), textcoords='offset points')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

In [None]:
# Candidate locations of the dataset, tried in order. The first is the path this
# notebook originally read from; the second is where the Kaggle input listing
# shows the file, so the notebook also runs when only that copy exists.
DATA_PATH_CANDIDATES = (
    '/dataset/imdb_movie_data_2023.csv',
    '/kaggle/input/imdb-movie-dataset-dec-2023/imdb_movie_data_2023.csv',
)
# If neither file exists, fall back to the first candidate so read_csv raises
# its usual FileNotFoundError for the original path.
data_path = next(
    (path for path in DATA_PATH_CANDIDATES if os.path.exists(path)),
    DATA_PATH_CANDIDATES[0],
)

# The first CSV column is used as the row index.
df = pd.read_csv(data_path, index_col=[0])
# Divide Meta Score by 10.
# NOTE(review): presumably this brings a 0-100 score onto the same 0-10 scale
# as Rating -- confirm against the data.
df['Meta Score'] = df['Meta Score'] / 10

In [None]:
# Summarize the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Rank directors by number of movies and keep the 20 most frequent.
# rename_axis pins the name column to 'Director' on every pandas version:
# before pandas 2.0, value_counts() left the index unnamed, so reset_index()
# produced an 'index' column and the 'Director' lookups below raised KeyError.
top_directors = (
    df['Director']
    .value_counts()
    .head(20)
    .rename_axis('Director')
    .reset_index(name='count')
)
# All movies directed by one of those top directors; later cells aggregate it.
df_top_directors = df[df['Director'].isin(top_directors['Director'])]

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=top_directors, y='Director', x='count', ax=ax)

for bar in ax.patches:
    count = bar.get_width()
    # Movie counts are always positive, so the label is simply shifted
    # 5 points to the right of the bar end (no sign computation needed).
    ax.annotate(f'{count:,.0f}', (count, bar.get_y() + bar.get_height() / 2.),
                ha='center', va='center', fontsize=10, color='black',
                xytext=(5, 0), textcoords='offset points')
ax.set_title('Movie Count per Top Directors')

Text(0.5, 1.0, 'Movie Count per Top Directors')


In [None]:
# Average rating for each of the top directors, highest average first.
mean_rating_per_director = (
    df_top_directors
    .groupby('Director')['Rating']
    .mean()
    .reset_index(name='mean_rating')
    .sort_values('mean_rating', ascending=False)
)

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=mean_rating_per_director, y='Director', x='mean_rating', ax=ax)

# Label every bar with its value (two decimals), anchored at the bar end and
# shifted 15 points along x in the direction of the value's sign.
for bar in ax.patches:
    bar_length = bar.get_width()
    label_xy = (bar_length, bar.get_y() + bar.get_height() / 2.)
    label_shift = (15 * (abs(bar_length) / bar_length), 0)
    ax.annotate(f'{bar_length:,.2f}', label_xy,
                xytext=label_shift, textcoords='offset points',
                ha='center', va='center', fontsize=10, color='black')
ax.set_title('Mean Rating per Top Directors')

Text(0.5, 1.0, 'Mean Rating per Top Directors')


In [None]:
# Average vote count for each of the top directors, largest average first.
votes_by_director = df_top_directors.groupby('Director')['Votes']
mean_votes_per_director = (
    votes_by_director
    .mean()
    .reset_index(name='mean_votes')
    .sort_values('mean_votes', ascending=False)
)

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=mean_votes_per_director, y='Director', x='mean_votes', ax=ax)

# Label every bar with its value (no decimals), anchored at the bar end and
# shifted 15 points along x in the direction of the value's sign.
for bar in ax.patches:
    bar_length = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2.
    ax.annotate(
        f'{bar_length:,.0f}',
        (bar_length, bar_middle),
        xytext=(15 * (abs(bar_length) / bar_length), 0),
        textcoords='offset points',
        ha='center', va='center', fontsize=10, color='black',
    )
ax.set_title('Mean Votes per Top Directors')

Text(0.5, 1.0, 'Mean Votes per Top Directors')


In [None]:
# Keep only the movies rated above 8 and list them from highest to lowest rating.
is_highly_rated = df['Rating'] > 8
top_movies = df.loc[is_highly_rated].sort_values('Rating', ascending=False)
top_movies

In [None]:
# Count how many movies each actor appears in, based on the comma-separated
# 'Cast' column. Rows without a cast are ignored.
df_no_cast_na = df.dropna(subset=['Cast'])
# Not used below; kept so any other cell that refers to it keeps working.
cast_list = df_no_cast_na['Cast'].tolist()

# One row per (movie, actor): split on commas, flatten, trim whitespace, and
# drop the empty names left behind by stray or trailing commas.
actor_names = (
    df_no_cast_na['Cast']
    .str.split(',')
    .explode()
    .str.strip()
    .loc[lambda names: names != '']
)
# value_counts() returns the counts already sorted from most to least frequent.
actor_counts = actor_names.value_counts()
actors_dict = actor_counts.to_dict()

# The 20 most frequent actors as a two-column frame; rename_axis/reset_index
# make the column names explicit regardless of pandas version.
top_actors_df = (
    actor_counts
    .head(20)
    .rename_axis('actor_name')
    .reset_index(name='frequency')
)

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=top_actors_df, y='actor_name', x='frequency', ax=ax)

for bar in ax.patches:
    frequency = bar.get_width()
    # Frequencies are always positive, so the label is simply shifted
    # 15 points to the right of the bar end (no sign computation needed).
    ax.annotate(f'{frequency:,.0f}', (frequency, bar.get_y() + bar.get_height() / 2.),
                ha='center', va='center', fontsize=10, color='black',
                xytext=(15, 0), textcoords='offset points')
ax.set_title('Top Actors by Movie Count')

Text(0.5, 1.0, 'Top Actors by Movie Count')


In [None]:
# Number of rows for each distinct 'PG Rating' value (missing values are not counted).
df['PG Rating'].value_counts()

In [None]:
# Histogram of the 'Year' column; the title is set on the Axes that histplot returns.
sns.histplot(data=df, x='Year').set_title('Movies by Year distribution')

Text(0.5, 1.0, 'Movies by Year distribution')


In [None]:
# Preview the first five rows again.
df.head()

In [None]:
# Scatter the rescaled 'Meta Score' against 'Rating', one point per row.
# The plot call and the title call must be separate statements: written on a
# single line with no separator they are a SyntaxError.
sns.scatterplot(data=df, x='Rating', y='Meta Score')
plt.title('Meta Score by IMDB Rating')

In [None]:
# Display the per-director mean-votes table built in an earlier cell.
mean_votes_per_director

Just checking whether I missed any movies by my favorite directors.


In [None]:
# Directors whose movies should all be listed.
my_favorite_directors = [
    'Christopher Nolan',
    'Quentin Tarantino',
    'Guy Ritchie',
]
# Every row directed by one of them, highest rated first.
by_favorite_director = df['Director'].isin(my_favorite_directors)
df.loc[by_favorite_director].sort_values('Rating', ascending=False)