In [12]:
import pandas as pd
import numpy as np
import seaborn as sb
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from collections import namedtuple
import ydata_profiling as pandas_profiling
from IPython.display import display


In [4]:
# Load the raw movie table from the working directory, decoding it as latin1.
df = pd.read_csv(
    filepath_or_buffer='Movies.csv',
    encoding='latin1',
)

In [5]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [None]:
# Build the automated profiling report for the frame and render it in the notebook.
report = pandas_profiling.ProfileReport(df=df)
display(report)

In [None]:
pip install ipywidgets


In [14]:
# Remove rows in which every one of the columns at positions 1..8 is missing.
df.dropna(
    axis='index',
    how='all',
    subset=list(df.columns[1:9]),
    inplace=True,
)

In [15]:
# De-duplicate on the (Name, Year) pair, retaining the first occurrence of each.
df.drop_duplicates(
    inplace=True,
    keep='first',
    subset=['Name', 'Year'],
)

In [16]:
# Remove rows in which the columns at positions 1, 2, 4 and 5 are all missing.
df.dropna(
    axis='index',
    how='all',
    subset=df.columns.take([1, 2, 4, 5]),
    inplace=True,
)

In [20]:
# Strip the parentheses from 'Year' and the ' min' suffix from 'Duration'.
# Both columns remain strings after this step.
year_text = df['Year'].str
df['Year'] = year_text.replace('(', '', regex=False).str.replace(')', '', regex=False)
duration_text = df['Duration'].str
df['Duration'] = duration_text.replace(' min', '', regex=False)

In [21]:
# Keep every row whose 'Year' string is anything other than '2022'
# (rows with a missing year are kept as well).
df = df.loc[df['Year'].ne('2022')]

In [22]:
# Report the (rows, columns) of the frame at this point in the notebook.
shape_after_cleaning = df.shape
print(f"Cleaned dataset shape: {shape_after_cleaning}")

Cleaned dataset shape: (15046, 10)


In [23]:

# Tabulate how many rows fall in each 'Year' value as a two-column frame
# with columns 'Year' and 'Count', ordered by descending count.
year_count = (
    df['Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Count')
)

In [25]:
# Bar chart of the number of movies per release year, with the count printed
# above each bar (formatted to two significant digits with an SI suffix).
fig = px.bar(
    year_count,
    x='Year',
    y='Count',
    text='Count',
    title='Number of Movies Released by Year',
)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
# Axis titles are styled through `title.font`: the older `titlefont` attribute
# (spelled `titlefont_size` here previously) is deprecated and is rejected by
# newer plotly releases.
fig.update_layout(
    xaxis=dict(
        title=dict(text='Year of Movie Release', font=dict(size=16)),
    ),
    yaxis=dict(
        title=dict(text='Count of Movies Released', font=dict(size=16)),
        tickfont_size=14,
    ),
)
fig.show()

In [26]:
# One-hot encode the comma-separated 'Genre' strings (one indicator column per
# genre) and append those indicator columns to the right of the movie frame.
dummies = df['Genre'].str.get_dummies(sep=', ')
df_genre = pd.concat((df, dummies), axis='columns')

In [27]:
# The genre indicator columns are exactly the columns of `dummies`; selecting
# them by name avoids relying on the movie frame having exactly 10 columns.
genre_columns = dummies.columns
# Per-year total of each genre indicator, i.e. number of movies tagged with it.
genre_count_by_year = df_genre.groupby('Year')[genre_columns].sum().reset_index()

# One line per genre, all sharing the same year axis.
fig = go.Figure(
    data=[
        go.Scatter(
            x=genre_count_by_year['Year'],
            y=genre_count_by_year[genre],
            mode='lines',
            name=genre,
        )
        for genre in genre_columns
    ]
)

# Axis titles are styled through `title.font`: the older `titlefont` attribute
# (spelled `titlefont_size` here previously) is deprecated and is rejected by
# newer plotly releases.
fig.update_layout(
    title='Genre Trends Over the Years',
    xaxis=dict(title=dict(text='Year', font=dict(size=16))),
    yaxis=dict(title=dict(text='Count', font=dict(size=16)), tickfont_size=14),
    legend=dict(y=0, x=1.0, bgcolor='rgba(255, 255, 255, 0)', bordercolor='rgba(255, 255, 255, 0)')
)
fig.show()

In [28]:
# Reshape the three actor columns to long form: one row per (Year, actor) credit.
#
# The actor *names* are the melted values, so they must go to the 'Actor' column
# (value_name); the originating column label ('Actor 1' / 'Actor 2' / 'Actor 3')
# goes to 'Role' (var_name). With the two swapped, 'Actor' only ever contained
# those three labels, so dropping missing actors was a no-op and any ranking of
# actor_df['Actor'] ranked the column labels instead of people.
actor_cols = ['Actor 1', 'Actor 2', 'Actor 3']
actor_df = (
    pd.melt(
        df[['Year'] + actor_cols],
        id_vars=['Year'],
        value_vars=actor_cols,
        var_name='Role',
        value_name='Actor',
    )
    # Discard credits where no actor is recorded.
    .dropna(subset=['Actor'])
    # Each remaining row is one movie credit; keeps the 'Movie_Count' column
    # available with the same meaning it has in director_df.
    .assign(Movie_Count=1)
)

In [29]:
# The 20 most frequent values of the 'Actor' column, most frequent first ...
actor_frequency = actor_df['Actor'].value_counts()
top_20_actors = list(actor_frequency.iloc[:20].index)
# ... and the rows of actor_df whose 'Actor' value is one of them.
is_top_actor = actor_df['Actor'].isin(top_20_actors)
top_20_actor_df = actor_df.loc[is_top_actor]

In [30]:
# Strip plot with one horizontal lane (and colour) per 'Actor' value and one
# marker per row, positioned by 'Year'.
fig = px.strip(
    top_20_actor_df,
    x='Year',
    y='Actor',
    color='Actor',
    title='Top 20 Actors by Number of Movies Made Over the Years',
)
fig.update_layout(
    height=600,
    xaxis=dict(tickfont=dict(size=14)),
)
fig.show()

In [31]:
# One row per movie that has both a director and a year recorded, with a
# constant 'Movie_Count' of 1 on every row.
director_df = (
    df.loc[:, ['Director', 'Year']]
    .dropna()
    .assign(Movie_Count=1)
)

In [32]:
# The 20 most frequent directors, most frequent first ...
director_frequency = director_df['Director'].value_counts()
top_20_directors = list(director_frequency.iloc[:20].index)
# ... and the rows of director_df that belong to one of them.
is_top_director = director_df['Director'].isin(top_20_directors)
top_20_director_df = director_df.loc[is_top_director]