In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the movie dataset from 'movies.csv' (a relative path, so it is resolved
# against the current working directory) into the frame used by the cells below.
df = pd.read_csv('movies.csv')
print("Data Loaded Successfully!")
# Last expression of the cell: preview the first rows of the frame.
df.head()

In [None]:
# info() prints its own report and returns None, so the cell shows no Out value.
print("\nDataset Info:")
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
print("\nStatistical Summary:")
# describe() with default arguments; the result is displayed as the cell output.
df.describe()

In [None]:
print("\nOverall Statistics About The DataFrame")
# include='all' asks describe() to summarise every column, not only the default subset.
df.describe(include='all')

In [None]:
# Column labels of the frame; the cells below refer to columns by these names.
df.columns

In [None]:
print("\nTitles of Movies Having Runtime Greater Than or Equal to 180 Minutes")
# One .loc lookup: boolean row mask (runtime >= 180) plus the 'Title' column label.
df.loc[df['Runtime (Minutes)'] >= 180, 'Title']

In [None]:
# Column labels again; this repeats the earlier `df.columns` cell as a quick reference.
df.columns

In [None]:
print("\nYear With The Highest Average Voting")
# Mean of 'Votes' within each 'Year', then ranked from highest to lowest.
votes_by_year = df.groupby('Year')['Votes'].mean()
votes_by_year.sort_values(ascending=False)

In [None]:
print("\nYear With The Highest Average Voting")
# The raw rows are passed in; seaborn aggregates 'Votes' per 'Year' itself
# using barplot's default estimator.
sns.barplot(data=df, x='Year', y='Votes')
plt.show()

In [None]:
print("\nYear With The Highest Average Revenue")
# Mean of 'Revenue (Millions)' within each 'Year', ranked from highest to lowest.
revenue_by_year = df.groupby('Year')['Revenue (Millions)'].mean()
revenue_by_year.sort_values(ascending=False)

In [None]:
# This cell charts 'Revenue (Millions)' per 'Year', so the heading names revenue
# (it previously repeated the "Voting" heading of the votes chart).
print("\nYear With The Highest Average Revenue")
# The raw rows are passed in; seaborn aggregates revenue per year itself
# using barplot's default estimator.
sns.barplot(x='Year',y='Revenue (Millions)',data=df)
plt.show()

In [None]:
print("\nYear With The Highest Average Revenue")
# idxmax() yields the index label (here the year) holding the largest yearly mean.
yearly_mean_revenue = df.groupby('Year')['Revenue (Millions)'].mean()
yearly_mean_revenue.idxmax()

In [None]:
# Pre-aggregate to one row per year so each bar is exactly the yearly mean revenue.
mean_revenue_per_year = df.groupby('Year')['Revenue (Millions)'].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.barplot(data=mean_revenue_per_year, x='Year', y='Revenue (Millions)')
# Rotate the year labels so they do not overlap.
plt.xticks(rotation=90)
plt.xlabel('Year')
plt.ylabel('Average Revenue (Millions)')
plt.title('Year With The Highest Average Revenue')
plt.show()

In [None]:
print("\nAverage Rating For Each Director")
# Mean of 'Rating' within each 'Director' group.
director_mean_rating = df.groupby('Director')['Rating'].mean()
director_mean_rating

In [None]:
print("\nTop 10 Lengthy Movies Titles and Runtimes")
# Ten rows with the largest runtime, trimmed to the title and runtime columns.
longest_movies = df.nlargest(n=10, columns='Runtime (Minutes)')
longest_movies[['Title', 'Runtime (Minutes)']]

In [None]:
# Horizontal bars: one per movie among the ten largest runtimes.
ten_longest = df.nlargest(10, 'Runtime (Minutes)')

plt.figure(figsize=(10, 6))
sns.barplot(data=ten_longest, x='Runtime (Minutes)', y='Title')
plt.xlabel('Runtime (Minutes)')
plt.ylabel('Movie Title')
plt.title('Top 10 Lengthy Movies')
plt.show()

In [None]:
print("\nNumber of Movies Per Year")
# Number of rows for each distinct value of 'Year'.
df['Year'].value_counts()

In [None]:
# Bar per year whose height is the number of rows (movies) with that year.
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='Year')
# Rotate the year labels so they do not overlap.
plt.xticks(rotation=90)
plt.xlabel('Year')
plt.ylabel('Number of Movies')
plt.title('Number of Movies Per Year')
plt.show()

In [None]:
print("\nMost Popular Movie Title (Highest Revenue)")
# Select every title whose revenue equals the column maximum (ties are all kept).
top_revenue = df['Revenue (Millions)'].max()
df.loc[df['Revenue (Millions)'] == top_revenue, 'Title']

In [None]:
print("\nTop 10 Highest Rated Movie Titles And Their Directors")
# Ten rows with the largest rating, trimmed to title, director and rating.
top_rated = df.nlargest(n=10, columns='Rating')
top_rated[['Title', 'Director', 'Rating']]

In [None]:
# Horizontal bars: one per movie among the ten largest ratings.
ten_best_rated = df.nlargest(10, 'Rating')

plt.figure(figsize=(10, 6))
sns.barplot(data=ten_best_rated, x='Rating', y='Title')
plt.xlabel('Rating')
plt.ylabel('Movie Title')
plt.title('Top 10 Highest Rated Movies')
plt.show()

In [None]:
print("\nAverage Rating of Movies Year Wise")
# Mean of 'Rating' within each 'Year'.
rating_by_year = df.groupby('Year')['Rating'].mean()
rating_by_year

In [None]:
# Pre-aggregate to one row per year so the line passes through the yearly means.
mean_rating_per_year = df.groupby('Year')['Rating'].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.lineplot(data=mean_rating_per_year, x='Year', y='Rating')
plt.xticks(rotation=90)
plt.xlabel('Year')
plt.ylabel('Average Rating')
plt.title('Average Rating of Movies Year Wise')
plt.show()

In [None]:
print("\nCorrelation Between Rating and Revenue")
# 2x2 correlation matrix of the two selected columns, using DataFrame.corr's defaults.
df[['Rating', 'Revenue (Millions)']].corr()

In [None]:
# One point per row: rating on the x axis against revenue on the y axis.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Rating', y='Revenue (Millions)')
plt.xlabel('Rating')
plt.ylabel('Revenue (Millions)')
plt.title('Rating vs Revenue')
plt.show()

In [None]:
def classify_rating(rating):
    """Map a numeric rating to a coarse category label.

    Args:
        rating: A number that can be compared with floats.

    Returns:
        'Excellent' when rating >= 8.0, 'Good' when 5.0 <= rating < 8.0,
        and 'Average' otherwise. A NaN rating fails every comparison and
        therefore also ends up as 'Average'.
    """
    # Thresholds are checked from highest to lowest; the first one met wins.
    for floor, label in ((8.0, 'Excellent'), (5.0, 'Good')):
        if rating >= floor:
            return label
    return 'Average'

# Label every row by passing its 'Rating' through classify_rating, element by element.
df['rating_category'] = df['Rating'].map(classify_rating)
print("\nClassification of Movies Based on Ratings")
# Number of movies that landed in each category.
df['rating_category'].value_counts()

In [None]:
# Bar per rating category whose height is the number of rows in that category.
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='rating_category')
plt.xlabel('Rating Category')
plt.ylabel('Number of Movies')
plt.title('Classification of Movies Based on Ratings')
plt.show()

In [None]:
print("\nNumber of Films of Each Genre")
# Counts rows per distinct value of the 'Genre' column.
# NOTE(review): if a cell of 'Genre' can hold several genres in one string, this counts
# each combination rather than each individual genre; confirm against the data.
df['Genre'].value_counts()