In [46]:
#import  libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px

In [47]:
# Load the top-rated movies dataset from the CSV file (path is relative to the working directory)
trm_df = pd.read_csv(filepath_or_buffer="top-rated-movies.csv")

In [48]:
# Preview the first five rows (head() returns 5 rows by default)
trm_df.head()

Unnamed: 0.1,Unnamed: 0,id,original_language,original_title,title,overview,release_date,vote_average,vote_count,popularity
0,0,238,en,The Godfather,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,8.7,18805,116.429
1,1,278,en,The Shawshank Redemption,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,1994-09-23,8.7,24783,105.427
2,2,240,en,The Godfather Part II,The Godfather Part II,In the continuing saga of the Corleone crime f...,1974-12-20,8.6,11367,68.6
3,3,424,en,Schindler's List,Schindler's List,The true story of how businessman Oskar Schind...,1993-12-15,8.6,14686,72.058
4,4,19404,hi,दिलवाले दुल्हनिया ले जायेंगे,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",1995-10-20,8.6,4266,34.449


In [49]:
# Print a structural summary of the frame: column names, non-null counts, dtypes and memory usage
trm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9980 entries, 0 to 9979
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         9980 non-null   int64  
 1   id                 9980 non-null   int64  
 2   original_language  9980 non-null   object 
 3   original_title     9980 non-null   object 
 4   title              9980 non-null   object 
 5   overview           9978 non-null   object 
 6   release_date       9980 non-null   object 
 7   vote_average       9980 non-null   float64
 8   vote_count         9980 non-null   int64  
 9   popularity         9980 non-null   float64
dtypes: float64(2), int64(3), object(5)
memory usage: 779.8+ KB


In [50]:
# Count the missing values in each column
trm_df.isna().sum()

Unnamed: 0           0
id                   0
original_language    0
original_title       0
title                0
overview             2
release_date         0
vote_average         0
vote_count           0
popularity           0
dtype: int64

In [51]:
# Parse 'release_date' into datetime values; entries that cannot be parsed
# become NaT because of errors='coerce'
trm_df = trm_df.assign(
    release_date=pd.to_datetime(trm_df['release_date'], errors='coerce')
)

In [52]:
# Drop the unwanted 'Unnamed: 0' column (presumably a leftover index column
# from an earlier CSV export -- confirm against the data source).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second run is a no-op instead of a KeyError.
trm_df = trm_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [53]:
# Fill missing 'overview' entries with a placeholder text.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on trm_df['overview'] acts on an intermediate Series, which raises a
# FutureWarning in pandas 2.x and leaves the frame unchanged under copy-on-write.
trm_df['overview'] = trm_df['overview'].fillna('No overview available')


In [54]:
# Top 20 Movies by Vote Average
# (the selection takes 20 rows, matching the chart title; the previous
# "Top 10" comment and variable name did not match the code)
top20_vote_average = trm_df.sort_values(by='vote_average', ascending=False).head(20)

# Bar chart of the 20 highest-rated titles, coloured by their rating
fig_top20 = px.bar(top20_vote_average,
                   x='title',
                   y='vote_average',
                   title='Top 20 Movies by Vote Average',
                   hover_data=['vote_average'],
                   color='vote_average',
                   color_continuous_scale='Bluered',
                   width=900,
                   height=600)
fig_top20.show()


In [55]:
# Vote count vs popularity for the 20 movies with the highest vote count
top_20 = trm_df.sort_values(by='vote_count', ascending=False).head(20)

# Display names for the axes and the text shown when hovering over a point
scatter_labels = {'vote_count': 'Vote Count', 'popularity': 'Popularity'}
scatter_hover = "Movie: %{hovertext}<br>Vote Count: %{x}<br>Popularity: %{y}"

fig_scatter = px.scatter(
    top_20,
    x='vote_count',
    y='popularity',
    title='Top 20 Movies: Vote Count vs Popularity',
    labels=scatter_labels,
    hover_name='title',  # supplies the %{hovertext} value used in the template
    width=900,
    height=600,
).update_traces(hovertemplate=scatter_hover)
fig_scatter.show()


In [87]:
# Keep only movies whose release year falls in 2000-2015 (both ends inclusive)
release_year = trm_df['release_date'].dt.year
filtered_df = trm_df[release_year.between(2000, 2015)]

# The 20 highest-rated movies within that period
top_20_vote_average = filtered_df.sort_values(by='vote_average', ascending=False).head(20)

fig_top20 = px.bar(
    top_20_vote_average,
    x='title',
    y='vote_average',
    title='Top 20 Movies by Vote Average (Released between 2000 and 2015)',
    hover_data=['vote_average', 'release_date'],
    color='vote_average',
    color_continuous_scale='RdYlBu',
    width=900,
    height=600,
)
fig_top20.show()

In [75]:
# Pie chart of how many movies there are per original language
language_counts = trm_df['original_language'].value_counts()

fig_pie = px.pie(
    language_counts,
    values=language_counts.values,
    names=language_counts.index,
    title='Distribution of Movies by Original Language',
)

# Put percentage and language label inside each slice, then size the figure
(
    fig_pie
    .update_traces(textposition='inside', textinfo='percent+label')
    .update_layout(width=1000, height=600)
)

fig_pie.show()


In [74]:
# Box plot of vote average per original language, with every movie drawn as a point
box_labels = {'original_language': 'Original Language', 'vote_average': 'Vote Average'}

fig_box = px.box(
    trm_df,
    x='original_language',
    y='vote_average',
    title='Distribution of Vote Average by Original Language',
    hover_name='title',
    color='original_language',
    points='all',
    labels=box_labels,
)

# Set the axis titles explicitly, using the same display names as the labels
fig_box.update_layout(
    xaxis_title=box_labels['original_language'],
    yaxis_title=box_labels['vote_average'],
)
fig_box.show()


In [83]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures

In [76]:
#  Linear regression: predict vote_average from vote_count and popularity
X = trm_df[['vote_count', 'popularity']]
y = trm_df['vote_average']

# Split data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Example prediction for a new movie.
# The model was fitted on a DataFrame, so the example is also a DataFrame with
# the same column names; a bare nested list triggers scikit-learn's
# "X does not have valid feature names" warning.
new_movie = pd.DataFrame([[10000, 80]], columns=X.columns)  # vote_count, popularity
predicted_vote_average = model.predict(new_movie)
print(f'Predicted Vote Average for New Movie: {predicted_vote_average[0]}')


Mean Squared Error: 0.3728461183830983
Predicted Vote Average for New Movie: 7.2594959863362245


In [79]:
#  Decision tree regression: predict vote_average from vote_count and popularity
X = trm_df[['vote_count', 'popularity']]
y = trm_df['vote_average']

# Split data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = DecisionTreeRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Example prediction for a new movie.
# The model was fitted on a DataFrame, so the example is also a DataFrame with
# the same column names; a bare nested list triggers scikit-learn's
# "X does not have valid feature names" warning.
new_movie = pd.DataFrame([[10000, 80]], columns=X.columns)  # vote_count, popularity
predicted_vote_average = model.predict(new_movie)
print(f'Predicted Vote Average for New Movie: {predicted_vote_average[0]}')


Mean Squared Error: 0.7437725450901803
Predicted Vote Average for New Movie: 7.0


In [86]:
#  Polynomial regression: linear model on degree-2 features of vote_count and popularity
X = trm_df[['vote_count', 'popularity']].to_numpy()
y = trm_df['vote_average'].to_numpy()

# Hold out 20% of the rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the feature expansion on the training rows only, then reuse it for the test rows
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Ordinary linear regression on the expanded features (fit returns the estimator itself)
model_poly = LinearRegression().fit(X_train_poly, y_train)

# Score the held-out rows
y_pred_poly = model_poly.predict(X_test_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)
print(f'Mean Squared Error (Polynomial Regression): {mse_poly}')

# Example prediction for a new movie: expand the raw features the same way before predicting
new_movie = [[10000, 80]]
new_movie_poly = poly.transform(new_movie)
predicted_vote_average_poly = model_poly.predict(new_movie_poly)
print(f'Predicted Vote Average for New Movie (Polynomial Regression): {predicted_vote_average_poly[0]}')


Mean Squared Error (Polynomial Regression): 0.3713630750591436
Predicted Vote Average for New Movie (Polynomial Regression): 7.258727535593045
