In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 

import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
movie_data = pd.read_csv("/kaggle/input/imdb-india-movies/IMDb Movies India.csv", encoding='latin1')

In [3]:
movie_data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [4]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [5]:
for column in movie_data.columns:
    unique_values = movie_data[column].unique()
    print(f"Unique values for {column}:\n{unique_values}\n")


Unique values for Name:
[' ' '#Gadhvi (He thought he was Gandhi)' '#Homecoming' ... 'Zulmi Raj'
 'Zulmi Shikari' 'Zulm-O-Sitam']

Unique values for Year:
[nan '(2019)' '(2021)' '(2010)' '(1997)' '(2005)' '(2008)' '(2012)'
 '(2014)' '(2004)' '(2016)' '(1991)' '(1990)' '(2018)' '(1987)' '(1948)'
 '(1958)' '(2017)' '(2020)' '(2009)' '(2002)' '(1993)' '(1946)' '(1994)'
 '(2007)' '(2013)' '(2003)' '(1998)' '(1979)' '(1951)' '(1956)' '(1974)'
 '(2015)' '(2006)' '(1981)' '(1985)' '(2011)' '(2001)' '(1967)' '(1988)'
 '(1995)' '(1959)' '(1996)' '(1970)' '(1976)' '(2000)' '(1999)' '(1973)'
 '(1968)' '(1943)' '(1953)' '(1986)' '(1983)' '(1989)' '(1982)' '(1977)'
 '(1957)' '(1950)' '(1992)' '(1969)' '(1975)' '(1947)' '(1972)' '(1971)'
 '(1935)' '(1978)' '(1960)' '(1944)' '(1963)' '(1940)' '(1984)' '(1934)'
 '(1955)' '(1936)' '(1980)' '(1966)' '(1949)' '(1962)' '(1964)' '(1952)'
 '(1933)' '(1942)' '(1939)' '(1954)' '(1945)' '(1961)' '(1965)' '(1938)'
 '(1941)' '(1931)' '(1937)' '(2022)' '(1932)' '(

In [15]:
top_10_movies = movie_data.sort_values(by='Rating', ascending=False).head(10)

# Create a bar plot for the top 10 movies
fig = px.bar(top_10_movies, x='Genre', y='Rating', title='Top 10 Movies by Genre and Rating',
             labels={'Rating': 'Movie Rating', 'Genre': 'Movie Genre'},
             color='Rating', color_continuous_scale='Viridis',
             hover_data=['Director', 'Actor 1', 'Actor 2', 'Actor 3'])

fig.show()

In [9]:
import plotly.express as px

# Assuming your DataFrame is named 'movie_data'
# Let's focus on the relevant columns for visualization
columns_of_interest = ['Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
subset_data = movie_data[columns_of_interest]

# Pairwise scatter plots for numerical columns
#fig = px.scatter_matrix(subset_data, dimensions=['Duration', 'Rating', 'Votes'], color='Genre')
#fig.update_layout(title='Pairwise Scatter Plots for Numerical Columns')
#fig.show()

# Bar plots for categorical columns
for column in ['Year']:
    fig = px.bar(subset_data, x=column, y='Rating', title=f'Relationship between {column} and Rating')
    fig.show()

# Scatter plot for 'Votes'
fig = px.scatter(subset_data, x='Votes', y='Rating', title='Scatter Plot: Votes vs Rating')
fig.show()


In [16]:
movie_data['Name'] = movie_data['Name'].str.extract('([a-zA-Z\s]+)', expand=False)
movie_data['Votes'] = pd.to_numeric(movie_data['Votes'], errors='coerce')
movie_data['Duration'] = pd.to_numeric(movie_data['Duration'].str.replace('min', ''), errors='coerce')
movie_data['Year'] = pd.to_datetime(movie_data['Year'].str.strip('()'), errors='coerce')

In [17]:
duplicate_rows = movie_data[movie_data.duplicated()]
duplicate_rows

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1250,Arab Ka Sona,1979-01-01,,Action,,,Master Bhagwan,Meena Rai,Dara Singh,
1769,Balidan,1992-01-01,,Drama,,,,,,
4723,First Time,2009-01-01,,,,,Raja Bundela,Zeenat Aman,Nitin Arora,Raj Babbar
8819,Mangal Ho,NaT,,Comedy,,,Pritish Chakraborty,Pritish Chakraborty,,
9713,Musafir,NaT,,Thriller,,,Shiva Dagar,,,
13069,Shivani,2019-01-01,,Crime,,,Ugresh Prasad Ujala,Santosh,,
13308,Slumdog Karodpati,2019-01-01,118.0,Thriller,,,Rajesh Patole,Udhav Garje,Rahul Gavane,Govindrao


In [18]:
# Handle missing values
movie_data['Votes'].fillna(movie_data['Votes'].mean(), inplace=True)
movie_data['Duration'].fillna(movie_data['Duration'].median(), inplace=True)


In [19]:
# Calculate mean ratings for each category
genre_mean_rating = movie_data.groupby('Genre')['Rating'].transform('mean')
director_mean_rating = movie_data.groupby('Director')['Rating'].transform('mean')
actor1_mean_rating = movie_data.groupby('Actor 1')['Rating'].transform('mean')
actor2_mean_rating = movie_data.groupby('Actor 2')['Rating'].transform('mean')
actor3_mean_rating = movie_data.groupby('Actor 3')['Rating'].transform('mean')

# Encode mean ratings into the original DataFrame with some differences
movie_data['Genre_mean_rating'] = genre_mean_rating
movie_data['Director_mean_rating'] = director_mean_rating
movie_data['Actor1_mean_rating'] = actor1_mean_rating
movie_data['Actor2_mean_rating'] = actor2_mean_rating
movie_data['Actor3_mean_rating'] = actor3_mean_rating

# Fill NaN values in new columns with the overall mean rating or any other suitable value
mean_rating_fill_value = movie_data['Rating'].mean()
movie_data['Genre_mean_rating'].fillna(mean_rating_fill_value, inplace=True)
movie_data['Director_mean_rating'].fillna(mean_rating_fill_value, inplace=True)
movie_data['Actor1_mean_rating'].fillna(mean_rating_fill_value, inplace=True)
movie_data['Actor2_mean_rating'].fillna(mean_rating_fill_value, inplace=True)
movie_data['Actor3_mean_rating'].fillna(mean_rating_fill_value, inplace=True)


In [20]:
movie_data = movie_data.drop(['Name','Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], axis = 1)

In [21]:
movie_data.dropna(subset=['Rating'], inplace=True)

In [22]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7919 entries, 1 to 15508
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Year                  7919 non-null   datetime64[ns]
 1   Duration              7919 non-null   float64       
 2   Rating                7919 non-null   float64       
 3   Votes                 7919 non-null   float64       
 4   Genre_mean_rating     7919 non-null   float64       
 5   Director_mean_rating  7919 non-null   float64       
 6   Actor1_mean_rating    7919 non-null   float64       
 7   Actor2_mean_rating    7919 non-null   float64       
 8   Actor3_mean_rating    7919 non-null   float64       
dtypes: datetime64[ns](1), float64(8)
memory usage: 618.7 KB


In [23]:
# Assuming your DataFrame is named 'data'
X = movie_data.drop(['Rating', 'Year'], axis=1)  # Remove 'Year' as it is not numeric
y = movie_data['Rating']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
# Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)



# Predictions
rf_preds = rf_model.predict(X_test)


In [25]:
# Evaluate Random Forest model
rf_mse = mean_squared_error(y_test, rf_preds)
rf_mae = mean_absolute_error(y_test, rf_preds)
rf_r2 = r2_score(y_test, rf_preds)

In [26]:
# Display results
print("Random Forest Metrics:")
print("MSE:", rf_mse)
print("MAE:", rf_mae)
print("R2:", rf_r2)


Random Forest Metrics:
MSE: 0.4071709071101641
MAE: 0.4262225273569024
R2: 0.7809902528807179


In [29]:
fig = px.scatter(x=y_test, y=rf_preds, labels={'x': 'Actual Rates', 'y': 'Predicted Rates'},
                 title='Actual vs Predicted Rates')
fig.show()

In [30]:
# Initialize the linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

In [31]:
y_pred = model.predict(X_test)

In [32]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.457342811364929
R^2 Score: 0.7540037077434054


In [33]:
fig = px.scatter(x=y_test, y=y_pred, labels={'x': 'Actual Rates', 'y': 'Predicted Rates'},
                 title='Actual vs Predicted Rates')
fig.show()
