In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
data = '/content/drive/MyDrive/IMDb Movies India.csv'

In [5]:
dataset = pd.read_csv(data, encoding='latin-1')

In [6]:
print(dataset)

                                     Name    Year Duration            Genre  \
0                                             NaN      NaN            Drama   
1      #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                             #Homecoming  (2021)   90 min   Drama, Musical   
3                                 #Yaaram  (2019)  110 min  Comedy, Romance   
4                       ...And Once Again  (2010)  105 min            Drama   
...                                   ...     ...      ...              ...   
15504                 Zulm Ko Jala Doonga  (1988)      NaN           Action   
15505                               Zulmi  (1999)  129 min    Action, Drama   
15506                           Zulmi Raj  (2005)      NaN           Action   
15507                       Zulmi Shikari  (1988)      NaN           Action   
15508                        Zulm-O-Sitam  (1998)  130 min    Action, Drama   

       Rating Votes            Director           A

In [7]:
dataset.shape

(15509, 10)

In [8]:
dataset.isnull()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,False,True,True,False,True,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,True,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
15504,False,False,True,False,False,False,False,False,False,False
15505,False,False,False,False,False,False,False,False,False,False
15506,False,False,True,False,True,True,False,False,True,True
15507,False,False,True,False,True,True,True,True,True,True


In [9]:
dataset.isnull().sum()

Unnamed: 0,0
Name,0
Year,528
Duration,8269
Genre,1877
Rating,7590
Votes,7589
Director,525
Actor 1,1617
Actor 2,2384
Actor 3,3144


In [10]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [11]:
dataset.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
15504,False
15505,False
15506,False
15507,False


In [12]:
dataset.duplicated().sum()

6

In [13]:
dataset.dropna(inplace= True)

In [14]:
dataset.shape

(5659, 10)

In [15]:
dataset.isnull()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
15493,False,False,False,False,False,False,False,False,False,False
15494,False,False,False,False,False,False,False,False,False,False
15503,False,False,False,False,False,False,False,False,False,False
15505,False,False,False,False,False,False,False,False,False,False


In [16]:
dataset.isnull().sum()

Unnamed: 0,0
Name,0
Year,0
Duration,0
Genre,0
Rating,0
Votes,0
Director,0
Actor 1,0
Actor 2,0
Actor 3,0


In [17]:
dataset.drop_duplicates(inplace=True)

In [18]:
dataset.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

## **Data** **Preprocessing**

In [19]:
dataset['Year'] = dataset['Year'].str.replace(r'[()]','',regex=True).astype(int)

In [20]:
dataset['Duration'] = dataset['Duration'].str.replace(r'[min]','',regex=True).astype(int)


In [21]:
# Convert 'Genre' column to string type before applying string methods
dataset['Genre'] = dataset['Genre'].astype(str)

# Now you can split the 'Genre' column
dataset['Genre'] = dataset['Genre'].str.split(',')
dataset = dataset.explode('Genre')

# Check if the 'Genre' Series is empty after explode
if not dataset['Genre'].empty:
    dataset['Genre'].fillna(dataset['Genre'].mode()[0], inplace= True)
else:
    # Handle the case where 'Genre' Series is empty, maybe fill with a default value
    dataset['Genre'].fillna('Unknown', inplace=True)

In [22]:
# Remove commas from 'Votes' column and convert to integer
dataset['Votes'] = dataset['Votes'].str.replace(',', '').astype(int)

In [23]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11979 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      11979 non-null  object 
 1   Year      11979 non-null  int64  
 2   Duration  11979 non-null  int64  
 3   Genre     11979 non-null  object 
 4   Rating    11979 non-null  float64
 5   Votes     11979 non-null  int64  
 6   Director  11979 non-null  object 
 7   Actor 1   11979 non-null  object 
 8   Actor 2   11979 non-null  object 
 9   Actor 3   11979 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 1.0+ MB


# **Data Visualization**

In [24]:
import matplotlib.pyplot as plt
import plotly.express as px

In [25]:
year = px.histogram(dataset, x='Year', histnorm='probability density', nbins = 30, title='distribution of movies by year')
year.show()

In [26]:
avg_rating_by_year = dataset.groupby(['Year','Genre'])['Rating'].mean().reset_index()
top_avg = dataset['Genre'].value_counts().head(10).index
avg_rating_by_year = avg_rating_by_year[avg_rating_by_year['Genre'].isin(top_avg)]
fig = px.line(avg_rating_by_year, x='Year', y='Rating', color='Genre') # create the figure and assign it to the variable fig
fig.update_layout(title='Average Rating by Year for Top Genre', xaxis_title='Year', yaxis_title='Average Rating')
fig.show()


In [27]:
year = px.histogram(dataset, x='Rating', histnorm='probability density', nbins = 40, title='distribution of movies by year')
year.show()

In [28]:
year = px.histogram(dataset, x='Votes', histnorm='probability density', nbins = 40, title='distribution of movies by year')
year.show()

## **Feature Engineering**

In [29]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score

In [30]:
dataset.drop('Name', axis = 1, inplace = True)

In [31]:
genre_mean_rating = dataset.groupby('Genre')['Rating'].transform('mean')
dataset['Genre_encoded'] = genre_mean_rating

In [32]:

director_mean_rating = dataset.groupby('Director')['Rating'].transform('mean')
dataset['Director_encoded'] = director_mean_rating

In [33]:

actor1_mean_rating = dataset.groupby('Actor 1')['Rating'].transform('mean')
dataset['Actor1_encoded'] = actor1_mean_rating

In [34]:

actor2_mean_rating = dataset.groupby('Actor 2')['Rating'].transform('mean')
dataset['Actor2_encoded'] = actor2_mean_rating

In [35]:
actor3_mean_rating = dataset.groupby('Actor 3')['Rating'].transform('mean')
dataset['Actor3_encoded'] = actor3_mean_rating


In [36]:
#keeping predictor and target value
x = dataset.drop(['Rating'], axis=1)
y = dataset['Rating']

In [37]:
from sklearn.preprocessing import OneHotEncoder

In [38]:
# Identify categorical columns (replace with actual categorical column names)
categorical_cols = ['Actor 1', 'Actor 2', 'Actor 3', 'Genre', 'Director']

# Apply one-hot encoding to categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_features = encoder.fit_transform(x[categorical_cols])

# Create a new DataFrame with encoded features, and use the original
# dataframe's index to avoid duplicate indices.
encoded_df = pd.DataFrame(encoded_features, index=x.index)

# Concatenate DataFrames
x = pd.concat([x.drop(categorical_cols, axis=1), encoded_df], axis=1)

In [39]:
#spitting data set
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## **Model** **Building**

In [40]:
# Convert all column names to strings
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

Model = LinearRegression()
Model.fit(X_train,y_train)
model_pred = Model.predict(X_test)

In [41]:
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, model_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, model_pred, squared=False)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, model_pred)

# Calculate R-squared (R2)
r2 = r2_score(y_test, model_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 81293057024.06339
Root Mean Squared Error (RMSE): 285119.37328786234
Mean Absolute Error (MAE): 19580.20737274373
R-squared (R2): -42942998796.10678


## **model testing**

In [42]:
x.head(5)

Unnamed: 0,Year,Duration,Votes,Genre_encoded,Director_encoded,Actor1_encoded,Actor2_encoded,Actor3_encoded,0,1,...,9298,9299,9300,9301,9302,9303,9304,9305,9306,9307
1,2019,109,8,6.248697,7.0,6.85,7.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019,110,35,5.838423,4.4,5.25,4.4,4.46,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019,110,35,5.838739,4.4,5.25,4.4,4.46,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1997,147,827,5.838423,5.335135,4.793617,5.73,5.93,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1997,147,827,5.875793,5.335135,4.793617,5.73,5.93,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
y.head(5)

Unnamed: 0,Rating
1,7.0
3,4.4
3,4.4
5,4.7
5,4.7


In [44]:
import joblib


In [46]:
joblib.dump(Model, 'model.pkl')

['model.pkl']