In [1]:
import pandas as pd
import numpy as np

Reading Dataframe


In [2]:
import chardet

# Detect the file encoding
with open('movies.csv', 'rb') as file:
    raw_data = file.read()
    result = chardet.detect(raw_data)
    encoding = result['encoding']
    print(f"Detected encoding: {encoding}")

# Read the file content with the detected encoding
df = pd.read_csv('movies.csv', encoding=encoding)

Detected encoding: ISO-8859-1


In [3]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,-2021.0,90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,-2010.0,105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,-1988.0,,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,-1999.0,129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,-2005.0,,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,-1988.0,,Action,,,,,,


In [4]:
df.shape

(15509, 10)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  float64
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(2), object(8)
memory usage: 1.2+ MB


Handling DataTypes


In [6]:
df['Year']= abs(df['Year'])

In [7]:
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

In [8]:
df['Votes'] =df['Votes'].str.replace(',','')

In [9]:
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

In [10]:
df.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7590
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [11]:
df['Duration'] = df['Duration'].str.replace(' min','')

In [12]:
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')

In [13]:
df['Duration'].fillna(df['Duration'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Duration'].fillna(df['Duration'].median(), inplace=True)


In [14]:
df['Votes'] = df['Votes'].fillna(df['Votes'].median())

In [15]:
df['Rating'].fillna(df['Rating'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rating'].fillna(df['Rating'].median(), inplace=True)


Handling Null Values

In [16]:
df.dropna(inplace=True)

In [17]:
df.isna().sum()

Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

In [18]:
df.shape

(11086, 10)

In [19]:
df.drop(columns=['Name', "Actor 3"],inplace=True)

In [20]:
df['Year'] = df['Year'].astype(int)
df['Duration'] = df['Duration'].astype(int)
df['Votes'] = df['Votes'].astype(int)
df['Rating'] = df['Rating'].astype(float)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11086 entries, 1 to 15508
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Year      11086 non-null  int32  
 1   Duration  11086 non-null  int32  
 2   Genre     11086 non-null  object 
 3   Rating    11086 non-null  float64
 4   Votes     11086 non-null  int32  
 5   Director  11086 non-null  object 
 6   Actor 1   11086 non-null  object 
 7   Actor 2   11086 non-null  object 
dtypes: float64(1), int32(3), object(4)
memory usage: 649.6+ KB


Splitting Genre colums using ordinal encoding

In [22]:
def split_genre_column(MovieData,Genre):

    MovieData['Genre1'] = MovieData[Genre].str.split(',', expand=True)[0]
    MovieData['Genre2'] = MovieData[Genre].str.split(',', expand=True)[1]
    MovieData['Genre3'] = MovieData[Genre].str.split(',', expand=True)[2]
    return MovieData

split_genre_column(df,'Genre')

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Genre1,Genre2,Genre3
1,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Drama,,
2,2021,90,"Drama, Musical",6.0,55,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Drama,Musical,
3,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Comedy,Romance,
4,2010,105,Drama,6.0,55,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Drama,,
5,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Comedy,Drama,Musical
...,...,...,...,...,...,...,...,...,...,...,...
15502,1979,131,Action,6.0,55,Dinesh-Ramanesh,Ramesh Puri,Jalal Agha,Action,,
15503,1989,125,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Action,Crime,Drama
15504,1988,131,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Action,,
15505,1999,129,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Action,Drama,


In [23]:
G=['Genre1','Genre2','Genre3']
for x in G:
    df[x],_ = pd.factorize(df[x])
    
MovieData = df.drop(columns=['Genre'])
df.head(3)

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Genre1,Genre2,Genre3
1,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,0,-1,-1
2,2021,90,"Drama, Musical",6.0,55,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,0,0,-1
3,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,1,1,-1


In [24]:
df.isna().sum()

Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Genre1      0
Genre2      0
Genre3      0
dtype: int64

In [25]:
df.fillna(0,inplace=True)

In [26]:
df.head()

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Genre1,Genre2,Genre3
1,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,0,-1,-1
2,2021,90,"Drama, Musical",6.0,55,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,0,0,-1
3,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,1,1,-1
4,2010,105,Drama,6.0,55,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,0,-1,-1
5,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,1,2,0


In [27]:
df.to_csv('movies_cleaned.csv', index=False)

Preparing columns to add in the ML model 

In [28]:
newdf = pd.read_csv('movies_cleaned.csv')

In [29]:
newdf = pd.DataFrame(newdf)

In [30]:
newdf.head()

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Genre1,Genre2,Genre3
0,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,0,-1,-1
1,2021,90,"Drama, Musical",6.0,55,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,0,0,-1
2,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,1,1,-1
3,2010,105,Drama,6.0,55,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,0,-1,-1
4,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,1,2,0


In [31]:
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11086 entries, 0 to 11085
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Year      11086 non-null  int64  
 1   Duration  11086 non-null  int64  
 2   Genre     11086 non-null  object 
 3   Rating    11086 non-null  float64
 4   Votes     11086 non-null  int64  
 5   Director  11086 non-null  object 
 6   Actor 1   11086 non-null  object 
 7   Actor 2   11086 non-null  object 
 8   Genre1    11086 non-null  int64  
 9   Genre2    11086 non-null  int64  
 10  Genre3    11086 non-null  int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 952.8+ KB


In [32]:
newdf['MovieAge'] = 2024 - newdf['Year']
newdf.head()

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Genre1,Genre2,Genre3,MovieAge
0,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,0,-1,-1,5
1,2021,90,"Drama, Musical",6.0,55,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,0,0,-1,3
2,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,1,1,-1,5
3,2010,105,Drama,6.0,55,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,0,-1,-1,14
4,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,1,2,0,27


In [33]:
DirectorCounts = newdf['Director'].value_counts()
newdf['DirectorCount'] = newdf['Director'].map(DirectorCounts)
Actorcounts = newdf['Actor 1'].value_counts()
newdf['Actor1Count'] = newdf['Actor 1'].map(Actorcounts)
Actor2counts = newdf['Actor 2'].value_counts()
newdf['Actor2Count'] = newdf['Actor 2'].map(Actor2counts)

In [34]:
newdf.head()

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Genre1,Genre2,Genre3,MovieAge,DirectorCount,Actor1Count,Actor2Count
0,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,0,-1,-1,5,1,2,1
1,2021,90,"Drama, Musical",6.0,55,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,0,0,-1,3,1,2,3
2,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,1,1,-1,5,1,5,1
3,2010,105,Drama,6.0,55,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,0,-1,-1,14,7,6,9
4,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,1,2,0,27,17,18,15


In [35]:
newdf['LoggedVotes'] = np.log(newdf['Votes'])
newdf.head()

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Genre1,Genre2,Genre3,MovieAge,DirectorCount,Actor1Count,Actor2Count,LoggedVotes
0,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,0,-1,-1,5,1,2,1,2.079442
1,2021,90,"Drama, Musical",6.0,55,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,0,0,-1,3,1,2,3,4.007333
2,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,1,1,-1,5,1,5,1,3.555348
3,2010,105,Drama,6.0,55,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,0,-1,-1,14,7,6,9,4.007333
4,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,1,2,0,27,17,18,15,6.717805


In [36]:
DirectorRating = newdf.groupby('Director')['Rating'].mean()
newdf['DirectorRating'] = newdf['Director'].map(DirectorRating)

Actor1Rating = newdf.groupby('Actor 1')['Rating'].mean()
newdf['Actor1Rating'] = newdf['Actor 1'].map(Actor1Rating)

Actor2Rating = newdf.groupby('Actor 2')['Rating'].mean()
newdf['Actor2Rating'] = newdf['Actor 2'].map(Actor2Rating)


In [37]:
newdf.head()

Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Genre1,Genre2,Genre3,MovieAge,DirectorCount,Actor1Count,Actor2Count,LoggedVotes,DirectorRating,Actor1Rating,Actor2Rating
0,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,0,-1,-1,5,1,2,1,2.079442,7.0,6.85,7.0
1,2021,90,"Drama, Musical",6.0,55,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,0,0,-1,3,1,2,3,4.007333,6.0,6.5,6.9
2,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,1,1,-1,5,1,5,1,3.555348,4.4,5.42,4.4
3,2010,105,Drama,6.0,55,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,0,-1,-1,14,7,6,9,4.007333,6.285714,6.833333,5.8
4,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,1,2,0,27,17,18,15,6.717805,5.358824,4.788889,5.786667


In [38]:
newdf["Genre1Encoded"] = newdf.groupby('Genre1')['Rating'].transform('mean').round(1)
newdf['Genre2Encoded'] = newdf.groupby('Genre2')['Rating'].transform('mean').round(1)
newdf['Genre3Encoded'] = newdf.groupby('Genre3')['Rating'].transform('mean').round(1)
newdf['VotesEncoded'] = newdf.groupby('Votes')['Rating'].transform('mean').round(1)
newdf['DirectorEncoded'] = newdf.groupby('Director')['Rating'].transform('mean').round(1)
newdf['Actor1Encoded'] = newdf.groupby('Actor 1')['Rating'].transform('mean').round(1)
newdf['Actor2Encoded'] = newdf.groupby('Actor 2')['Rating'].transform('mean').round(1)
newdf.head()


Unnamed: 0,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Genre1,Genre2,...,DirectorRating,Actor1Rating,Actor2Rating,Genre1Encoded,Genre2Encoded,Genre3Encoded,VotesEncoded,DirectorEncoded,Actor1Encoded,Actor2Encoded
0,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,0,-1,...,7.0,6.85,7.0,6.1,5.9,5.9,5.5,7.0,6.8,7.0
1,2021,90,"Drama, Musical",6.0,55,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,0,0,...,6.0,6.5,6.9,6.1,6.3,5.9,6.0,6.0,6.5,6.9
2,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,1,1,...,4.4,5.42,4.4,5.8,5.8,5.9,5.8,4.4,5.4,4.4
3,2010,105,Drama,6.0,55,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,0,-1,...,6.285714,6.833333,5.8,6.1,5.9,5.9,6.0,6.3,6.8,5.8
4,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,1,2,...,5.358824,4.788889,5.786667,5.8,5.9,6.0,4.8,5.4,4.8,5.8


In [39]:
newdf.columns

Index(['Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1',
       'Actor 2', 'Genre1', 'Genre2', 'Genre3', 'MovieAge', 'DirectorCount',
       'Actor1Count', 'Actor2Count', 'LoggedVotes', 'DirectorRating',
       'Actor1Rating', 'Actor2Rating', 'Genre1Encoded', 'Genre2Encoded',
       'Genre3Encoded', 'VotesEncoded', 'DirectorEncoded', 'Actor1Encoded',
       'Actor2Encoded'],
      dtype='object')

In [40]:
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11086 entries, 0 to 11085
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             11086 non-null  int64  
 1   Duration         11086 non-null  int64  
 2   Genre            11086 non-null  object 
 3   Rating           11086 non-null  float64
 4   Votes            11086 non-null  int64  
 5   Director         11086 non-null  object 
 6   Actor 1          11086 non-null  object 
 7   Actor 2          11086 non-null  object 
 8   Genre1           11086 non-null  int64  
 9   Genre2           11086 non-null  int64  
 10  Genre3           11086 non-null  int64  
 11  MovieAge         11086 non-null  int64  
 12  DirectorCount    11086 non-null  int64  
 13  Actor1Count      11086 non-null  int64  
 14  Actor2Count      11086 non-null  int64  
 15  LoggedVotes      11086 non-null  float64
 16  DirectorRating   11086 non-null  float64
 17  Actor1Rating

In [41]:
newdf.columns

Index(['Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1',
       'Actor 2', 'Genre1', 'Genre2', 'Genre3', 'MovieAge', 'DirectorCount',
       'Actor1Count', 'Actor2Count', 'LoggedVotes', 'DirectorRating',
       'Actor1Rating', 'Actor2Rating', 'Genre1Encoded', 'Genre2Encoded',
       'Genre3Encoded', 'VotesEncoded', 'DirectorEncoded', 'Actor1Encoded',
       'Actor2Encoded'],
      dtype='object')

Manupalating the Size of Dataframe

In [42]:
newdf["DirectorEncoded"] = newdf["DirectorEncoded"].astype('int32')
newdf["Actor1Encoded"] = newdf["Actor1Encoded"].astype('int32')
newdf["Actor2Encoded"] = newdf["Actor2Encoded"].astype('int32')
newdf["Genre1Encoded"] = newdf["Genre1Encoded"].astype('int32')
newdf["Genre2Encoded"] = newdf["Genre2Encoded"].astype('int32')
newdf["Genre3Encoded"] = newdf["Genre3Encoded"].astype('int32')
newdf["VotesEncoded"] = newdf["VotesEncoded"].astype('int32')
newdf['Actor1Count'] = newdf['Actor1Count'].astype('int32')
newdf['DirectorCount'] = newdf['DirectorCount'].astype('int32')
newdf['MovieAge'] = newdf['MovieAge'].astype('int32')
newdf['LoggedVotes'] = newdf['LoggedVotes'].astype('int32')


In [43]:
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11086 entries, 0 to 11085
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             11086 non-null  int64  
 1   Duration         11086 non-null  int64  
 2   Genre            11086 non-null  object 
 3   Rating           11086 non-null  float64
 4   Votes            11086 non-null  int64  
 5   Director         11086 non-null  object 
 6   Actor 1          11086 non-null  object 
 7   Actor 2          11086 non-null  object 
 8   Genre1           11086 non-null  int64  
 9   Genre2           11086 non-null  int64  
 10  Genre3           11086 non-null  int64  
 11  MovieAge         11086 non-null  int32  
 12  DirectorCount    11086 non-null  int32  
 13  Actor1Count      11086 non-null  int32  
 14  Actor2Count      11086 non-null  int64  
 15  LoggedVotes      11086 non-null  int32  
 16  DirectorRating   11086 non-null  float64
 17  Actor1Rating

In [44]:
newdf.drop(columns= ['Genre','Director','Actor 1','Actor 2'],inplace=True)

In [45]:
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11086 entries, 0 to 11085
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             11086 non-null  int64  
 1   Duration         11086 non-null  int64  
 2   Rating           11086 non-null  float64
 3   Votes            11086 non-null  int64  
 4   Genre1           11086 non-null  int64  
 5   Genre2           11086 non-null  int64  
 6   Genre3           11086 non-null  int64  
 7   MovieAge         11086 non-null  int32  
 8   DirectorCount    11086 non-null  int32  
 9   Actor1Count      11086 non-null  int32  
 10  Actor2Count      11086 non-null  int64  
 11  LoggedVotes      11086 non-null  int32  
 12  DirectorRating   11086 non-null  float64
 13  Actor1Rating     11086 non-null  float64
 14  Actor2Rating     11086 non-null  float64
 15  Genre1Encoded    11086 non-null  int32  
 16  Genre2Encoded    11086 non-null  int32  
 17  Genre3Encode

In [46]:
newdf.isnull().sum()

Year               0
Duration           0
Rating             0
Votes              0
Genre1             0
Genre2             0
Genre3             0
MovieAge           0
DirectorCount      0
Actor1Count        0
Actor2Count        0
LoggedVotes        0
DirectorRating     0
Actor1Rating       0
Actor2Rating       0
Genre1Encoded      0
Genre2Encoded      0
Genre3Encoded      0
VotesEncoded       0
DirectorEncoded    0
Actor1Encoded      0
Actor2Encoded      0
dtype: int64

Using Linear regression for the ML model

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

70% Train 30% test split

In [48]:
X=newdf.drop("Rating",axis=1)
Y=newdf["Rating"]

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.20,random_state=52)

model=LinearRegression()
model.fit(X_train,Y_train)

X_test_prediction= model.predict(X_test)

mse =mean_squared_error(Y_test,X_test_prediction)
print(f"Mean Squared Error (MSE): {mse:.2f}")

r2 = r2_score(Y_test,X_test_prediction)
print(f"R-squared score: {r2:.2f}")

Mean Squared Error (MSE): 0.32
R-squared score: 0.76
