Importing the libraries

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

Import the dataset

In [39]:
import chardet



with open('/content/drive/MyDrive/CODESOFT Data Sets/IMDb Movies India.csv', 'rb') as f:
    raw_data = f.read()
    result = chardet.detect(raw_data) #detecting the encoding

print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.7299889142069536, 'language': ''}


Read the decoded file using pandas

In [12]:
import pandas as pd

file_path = '/content/drive/MyDrive/CODESOFT Data Sets/IMDb Movies India.csv'

# Read the CSV file using the detected encoding
df = pd.read_csv(file_path, encoding='ISO-8859-1')

# Display the DataFrame
df


Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


EDA and Preprocessing

In [13]:
df.shape

(15509, 10)

In [20]:
df.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [21]:
df.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [25]:
# Handle missing values
df = df.dropna(subset=['Rating'])  # Drop rows where the target variable is missing
df = df.fillna('Votes')
df.isna().sum() #checking for null values

Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

In [28]:
df.shape

(7919, 10)

In [42]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,...Aur Pyaar Ho Gaya,(1997),147 min,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,...Yahaan,(2005),142 min,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,?: A Question Mark,(2012),82 min,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia
...,...,...,...,...,...,...,...,...,...,...
15501,Zulm Ki Hukumat,(1992),Votes,"Action, Crime, Drama",5.3,135,Bharat Rangachary,Dharmendra,Moushumi Chatterjee,Govinda
15503,Zulm Ki Zanjeer,(1989),125 min,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth
15504,Zulm Ko Jala Doonga,(1988),Votes,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani


We can see from the above expression that the dataset has been cleaned and reducued in size

In [31]:
# Select relevant features for the model
X = df[['Genre', 'Director' , 'Votes']]
y = df['Rating']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
# Preprocess the data (OneHotEncoding for categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical,', OneHotEncoder(handle_unknown='ignore'), ['Genre', 'Director', 'Votes'])
    ]
)

OneHotEncoder(handle_unknown='ignore'): This transformer converts categorical variables into a form that could be provided to machine learning algorithms to do a better job in prediction. It creates new binary columns (one for each unique category in the original column) and handles unknown categories encountered during transformation by ignoring them.

In [34]:
preprocessor

In [35]:
#Create a pipeline with preprocessing and regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [36]:
pipeline

Pipeline: This allows sequentially applying a list of transforms and a final estimator. Intermediate steps must be transformers (like the preprocessor), and the final step is an estimator (like LinearRegression).

In [37]:
# Train the model
pipeline.fit(X_train, y_train)

pipeline.fit(X_train, y_train): This method fits the entire pipeline to the training data. It first applies the preprocessing steps (one-hot encoding in this case) and then fits the regressor (LinearRegression) to the transformed data.

In [38]:
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 2.1907810276694057
R^2 Score: -0.1783808481527447
