# Task 2: Movie Rating Prediction with Python

# Import necessary libraries

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Loading the dataset

In [2]:
# Read the movie listing from the CSV next to the notebook, decoding it as Latin-1.
movie_data = pd.read_csv("IMDb Movies India.csv", encoding="latin-1")

In [3]:
# Preview the first five rows to check how the file was parsed
movie_data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),-2019.0,109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,-2021.0,90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,-2019.0,110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,-2010.0,105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [4]:
# Dimensions of the frame as (rows, columns)
movie_data.shape

(15509, 10)

In [5]:
# Per-column non-null counts and dtypes
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15508 non-null  object 
 1   Year      14981 non-null  float64
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(2), object(8)
memory usage: 1.2+ MB


In [7]:
# Summary statistics; with default arguments only the numeric columns are summarised
movie_data.describe()

Unnamed: 0,Year,Rating
count,14981.0,7919.0
mean,-1987.012215,5.841621
std,25.416689,1.381777
min,-2022.0,1.1
25%,-2009.0,4.9
50%,-1991.0,6.0
75%,-1968.0,6.8
max,-1913.0,10.0


In [8]:
# Number of missing values in each column
movie_data.isnull().sum()

Name           1
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [9]:
# Discard every row that has a missing value in any column.
# This is done in place, so movie_data itself shrinks.
movie_data.dropna(axis=0, how="any", inplace=True)

In [10]:
# List the column labels
movie_data.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

In [11]:
# Count rows that are exact duplicates of an earlier row
movie_data.duplicated().sum()

0

In [12]:
# Keep only the first run of digits from each 'Duration' value (e.g. "109 min" -> 109).
# astype(str) lets the cell be re-run after the column already holds integers.
# expand=False makes str.extract return a Series rather than a one-column DataFrame,
# so an ordinary column is assigned back.
movie_data['Duration'] = (
    movie_data['Duration']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

In [13]:
# Remove commas from the listed columns and convert them to floats.
numeric_columns_with_commas = ['Votes']  # Add other columns if necessary
for col in numeric_columns_with_commas:
    if col in movie_data.columns:
        # astype(str) first: the .str accessor is only available on string data, so
        # without it re-running this cell once the column is float would raise.
        # regex=False makes ',' a literal match regardless of the pandas default.
        movie_data[col] = (
            movie_data[col].astype(str).str.replace(',', '', regex=False).astype(float)
        )

In [14]:
# Columns intended for categorical encoding; no conversion happens in this cell.
categorical_columns = [
    'Genre',
    'Director',
    'Actor 1',
    'Actor 2',
    'Actor 3',
]
# Snapshot of the column labels the frame has at this point.
actual_columns = movie_data.columns

In [15]:
# Show the column labels captured in actual_columns
print("Actual columns in the dataset:", actual_columns)

Actual columns in the dataset: Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')


In [16]:
# Collect the requested categorical columns that are absent from the frame.
missing_columns = [name for name in categorical_columns if name not in actual_columns]
if missing_columns:
    print("Missing columns:", missing_columns)
    # Narrow the list to the names that are present, preserving their order.
    categorical_columns = [name for name in categorical_columns if name not in missing_columns]

In [17]:
# One-hot encode the categorical columns; each listed column is replaced by indicator columns.
# NOTE(review): movie_data is overwritten, so the listed columns no longer exist afterwards —
# re-running this cell without reloading the data will presumably fail; confirm before relying on it.
movie_data = pd.get_dummies(data=movie_data, columns=categorical_columns)

In [18]:
# Target: the 'Rating' column.
y = movie_data['Rating']
# Features: everything else except the target itself and the free-text 'Name' column.
X = movie_data.drop(columns=['Rating', 'Name'])

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the Linear Regression model

In [20]:
# Fit a linear regression with default settings on the training split.
lr_regressor = LinearRegression()
lr_regressor.fit(X=X_train, y=y_train)

In [21]:
# Predict ratings for the held-out rows.
y_pred = lr_regressor.predict(X=X_test)

# Evaluation of the model

In [22]:
# Score the predictions against the held-out ratings with mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 9.16228896472854


In [23]:
# Put actual and predicted ratings side by side, keeping the held-out rows' index labels.
results = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
results.head()

Unnamed: 0,Actual,Predicted
10971,6.0,4.62374
14052,2.4,4.749843
10002,3.8,1.72925
3970,3.8,6.24525
8840,7.2,5.338209


# Result

In [24]:
# Write the comparison table to disk; index=False leaves the row labels out of the file.
results.to_csv(path_or_buf="submission.csv", index=False)