<a href="https://colab.research.google.com/github/CHARISPRISCILLA/CODSOFT/blob/main/Movie_Rating_Prediction_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Load the IMDb India movies dataset, reading the CSV as Latin-1.
# The path is hard-coded to the /content directory; adjust it when the file
# lives somewhere else.
data = pd.read_csv('/content/IMDb Movies India.csv', encoding='latin1')

# Report how many values are missing in each column. The counts are printed
# explicitly because a bare expression in the middle of a cell is evaluated
# but never displayed.
print(data.isnull().sum())

# Drop every row that has at least one missing value, so the numeric
# conversions and the model below only see complete records.
data.dropna(inplace=True)

# Pull the first run of digits out of 'Year' and 'Duration' and store it as float.
# The pattern is a raw string so that \d reaches the regex engine untouched
# (in a plain string literal '\d' is an invalid escape sequence and triggers a
# warning on recent Python versions). expand=False makes extract() return a
# Series instead of a one-column DataFrame, which is what a column assignment
# expects.
data['Year'] = data['Year'].str.extract(r'(\d+)', expand=False).astype(float)
data['Duration'] = data['Duration'].str.extract(r'(\d+)', expand=False).astype(float)

# Remove commas from the 'Votes' column (literal match, not a regex) and
# convert to float
data['Votes'] = data['Votes'].str.replace(',', '', regex=False).astype(float)

# Label-encode the categorical columns: every distinct value in a column is
# replaced by an integer code. One encoder is fitted per column and kept in
# `label_encoders`, keyed by column name, so the fitted mapping stays available.
categorical_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

# Fit all encoders first (on the still-unencoded columns) ...
label_encoders = {name: LabelEncoder().fit(data[name]) for name in categorical_columns}

# ... then overwrite each column with its integer codes.
for col, le in label_encoders.items():
    data[col] = le.transform(data[col])

In [None]:
# Separate the predictor columns (X) from the target column (y, the rating),
# then hold out 20% of the rows as a test set.
feature_columns = ['Year', 'Duration', 'Genre', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
X, y = data[feature_columns], data['Rating']

# A fixed random_state makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the linear regression model and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Keep the fitted estimator as the cell's last expression so it is displayed.
model

In [None]:
# Predict ratings for the held-out test rows
y_pred = model.predict(X_test)

# Error metrics on the test set: MAE, MSE, and RMSE (square root of the MSE)
mae, mse = mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print one "label value" line per metric
for metric_label, metric_value in (
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
):
    print(metric_label, metric_value)


Mean Absolute Error: 1.0531107691729633
Mean Squared Error: 1.7279215702654251
Root Mean Squared Error: 1.3145043059136114


In [None]:
# Residual = actual rating minus predicted rating, one value per test row
residuals = y_test - y_pred

# Distribution plot of the residuals as a single blue group; update_layout()
# returns the figure, so the title is set in the same expression.
fig = ff.create_distplot(
    hist_data=[residuals],
    group_labels=['Residuals'],
    colors=['blue'],
).update_layout(title="Distribution of Residuals (Errors)")
fig.show()


In [None]:
# Example: predict the rating of a single, made-up movie.
# The model was trained on label-encoded categorical columns, so 'Genre',
# 'Director' and the actor fields are given as integer codes; the codes used
# here are arbitrary placeholder values.
new_movie_features = {
    'Year': 2023,
    'Duration': 120,
    'Genre': 1,
    'Votes': 1000,
    'Director': 2,
    'Actor 1': 3,
    'Actor 2': 4,
    'Actor 3': 5,
}

# A list holding one record becomes a one-row frame with the same column order
new_movie = pd.DataFrame([new_movie_features])
rating_prediction = model.predict(new_movie)
print("Predicted Rating:", rating_prediction[0])


Predicted Rating: 5.042667179058867


In [None]:
# Work on plain float arrays. With bare arrays Plotly Express names the inputs
# after their argument ('x', 'y', 'color'), which are exactly the keys that
# `labels` remaps below, and no pandas index or series name is involved.
actual_ratings = np.asarray(y_test, dtype=float)
predicted_ratings = np.asarray(y_pred, dtype=float)

# Scatter plot of actual vs. predicted ratings, coloured by the residual.
# The residual is actual - predicted, the same sign convention as the
# `residuals` used for the distribution plot, so a positive (under-predicted)
# value means the same thing in both figures.
fig = px.scatter(
    x=actual_ratings,
    y=predicted_ratings,
    color=actual_ratings - predicted_ratings,
    labels={'x': 'Actual Ratings', 'y': 'Predicted Ratings', 'color': 'Residuals'},
)
# Small, semi-transparent markers keep dense regions readable
fig.update_traces(marker=dict(size=5, opacity=0.5))
fig.update_layout(title="Actual vs. Predicted Ratings Scatter Plot")
fig.show()
