MOVIE RATING PREDICTION WITH PYTHON


Build a model that predicts the rating of a movie based on
features like genre, director, and actors. You can use regression
techniques to tackle this problem.
The goal is to analyze historical movie data and develop a model
that accurately estimates the rating given to a movie by users or
critics.
Movie Rating Prediction project enables you to explore data
analysis, preprocessing, feature engineering, and machine
learning modeling techniques. It provides insights into the factors
that influence movie ratings and allows you to build a model that
can estimate the ratings of movies accurately.

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer


In [10]:
import pandas as pd
import numpy as np  # Make sure NumPy is imported
import chardet

# Detect the encoding
with open(r"C:\Users\acer\Desktop\code soft\IMDb Movies India.csv", "rb") as f:
    rawdata = f.read()
result = chardet.detect(rawdata)
encoding = result['encoding']

# Read the file using the detected encoding
movie_data = pd.read_csv(r"C:\Users\acer\Desktop\code soft\IMDb Movies India.csv", encoding=encoding)

# Print the DataFrame
print(movie_data)

# Detect null values
print(movie_data.isnull().sum())

# Drop rows with null values
movie_data.dropna(inplace=True)

# Convert categorical columns to dummy variables
movie_data = pd.get_dummies(movie_data, columns=['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'])

# Ensure all data used for correlation is numeric
# Select only numeric columns
movie_data_numeric = movie_data.select_dtypes(include=[np.number])

# Debugging step: print out columns that are non-numeric
non_numeric_columns = movie_data.select_dtypes(exclude=[np.number]).columns
if not non_numeric_columns.empty:
    print("Non-numeric columns detected:", non_numeric_columns)
    # Print out unique values in these columns to understand the problem
    for col in non_numeric_columns:
        print(f"Unique values in {col}: {movie_data[col].unique()}")

# Compute the correlation matrix only on numeric columns
correlation_matrix = movie_data_numeric.corr()

# Print the correlation coefficients of the 'rating' column
if 'rating' in correlation_matrix.columns:
    print(correlation_matrix['rating'].sort_values(ascending=False))
else:
    print("The 'rating' column is not present in the numeric data.")
    


                                     Name    Year Duration            Genre  \
0                                             NaN      NaN            Drama   
1      #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                             #Homecoming  (2021)   90 min   Drama, Musical   
3                                 #Yaaram  (2019)  110 min  Comedy, Romance   
4                       ...And Once Again  (2010)  105 min            Drama   
...                                   ...     ...      ...              ...   
15504                 Zulm Ko Jala Doonga  (1988)      NaN           Action   
15505                               Zulmi  (1999)  129 min    Action, Drama   
15506                           Zulmi Raj  (2005)      NaN           Action   
15507                       Zulmi Shikari  (1988)      NaN           Action   
15508                        Zulm-O-Sitam  (1998)  130 min    Action, Drama   

       Rating Votes            Director           A

Splitting Data:

In [21]:
from sklearn.model_selection import train_test_split
X = movie_data.drop('Rating', axis=1)
y = movie_data['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X)
print(y)


                                     Name    Year Duration  Votes  \
1      #Gadhvi (He thought he was Gandhi)  (2019)  109 min      8   
3                                 #Yaaram  (2019)  110 min     35   
5                    ...Aur Pyaar Ho Gaya  (1997)  147 min    827   
6                               ...Yahaan  (2005)  142 min  1,086   
8                      ?: A Question Mark  (2012)   82 min    326   
...                                   ...     ...      ...    ...   
15493                              Zubaan  (2015)  115 min    408   
15494                            Zubeidaa  (2001)  153 min  1,496   
15503                     Zulm Ki Zanjeer  (1989)  125 min     44   
15505                               Zulmi  (1999)  129 min    655   
15508                        Zulm-O-Sitam  (1998)  130 min     20   

       Genre_Action  Genre_Action, Adventure  \
1             False                    False   
3             False                    False   
5             False        

Scaling Features:

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Sample dataset
movie_data = pd.DataFrame({
    'genre': ['Action', 'Comedy', 'Drama', 'Action', 'Comedy'],
    'director': ['John', 'Jane', 'Jack', 'John', 'Jane'],
    'actors': ['A', 'B', 'C', 'A', 'B'],
    'budget': [100, 150, 200, 120, 130],
    'rating': [7.5, 6.0, 8.0, 7.0, 6.5]
})

X = movie_data.drop('rating', axis=1)
y = movie_data['rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_cols = ['genre', 'director', 'actors']

# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform categorical columns
X_train_encoded = ohe.fit_transform(X_train[categorical_cols])
X_test_encoded = ohe.transform(X_test[categorical_cols])

# Convert to DataFrame and set column names
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=ohe.get_feature_names_out(categorical_cols))
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=ohe.get_feature_names_out(categorical_cols))

# Drop original categorical columns and concatenate encoded columns
X_train = X_train.drop(categorical_cols, axis=1)
X_test = X_test.drop(categorical_cols, axis=1)
X_train = pd.concat([X_train.reset_index(drop=True), X_train_encoded_df.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test_encoded_df.reset_index(drop=True)], axis=1)

# Print transformed features before scaling
print("X_train before scaling:\n", X_train)
print("X_test before scaling:\n", X_test)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Print scaled features
print("X_train after scaling:\n", X_train_scaled)
print("X_test after scaling:\n", X_test_scaled)


X_train before scaling:
    budget  genre_Action  genre_Comedy  genre_Drama  director_Jack  \
0     130           0.0           1.0          0.0            0.0   
1     200           0.0           0.0          1.0            1.0   
2     100           1.0           0.0          0.0            0.0   
3     120           1.0           0.0          0.0            0.0   

   director_Jane  director_John  actors_A  actors_B  actors_C  
0            1.0            0.0       0.0       1.0       0.0  
1            0.0            0.0       0.0       0.0       1.0  
2            0.0            1.0       1.0       0.0       0.0  
3            0.0            1.0       1.0       0.0       0.0  
X_test before scaling:
    budget  genre_Action  genre_Comedy  genre_Drama  director_Jack  \
0     150           0.0           1.0          0.0            0.0   

   director_Jane  director_John  actors_A  actors_B  actors_C  
0            1.0            0.0       0.0       1.0       0.0  
X_train after scal

Model Building


In [22]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)


Model Evaluation


In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R-squared: {r2_score(y_test, y_pred)}")


MAE: 3.552713678800501e-15
MSE: 1.262177448353619e-29
R-squared: nan




Model Improvement

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression

# Sample dataset
movie_data = pd.DataFrame({
    'Genre': ['Action', 'Comedy', 'Drama', 'Action', 'Comedy'],
    'Director': ['John', 'Jane', 'Jack', 'John', 'Jane'],
    'Actors': ['A', 'B', 'C', 'A', 'B'],
    'Budget': [100, 150, 200, 120, 130],
    'Rating': [7.5, 6.0, 8.0, 7.0, 6.5]
})

X = movie_data.drop('Rating', axis=1)
y = movie_data['Rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_cols = ['Genre', 'Director', 'Actors']

# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform categorical columns
X_train_encoded = ohe.fit_transform(X_train[categorical_cols])
X_test_encoded = ohe.transform(X_test[categorical_cols])

# Convert to DataFrame and set column names
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=ohe.get_feature_names_out(categorical_cols))
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=ohe.get_feature_names_out(categorical_cols))

# Drop original categorical columns and concatenate encoded columns
X_train = X_train.drop(categorical_cols, axis=1)
X_test = X_test.drop(categorical_cols, axis=1)
X_train = pd.concat([X_train.reset_index(drop=True), X_train_encoded_df.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test_encoded_df.reset_index(drop=True)], axis=1)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize LinearRegression
lr = LinearRegression()

# Define parameter grid
param_grid = {'fit_intercept': [True, False]}

# Initialize GridSearchCV with cv=2
grid_search = GridSearchCV(lr, param_grid, cv=2)

# Fit GridSearchCV
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Print the best model parameters
print("Best model parameters:", grid_search.best_params_)

# Evaluate the best model
print("Training score:", best_model.score(X_train_scaled, y_train))
print("Test score:", best_model.score(X_test_scaled, y_test))


Best model parameters: {'fit_intercept': True}
Training score: 1.0
Test score: nan




In [40]:
from sklearn.model_selection import cross_val_score

# Use cross_val_score with cv=2
scores = cross_val_score(best_model, X_train_scaled, y_train, cv=2)
print(f"Cross-Validation Scores: {scores}")
print(f"Average CV Score: {scores.mean()}")


Cross-Validation Scores: [-7.05555556 -0.38487564]
Average CV Score: -3.720215596195305


 Final Model and Prediction

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression

# Sample dataset
movie_data = pd.DataFrame({
    'Genre': ['Action', 'Comedy', 'Drama', 'Action', 'Comedy'],
    'Director': ['John', 'Jane', 'Jack', 'John', 'Jane'],
    'Actors': ['A', 'B', 'C', 'A', 'B'],
    'Budget': [100, 150, 200, 120, 130],
    'Rating': [7.5, 6.0, 8.0, 7.0, 6.5]
})

X = movie_data.drop('Rating', axis=1)
y = movie_data['Rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_cols = ['Genre', 'Director', 'Actors']

# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform categorical columns for training data
X_train_encoded = ohe.fit_transform(X_train[categorical_cols])
X_test_encoded = ohe.transform(X_test[categorical_cols])

# Convert to DataFrame and set column names
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=ohe.get_feature_names_out(categorical_cols))
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=ohe.get_feature_names_out(categorical_cols))

# Drop original categorical columns and concatenate encoded columns
X_train = X_train.drop(categorical_cols, axis=1)
X_test = X_test.drop(categorical_cols, axis=1)
X_train = pd.concat([X_train.reset_index(drop=True), X_train_encoded_df.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test_encoded_df.reset_index(drop=True)], axis=1)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize LinearRegression
lr = LinearRegression()

# Define parameter grid
param_grid = {'fit_intercept': [True, False]}

# Initialize GridSearchCV
grid_search = GridSearchCV(lr, param_grid, cv=2)

# Fit GridSearchCV
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Print the best model parameters
print("Best model parameters:", grid_search.best_params_)

# Evaluate the best model
print("Training score:", best_model.score(X_train_scaled, y_train))
print("Test score:", best_model.score(X_test_scaled, y_test))

# Preprocess the entire dataset
X_encoded = ohe.transform(X[categorical_cols])
X_encoded_df = pd.DataFrame(X_encoded, columns=ohe.get_feature_names_out(categorical_cols))
X_processed = X.drop(categorical_cols, axis=1)
X_processed = pd.concat([X_processed.reset_index(drop=True), X_encoded_df.reset_index(drop=True)], axis=1)
X_scaled = scaler.transform(X_processed)

# Fit the final model on the entire dataset
final_model = best_model.fit(X_scaled, y)

# Evaluate the final model
print("Final model score:", final_model.score(X_scaled, y))


Best model parameters: {'fit_intercept': True}
Training score: 1.0
Test score: nan
Final model score: 1.0




Make Predictions:

In [78]:
import pandas as pd

# Correct file path with raw string
file_path = r'C:\Users\acer\Desktop\code soft\IMDb Movies India.csv'








# Load new data with specified encoding
new_data = pd.read_csv(file_path, encoding='latin1')

# Ensure the same preprocessing steps are applied
categorical_cols = ['Genre', 'Director',"Actors"]

# Encode categorical features
new_data_encoded = ohe.transform(new_data[categorical_cols])

# Convert to DataFrame and set column names
new_data_encoded_df = pd.DataFrame(new_data_encoded, columns=ohe.get_feature_names_out(categorical_cols))

# Drop original categorical columns and concatenate encoded columns
new_data_processed = new_data.drop(categorical_cols, axis=1)
new_data_processed = pd.concat([new_data_processed.reset_index(drop=True), new_data_encoded_df.reset_index(drop=True)], axis=1)

# Scale the features
new_data_scaled = scaler.transform(new_data_processed)

# Predict using the final model
predictions = final_model.predict(new_data_scaled)

# Display predictions
print(predictions)






KeyError: "['Actors'] not in index"