<a href="https://colab.research.google.com/github/Debayan2004/CODSOFT/blob/main/MovieRatingPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the 'adrianmcmahon/imdb-india-movies' dataset archive with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured
# in this runtime beforehand — confirm before a fresh Run All.
!kaggle datasets download -d adrianmcmahon/imdb-india-movies

Dataset URL: https://www.kaggle.com/datasets/adrianmcmahon/imdb-india-movies
License(s): CC0-1.0
Downloading imdb-india-movies.zip to /content
  0% 0.00/494k [00:00<?, ?B/s]
100% 494k/494k [00:00<00:00, 71.6MB/s]


In [None]:
import zipfile
import os

# Location of the IMDb India movies archive to unpack.
zip_file_path = '/content/imdb-india-movies.zip'

# Directory that receives the unpacked dataset files.
extract_to_path = '/content/MovieRating'

# Create the target directory up front; exist_ok keeps re-runs from failing.
os.makedirs(extract_to_path, exist_ok=True)

# Unpack every member of the archive into the target directory.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_to_path)

# Report what landed on disk as a quick sanity check.
extracted_files = os.listdir(extract_to_path)
print(f"Files extracted to {extract_to_path}: {extracted_files}")


Files extracted to /content/MovieRating: ['IMDb Movies India.csv']


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [None]:
# Load the dataset: try UTF-8 first, then fall back to latin-1.
csv_path = '/content/MovieRating/IMDb Movies India.csv'
try:
    df = pd.read_csv(csv_path, encoding='utf-8')
except UnicodeDecodeError:
    # latin-1 maps every possible byte to a character, so this read cannot
    # raise UnicodeDecodeError. The former third attempt with 'ISO-8859-1'
    # (an alias of the same codec) was therefore unreachable and was removed.
    df = pd.read_csv(csv_path, encoding='latin-1')

# Show the first rows as a quick look at the loaded columns.
print("Dataset:")
print(df.head())

Dataset:
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoo

In [None]:
# Handle missing values.
# Parse the target before blanking the frame so it stays numeric; values that
# cannot be parsed become NaN.
rating_numeric = pd.to_numeric(df['Rating'], errors='coerce')

# Replace every remaining missing cell with an empty string. Rebinding (rather
# than inplace=True) keeps this cell safe to re-run.
df = df.fillna('')

# Fill missing ratings with the median of the observed ratings and assign the
# result back to the column. The previous `df['Rating'].fillna(..., inplace=True)`
# was a chained in-place call on a column selection, which pandas deprecates
# and which does not update `df` under Copy-on-Write.
# NOTE(review): imputing the *target* with its median makes the model train and
# evaluate on synthetic labels — consider dropping rows without a rating instead.
df['Rating'] = rating_numeric.fillna(rating_numeric.median())

In [None]:
# Name the prediction target and the input columns, then slice both out of df.
target = 'Rating'
features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
X, y = df[features], df[target]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Columns routed through the categorical encoder.
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

# Per-column treatment: replace missing entries with the literal 'missing',
# then one-hot encode (handle_unknown='ignore' tolerates unseen categories).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply the categorical transformer to the columns listed above.
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)],
)

# End-to-end estimator: preprocessing followed by a gradient-boosted regressor.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])


In [None]:
# Fit the preprocessing + regression pipeline on the training split.
model.fit(X=X_train, y=y_train)

# Persist the fitted pipeline to disk; as the last expression, the list of
# written filenames is shown as the cell output.
joblib.dump(value=model, filename='movie_rating_prediction_model.pkl')

['movie_rating_prediction_model.pkl']

In [None]:
# Predict ratings for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions with each regression metric, in a fixed order.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Report one metric per line.
print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"R^2 Score: {r2}",
    sep="\n",
)














Mean Absolute Error: 0.5812499219174528
Mean Squared Error: 0.8959524638400524
R^2 Score: 0.07862661133861626


In [None]:
# Function to categorize rating
def categorize_rating(rating: float) -> str:
    """Map a numeric rating onto a coarse quality label.

    Args:
        rating: Numeric rating to classify.

    Returns:
        'low' when rating < 4, 'medium' when 4 <= rating < 7, otherwise 'high'.

    Raises:
        ValueError: If rating is NaN. Every comparison against NaN is False,
            so it would otherwise fall through and be labelled 'high'.
    """
    # NaN is the only value that is not equal to itself.
    if rating != rating:
        raise ValueError("rating must not be NaN")
    if rating < 4:
        return 'low'
    if rating < 7:
        return 'medium'
    return 'high'

In [None]:
# Load the saved model from the same relative path that joblib.dump wrote it to,
# so saving and loading agree regardless of the current working directory.
# joblib files are pickle-based: only load files produced by a trusted source.
model = joblib.load('movie_rating_prediction_model.pkl')

In [None]:
# Build a one-row input for a manual prediction: each feature column maps to a
# single-element list holding its value.
input_data = {
    column: [value]
    for column, value in (
        ('Genre', 'Action'),
        ('Director', 'James Cameron'),
        ('Actor 1', 'Arnold Schwarzenegger'),
        ('Actor 2', 'Linda Hamilton'),
        ('Actor 3', 'Michael Biehn'),
    )
}

In [None]:
# Wrap the input dictionary in a DataFrame for model.predict.
input_df = pd.DataFrame(data=input_data)


In [None]:
# Run the pipeline on the prepared input frame.
predicted_rating = model.predict(input_df)

# Translate the first numeric prediction into its low/medium/high label.
predicted_rating_category = categorize_rating(predicted_rating[0])

# Report both the raw score and its label, one per line.
print(
    f"Predicted Rating (Numeric): {predicted_rating[0]}",
    f"Predicted Rating Category: {predicted_rating_category}",
    sep="\n",
)

Predicted Rating (Numeric): 5.8365363100365615
Predicted Rating Category: medium
