In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Read the IMDb India movies CSV, decoding it as ISO-8859-1 (Latin-1)
file_path = '/content/IMDb Movies India.csv'
data = pd.read_csv(
    file_path,
    encoding='ISO-8859-1',
)

# Preview the leading rows to eyeball the columns and their raw formats
print(data.head())

# Report how many entries are missing in each column
print(data.isna().sum())

# Handle missing values
# Categorical predictors: replace gaps with an explicit 'Unknown' category.
# The result is assigned back to the frame instead of calling
# `data[col].fillna(..., inplace=True)`: an in-place call on a selected column
# is chained assignment, which warns in pandas 2.x and leaves `data` unchanged
# once Copy-on-Write is active.
categorical_columns = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
data[categorical_columns] = data[categorical_columns].fillna('Unknown')

# Target: drop rows that have no rating rather than imputing the mean.
# Filling the label with a constant fabricates training/evaluation targets and
# uses statistics from rows that later land in the test split, so the reported
# MSE / R-squared would not reflect real predictive performance.
data = data.dropna(subset=['Rating'])

# Re-check missing counts. Columns not treated above may still report gaps;
# only the feature columns and 'Rating' are expected to be complete here.
print(data.isnull().sum())

# Predictor columns and the column to be predicted
features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
target = 'Rating'

# (transformer name, source column) pairs; every column gets its own
# one-hot encoder that ignores categories unseen during fitting
encoder_specs = [
    ('genre', 'Genre'),
    ('director', 'Director'),
    ('actor1', 'Actor 1'),
    ('actor2', 'Actor 2'),
    ('actor3', 'Actor 3'),
]

# Column transformer: one-hot encode each listed column, drop everything else
preprocessor = ColumnTransformer(
    transformers=[
        (step_name, OneHotEncoder(handle_unknown='ignore'), [source_column])
        for step_name, source_column in encoder_specs
    ],
    remainder='drop'
)

# Full pipeline: encode -> scale -> support vector regression.
# with_mean=False makes the scaler divide by the standard deviation only,
# without centering the encoded features.
scaling_step = ('scaler', StandardScaler(with_mean=False))
regression_step = ('regressor', SVR(kernel='rbf', C=1.0, epsilon=0.2))

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    scaling_step,
    regression_step,
])

# Separate predictors from the target
X, y = data[features], data[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the whole pipeline (encoding, scaling, SVR) on the training portion
model.fit(X_train, y_train)

# Predict ratings for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true held-out ratings
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    