# Importing Necessary Libraries

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score

# Loading Data

In [34]:
try:
    df = pd.read_csv('IMDb Movies India.csv', encoding='ISO-8859-1')
except UnicodeDecodeError:
    df = pd.read_csv('IMDb Movies India.csv', encoding='latin1')
print(df)

                                     Name    Year Duration            Genre  \
0                                             NaN      NaN            Drama   
1      #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                             #Homecoming  (2021)   90 min   Drama, Musical   
3                                 #Yaaram  (2019)  110 min  Comedy, Romance   
4                       ...And Once Again  (2010)  105 min            Drama   
...                                   ...     ...      ...              ...   
15504                 Zulm Ko Jala Doonga  (1988)      NaN           Action   
15505                               Zulmi  (1999)  129 min    Action, Drama   
15506                           Zulmi Raj  (2005)      NaN           Action   
15507                       Zulmi Shikari  (1988)      NaN           Action   
15508                        Zulm-O-Sitam  (1998)  130 min    Action, Drama   

       Rating Votes            Director           A

In [35]:
target_column = 'Rating'  # Replace with the actual name if different

if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in the dataframe.")

# Drop any columns not useful for prediction
columns_to_drop = ['Name', target_column]
columns_to_drop = [col for col in columns_to_drop if col in df.columns]



In [36]:
# Define features and target variable
X = df.drop(columns=columns_to_drop)
y = df[target_column]

y = y.fillna(y.mean())



# Linear Regression

In [37]:
# Identify categorical features
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['number']).columns.tolist()
# Define preprocessing steps for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),('num', numerical_transformer, numerical_features)
    ] 
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])




In [38]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Train the model

In [39]:

model.fit(X_train, y_train)

# Prediction

In [40]:
X_subset = X_test.head()

predictions = model.predict(X_subset)

subset_df = df.iloc[X_subset.index]  

results = pd.DataFrame({'Movie Name': subset_df['Name'], 'Actual Rating': y_test.head(len(X_subset)), 'Predicted Rating': predictions})
print(results)

          Movie Name  Actual Rating  Predicted Rating
11115     Puchki Das       5.841621          2.057283
2921         Chauhar       6.800000          6.652345
3463          Darpan       5.841621          5.785822
2495          Bombay       5.841621          6.822384
15263  Yeh Laal Rang       5.841621          5.794439


In [55]:
y_pred = model.predict(X_test)

In [56]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Calculate R-squared and mean Squared Error

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 2.4691437930397617
R^2 Score: -0.3281070633282588


# Random Forest

In [43]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
df = df.dropna(subset=['Rating'])

In [44]:
# Identify and encode categorical features
categorical_features = df.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_features:
    if col != 'Name':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

In [45]:
# Separate features and target
X = df.drop(columns=['Name', 'Rating'])
y = df['Rating']

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model

In [47]:
rf_model.fit(X_train, y_train)

# predictions

In [48]:
y_pred_rf = rf_model.predict(X_test)

In [49]:
example_indices = X_test.sample(5, random_state=42).index
example_X = X_test.loc[example_indices]
example_y_pred = rf_model.predict(example_X)
example_movies = df.loc[example_indices]
predictions_df = pd.DataFrame({
    'Movie Name': example_movies['Name'],
    'Predicted Rating': example_y_pred
})
print(predictions_df)

                      Movie Name  Predicted Rating
877                   Aloo Chaat             4.892
2301   Bhopal: A Prayer for Rain             6.298
8533                       Mafia             4.903
11021                Prem Kahani             6.415
10471              Painting Life             6.756


In [50]:
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest Mean Squared Error (MSE): {mse_rf}")

r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest R-squared: {r2_rf}")

Random Forest Mean Squared Error (MSE): 1.4440161912878786
Random Forest R-squared: 0.22329023177350515


# Efficiency comparison of both algorithm

In [57]:
from sklearn.metrics import r2_score
linear_regression_efficiency = r2* 100
random_forest_efficiency = r2_rf * 100

# Print efficiencies
print(f"Linear Regression Model Efficiency: {linear_regression_efficiency:.2f}%")
print(f"Random Forest Model Efficiency: {random_forest_efficiency:.2f}%")

Linear Regression Model Efficiency: -32.81%
Random Forest Model Efficiency: 22.33%


In [59]:
mse_lr = mean_squared_error(y_test, y_pred)
r2_lr = r2_score(y_test, y_pred)

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Linear Regression Model - Mean Squared Error: {mse_lr:.2f}, R-squared: {r2_lr:.2f}")
print(f"Random Forest Model - Mean Squared Error: {mse_rf:.2f}, R-squared: {r2_rf:.2f}")

Linear Regression Model - Mean Squared Error: 2.47, R-squared: -0.33
Random Forest Model - Mean Squared Error: 1.44, R-squared: 0.22
