In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [3]:
try:
    df = pd.read_csv('IMDb Movies India.csv', encoding='ISO-8859-1')
except UnicodeDecodeError:
    df = pd.read_csv('IMDb Movies India.csv', encoding='latin1')
print(df)

                                     Name    Year Duration            Genre  \
0                                             NaN      NaN            Drama   
1      #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                             #Homecoming  (2021)   90 min   Drama, Musical   
3                                 #Yaaram  (2019)  110 min  Comedy, Romance   
4                       ...And Once Again  (2010)  105 min            Drama   
...                                   ...     ...      ...              ...   
15504                 Zulm Ko Jala Doonga  (1988)      NaN           Action   
15505                               Zulmi  (1999)  129 min    Action, Drama   
15506                           Zulmi Raj  (2005)      NaN           Action   
15507                       Zulmi Shikari  (1988)      NaN           Action   
15508                        Zulm-O-Sitam  (1998)  130 min    Action, Drama   

       Rating Votes            Director           A

In [4]:
columns_to_encode = ['genre', 'director', 'actors','rating']

# Verify if columns exist in the dataframe
columns_to_encode = [col for col in columns_to_encode if col in df.columns]

# One-hot encode categorical features
df = pd.get_dummies(df, columns=columns_to_encode)
print(df.head())

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

In [15]:
columns_to_drop = ['title', 'Rating']

# Verify columns before dropping
columns_to_drop = [col for col in columns_to_drop if col in df.columns]

# Define features and target variable
X = df.drop(columns=columns_to_drop)
y = df['Rating'] if 'Rating' in df.columns else None

if y is None:
    raise ValueError("Target column 'user_rating' not found in the dataframe.")

# Print the first few rows of X and y to verify
print(X.head())
print(y.head())

                                 Name    Year Duration            Genre Votes  \
0                                         NaN      NaN            Drama   NaN   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama     8   
2                         #Homecoming  (2021)   90 min   Drama, Musical   NaN   
3                             #Yaaram  (2019)  110 min  Comedy, Romance    35   
4                   ...And Once Again  (2010)  105 min            Drama   NaN   

             Director       Actor 1             Actor 2          Actor 3  
0       J.S. Randhawa      Manmauji              Birbal  Rajendra Bhatia  
1       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande    Arvind Jangid  
2  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur       Roy Angana  
3          Ovais Khan       Prateik          Ishita Raj  Siddhant Kapoor  
4        Amol Palekar  Rajat Kapoor  Rituparna Sengupta      Antara Mali  
0    NaN
1    7.0
2    NaN
3    4.4
4    NaN
Name: Rating, dtyp

# Logical Regression

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
target_column = 'Rating'  # Replace with the actual name if different

if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in the dataframe.")

# Drop any columns not useful for prediction
columns_to_drop = ['Name', target_column]
columns_to_drop = [col for col in columns_to_drop if col in df.columns]

# Define features and target variable
X = df.drop(columns=columns_to_drop)
y = df[target_column]

y = y.fillna(y.mean())

# Identify categorical features
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['number']).columns.tolist()
# Define preprocessing steps for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),('num', numerical_transformer, numerical_features)
    ] # Keep other columns as is
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

In [5]:
X = df.drop(columns=['Name', 'Rating'])
y = df['Rating']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
y_pred = model.predict(X_test)

In [7]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Calculate R-squared and mean Squared Error

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

ValueError: Input contains NaN.

The r2 score is more robust and quite often used accuracy matrix.
It measures the average squared difference between the predicted and the actual target values within a dataset.

In [8]:
X_subset = X_test.head()

# Make predictions using the trained model
predictions = model.predict(X_subset)

# Display the predictions alongside the actual values (y_test) for comparison
results = pd.DataFrame({'Actual Rating': y_test.head(len(X_subset)), 'Predicted Rating': predictions})
print(results)

       Actual Rating  Predicted Rating
11115            NaN          2.057283
2921             6.8          6.652345
3463             NaN          5.785822
2495             NaN          6.822384
15263            NaN          5.794439


In [12]:
X_subset = X_test.head()

# Make predictions using the trained model
predictions = model.predict(X_subset)

# Display the predictions alongside the actual values (y_test) for comparison
results = pd.DataFrame({'Actual Rating': y_test.head(len(X_subset)), 'Predicted Rating': predictions})
print(results)

       Actual Rating  Predicted Rating
11115       5.841621          2.057283
2921        6.800000          6.652345
3463        5.841621          5.785822
2495        5.841621          6.822384
15263       5.841621          5.794439


In [13]:
X_subset = X_test.head()

# Make predictions using the trained model
predictions = model.predict(X_subset)

# Get the corresponding movie names and actual ratings from the original dataframe
subset_df = df.iloc[X_subset.index]  # Get subset of original dataframe based on indices of X_subset

# Display the predictions alongside the movie names and actual values for comparison
results = pd.DataFrame({'Movie Name': subset_df['Name'], 'Actual Rating': y_test.head(len(X_subset)), 'Predicted Rating': predictions})
print(results)

          Movie Name  Actual Rating  Predicted Rating
11115     Puchki Das       5.841621          2.057283
2921         Chauhar       6.800000          6.652345
3463          Darpan       5.841621          5.785822
2495          Bombay       5.841621          6.822384
15263  Yeh Laal Rang       5.841621          5.794439


# RandomForest

In [22]:
from sklearn.preprocessing import LabelEncoder
df = df.dropna(subset=['Rating'])

# Identify and encode categorical features
categorical_features = df.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_features:
    if col != 'Name':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

# Separate features and target
X = df.drop(columns=['Name', 'Rating'])
y = df['Rating']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Calculate Mean Squared Error
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest Mean Squared Error (MSE): {mse_rf}")

# Calculate R-squared
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest R-squared: {r2_rf}")


Random Forest Mean Squared Error (MSE): 1.4440161912878786
Random Forest R-squared: 0.22329023177350515


In this case, the Random Forest model has a lower MSE (1.444) compared to the Linear Regression model (1.735), which means it makes smaller average errors in its predictions. Additionally, the Random Forest model has a higher R² score (0.223) compared to the Linear Regression model (0.067), indicating it explains a higher proportion of the variance in the target variable.

In [14]:
example_indices = X_test.sample(5, random_state=42).index
example_X = X_test.loc[example_indices]
example_y_pred = rf_model.predict(example_X)
example_movies = df.loc[example_indices]

for name, pred in zip(example_movies['Name'], example_y_pred):
    print(f"Movie: {name}, Predicted Rating: {pred}")

Movie: Aloo Chaat, Predicted Rating: 4.892
Movie: Bhopal: A Prayer for Rain, Predicted Rating: 6.2979999999999965
Movie: Mafia, Predicted Rating: 4.903000000000001
Movie: Prem Kahani, Predicted Rating: 6.415
Movie: Painting Life, Predicted Rating: 6.755999999999998


In [15]:
example_indices = X_test.sample(5, random_state=42).index
example_X = X_test.loc[example_indices]
example_y_pred = rf_model.predict(example_X)
example_movies = df.loc[example_indices]

In [16]:
predictions_df = pd.DataFrame({
    'Movie Name': example_movies['Name'],
    'Predicted Rating': example_y_pred
})

In [17]:
print(predictions_df)

                      Movie Name  Predicted Rating
877                   Aloo Chaat             4.892
2301   Bhopal: A Prayer for Rain             6.298
8533                       Mafia             4.903
11021                Prem Kahani             6.415
10471              Painting Life             6.756


In [23]:
from sklearn.metrics import r2_score

# Calculate efficiencies
linear_regression_efficiency = r2_score(y_test, y_pred) * 100
random_forest_efficiency = r2_score(y_test, y_pred_rf) * 100

# Print efficiencies
print(f"Linear Regression Model Efficiency: {linear_regression_efficiency:.2f}%")
print(f"Random Forest Model Efficiency: {random_forest_efficiency:.2f}%")

Linear Regression Model Efficiency: 6.70%
Random Forest Model Efficiency: 22.33%


In [26]:
df['Title Length'] = df['Name'].apply(lambda x: len(x))

# Drop less useful features
X = df.drop(columns=['Name', 'Rating'])

In [27]:
from sklearn.preprocessing import StandardScaler

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data again
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)

# Get the best model
best_rf_model = grid_search.best_estimator_
y_pred_rf_best = best_rf_model.predict(X_test)

# Calculate new efficiency
rf_best_efficiency = r2_score(y_test, y_pred_rf_best) * 100
print(f"Best Random Forest Model Efficiency: {rf_best_efficiency:.2f}%")

Best Random Forest Model Efficiency: 23.73%


In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validation for Random Forest
cv_scores = cross_val_score(best_rf_model, X_scaled, y, cv=10, scoring='r2')
print(f"Cross-Validated R^2: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")