In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
file_path = r"C:\Users\abeer\Downloads\New folder\imdb-movies-dataset.csv"
df = pd.read_csv(file_path)

# List of columns to drop
columns_to_drop = ['Poster_Link', 'Series_Title', 'Overview', 'Director', 'Star1', 'Star2', 'Star3', 'Star4']

# Check which columns are present in the DataFrame
columns_to_drop_existing = [column for column in columns_to_drop if column in df.columns]

# Drop the existing columns
df = df.drop(columns=columns_to_drop_existing)

# Handle missing values and column transformations if they exist
if 'Gross' in df.columns:
    df['Gross'] = df['Gross'].str.replace(',', '').astype(float)
    df['Gross'].fillna(df['Gross'].mean(), inplace=True)

if 'Meta_score' in df.columns:
    df['Meta_score'].fillna(df['Meta_score'].mean(), inplace=True)

if 'Certificate' in df.columns:
    df['Certificate'].fillna(df['Certificate'].mode()[0], inplace=True)

if 'Released_Year' in df.columns:
    df['Released_Year'].fillna(df['Released_Year'].mode()[0], inplace=True)
    df = df[df['Released_Year'] != 'PG']
    df['Released_Year'] = df['Released_Year'].astype(int)

if 'Runtime' in df.columns:
    df['Runtime'] = df['Runtime'].str.replace(' min', '').astype(int)

# Encode categorical variables
if 'Genre' in df.columns:
    le_genre = LabelEncoder()
    df['Genre'] = le_genre.fit_transform(df['Genre'])

if 'Certificate' in df.columns:
    le_certificate = LabelEncoder()
    df['Certificate'] = le_certificate.fit_transform(df['Certificate'])

# Ensure all columns are numeric before scaling
for column in df.columns:
    if df[column].dtype == 'object':
        try:
            df[column] = df[column].astype(float)
        except ValueError:
            print(f"Column {column} cannot be converted to float and will be dropped.")
            df = df.drop(columns=[column])

# Define independent variables (features) and dependent variable (target)
if 'Certificate' in df.columns:
    X = df.drop(columns=['Certificate'])
    y = df['Certificate']
else:
    X = df.copy()
    y = pd.Series([])

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
if not y.empty:
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Display the first few rows of the processed dataset
print(df.head())


Column Poster cannot be converted to float and will be dropped.
Column Title cannot be converted to float and will be dropped.
Column Cast cannot be converted to float and will be dropped.
Column Votes cannot be converted to float and will be dropped.
Column Description cannot be converted to float and will be dropped.
Column Review Count cannot be converted to float and will be dropped.
Column Review Title cannot be converted to float and will be dropped.
Column Review cannot be converted to float and will be dropped.
     Year  Certificate  Duration (min)  Genre  Rating  Metascore
0  2023.0           20           115.0    228     6.4       67.0
1  2023.0           19           145.0     12     7.3       66.0
2  2023.0           19            97.0    188     5.5       42.0
3  2023.0           19           126.0     22     7.3       73.0
4  2023.0           20           131.0    400     7.7       82.0


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Dictionary to store the evaluation metrics for each model
results = {}

# Function to evaluate a model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'f1_score': f1_score(y_test, y_pred, average='weighted', zero_division=0)
    }

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
results['Logistic Regression'] = evaluate_model(log_reg, X_train, X_test, y_train, y_test)

# Decision Tree Classifier
decision_tree = DecisionTreeClassifier()
results['Decision Tree'] = evaluate_model(decision_tree, X_train, X_test, y_train, y_test)

# Random Forest Classifier
random_forest = RandomForestClassifier()
results['Random Forest'] = evaluate_model(random_forest, X_train, X_test, y_train, y_test)

# Support Vector Machine (SVM)
svm = SVC()
results['SVM'] = evaluate_model(svm, X_train, X_test, y_train, y_test)

# k-Nearest Neighbors (k-NN)
knn = KNeighborsClassifier()
results['k-NN'] = evaluate_model(knn, X_train, X_test, y_train, y_test)


print(results)


{'Logistic Regression': {'accuracy': 0.4185, 'precision': 0.25385286359013476, 'recall': 0.4185, 'f1_score': 0.27842423333403976}, 'Decision Tree': {'accuracy': 0.349, 'precision': 0.35883740489126603, 'recall': 0.349, 'f1_score': 0.3534543521881143}, 'Random Forest': {'accuracy': 0.4715, 'precision': 0.40703953897192147, 'recall': 0.4715, 'f1_score': 0.41702815671181315}, 'SVM': {'accuracy': 0.433, 'precision': 0.37793348348410744, 'recall': 0.433, 'f1_score': 0.297340528107486}, 'k-NN': {'accuracy': 0.3985, 'precision': 0.35580499151417294, 'recall': 0.3985, 'f1_score': 0.3648125310252858}}


In [12]:
# Print the results in a formatted manner
for model, metrics in results.items():
    print(f"Model: {model}")
    for metric, score in metrics.items():
        print(f"{metric}: {score:.4f}")
    print("\n")

# Determine the best model based on a chosen metric (e.g., F1-score)
best_model = max(results, key=lambda x: results[x]['f1_score'])
print(f"The best model is: {best_model}")


Model: Logistic Regression
accuracy: 0.4185
precision: 0.2539
recall: 0.4185
f1_score: 0.2784


Model: Decision Tree
accuracy: 0.3490
precision: 0.3588
recall: 0.3490
f1_score: 0.3535


Model: Random Forest
accuracy: 0.4715
precision: 0.4070
recall: 0.4715
f1_score: 0.4170


Model: SVM
accuracy: 0.4330
precision: 0.3779
recall: 0.4330
f1_score: 0.2973


Model: k-NN
accuracy: 0.3985
precision: 0.3558
recall: 0.3985
f1_score: 0.3648


The best model is: Random Forest
