In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
file_path = 'imdb_top_1000.csv'
df = pd.read_csv(file_path)

# Drop irrelevant columns
columns_to_drop = ['Poster_Link', 'Series_Title', 'Overview', 'Director', 'Star1', 'Star2', 'Star3', 'Star4']
df = df.drop(columns=columns_to_drop)

# Handle missing values
df['Gross'] = df['Gross'].str.replace(',', '').astype(float)
df['Gross'].fillna(df['Gross'].mean(), inplace=True)
df['Meta_score'].fillna(df['Meta_score'].mean(), inplace=True)
df['Certificate'].fillna(df['Certificate'].mode()[0], inplace=True)
df['Released_Year'].fillna(df['Released_Year'].mode()[0], inplace=True)

# Clean the Released_Year column
df = df[df['Released_Year'] != 'PG']
df['Released_Year'] = df['Released_Year'].astype(int)

# Convert Runtime to numerical values
df['Runtime'] = df['Runtime'].str.replace(' min', '').astype(int)

# Encode categorical variables
le_genre = LabelEncoder()
df['Genre'] = le_genre.fit_transform(df['Genre'])

le_certificate = LabelEncoder()
df['Certificate'] = le_certificate.fit_transform(df['Certificate'])

# Define independent variables (features) and dependent variable (target)
X = df.drop(columns=['Certificate'])
y = df['Certificate']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Display the first few rows of the processed dataset
df.head()


Unnamed: 0,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Meta_score,No_of_Votes,Gross
0,1994,1,142,137,9.3,80.0,2343110,28341469.0
1,1972,1,175,122,9.2,100.0,1620367,134966411.0
2,2008,14,152,22,9.0,84.0,2303232,534858444.0
3,1974,1,202,122,9.0,90.0,1129952,57300000.0
4,1957,12,96,122,9.0,96.0,689845,4360000.0


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Dictionary to store the evaluation metrics for each model
results = {}

# Function to evaluate a model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'f1_score': f1_score(y_test, y_pred, average='weighted', zero_division=0)
    }

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
results['Logistic Regression'] = evaluate_model(log_reg, X_train, X_test, y_train, y_test)

# Decision Tree Classifier
decision_tree = DecisionTreeClassifier()
results['Decision Tree'] = evaluate_model(decision_tree, X_train, X_test, y_train, y_test)

# Random Forest Classifier
random_forest = RandomForestClassifier()
results['Random Forest'] = evaluate_model(random_forest, X_train, X_test, y_train, y_test)

# Support Vector Machine (SVM)
svm = SVC()
results['SVM'] = evaluate_model(svm, X_train, X_test, y_train, y_test)

# k-Nearest Neighbors (k-NN)
knn = KNeighborsClassifier()
results['k-NN'] = evaluate_model(knn, X_train, X_test, y_train, y_test)


print(results)


{'Logistic Regression': {'accuracy': 0.4, 'precision': 0.34099206349206346, 'recall': 0.4, 'f1_score': 0.34495743984445076}, 'Decision Tree': {'accuracy': 0.33, 'precision': 0.31945052209810587, 'recall': 0.33, 'f1_score': 0.3235187681314293}, 'Random Forest': {'accuracy': 0.44, 'precision': 0.3753005890505891, 'recall': 0.44, 'f1_score': 0.3902141743827353}, 'SVM': {'accuracy': 0.46, 'precision': 0.3992695099818512, 'recall': 0.46, 'f1_score': 0.39265850340136055}, 'k-NN': {'accuracy': 0.35, 'precision': 0.33895186335403726, 'recall': 0.35, 'f1_score': 0.337942869784975}}


In [11]:
# Print the results in a formatted manner
for model, metrics in results.items():
    print(f"Model: {model}")
    for metric, score in metrics.items():
        print(f"{metric}: {score:.4f}")
    print("\n")

# Determine the best model based on a chosen metric (e.g., F1-score)
best_model = max(results, key=lambda x: results[x]['f1_score'])
print(f"The best model is: {best_model}")


Model: Logistic Regression
accuracy: 0.4000
precision: 0.3410
recall: 0.4000
f1_score: 0.3450


Model: Decision Tree
accuracy: 0.3300
precision: 0.3195
recall: 0.3300
f1_score: 0.3235


Model: Random Forest
accuracy: 0.4400
precision: 0.3753
recall: 0.4400
f1_score: 0.3902


Model: SVM
accuracy: 0.4600
precision: 0.3993
recall: 0.4600
f1_score: 0.3927


Model: k-NN
accuracy: 0.3500
precision: 0.3390
recall: 0.3500
f1_score: 0.3379


The best model is: SVM
