In [5]:
# Classification Project: Decision Trees, Naive Bayes, and SVM
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

Matplotlib is building the font cache; this may take a moment.


# Classification Analysis

This notebook covers classification using Decision Trees (ID3, CART), Naive Bayes, and Support Vector Machine (SVM). Model evaluation and comparison are included for each method.

## 1. Data Loading, Preprocessing, and Exploration

We will load the dataset, preprocess it to optimize classification performance, and visualize key aspects. Preprocessing includes handling missing values, encoding categorical variables, and scaling features. These steps are chosen to ensure models receive clean, numerical, and standardized data, which improves accuracy and comparability.

In [None]:
# Load, preprocess, and explore the dataset
#
# Pipeline: read CSV -> drop unusable rows/columns -> impute -> label-encode
# -> train/test split -> scale (fitted on the training rows only) -> plots.

# Tunable settings for this cell. The path is relative and uses forward
# slashes, so the string literal contains no backslash escapes.
DATA_PATH = './data/movies_dataset.csv'
TARGET_COL = 'target'  # assumed name of the column to predict -- TODO confirm against the CSV
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Read the data
df = pd.read_csv(DATA_PATH)

# Drop rows with a missing target; a row without a label cannot be used.
df = df.dropna(subset=[TARGET_COL])
if df.empty:
    raise ValueError(f"No rows with a non-missing '{TARGET_COL}' value in {DATA_PATH}")

# Drop columns that contain no values at all: they have no median/mode to
# impute from (mode() would be empty) and carry no information.
df = df.dropna(axis=1, how='all')

# Columns that are not numeric are treated as categorical. Using
# is_numeric_dtype (instead of comparing against 'object') also covers
# dedicated string dtypes.
categorical_cols = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]

# Fill missing values in features with median (numerical) or mode (categorical).
# NOTE(review): these statistics are still computed over all rows, i.e. before
# the train/test split; move imputation after the split if strict isolation of
# the test set is required.
for col in df.columns:
    if col == TARGET_COL:
        continue  # target has no missing values after the dropna above
    if col in categorical_cols:
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna(df[col].median())

# Encode categorical variables (including the target when it is non-numeric).
# NOTE(review): LabelEncoder maps categories to integers in sorted order, which
# imposes an arbitrary ordering on nominal features.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL]
assert not X.isna().any().any(), 'features still contain missing values after imputation'

# Train-test split FIRST, so that the scaler below never sees the test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
assert len(X_train_raw) + len(X_test_raw) == len(df)

# Feature scaling (StandardScaler): fit on the training rows only, then apply
# the same transformation to the test rows. Fitting on the full dataset would
# leak test-set mean/variance into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole dataset on the training scale; used only for the exploration plot below.
X_scaled = scaler.transform(X)

# Visualize target distribution and feature correlations
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, ax=ax)
ax.set_title('Target Variable Distribution')
plt.show()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    pd.DataFrame(X_scaled, columns=X.columns).corr(),
    annot=True, fmt='.2f', cmap='coolwarm', ax=ax,
)
ax.set_title('Feature Correlation Matrix')
plt.show()

# Show basic info
df.info()
df.describe(include='all')

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 3-4: truncated \UXXXXXXXX escape (4174193904.py, line 1)

## 3. Decision Trees (ID3 & CART)

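We will train and evaluate two Decision Tree classifiers that differ only in their split criterion: entropy (information gain, the criterion used by ID3) and Gini impurity (the CART default). Note that scikit-learn's `DecisionTreeClassifier` always builds binary trees with an optimized version of CART, so the "ID3" model here is a CART tree using the entropy criterion rather than a true ID3 implementation.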

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def _fit_and_report_tree(header, criterion):
    """Fit a decision tree with the given split criterion and print its test metrics.

    Trains on X_train/y_train, predicts on X_test, then prints `header`
    followed by accuracy, the classification report, and the confusion matrix.
    Returns the fitted classifier and its test-set predictions.
    """
    tree = DecisionTreeClassifier(criterion=criterion, random_state=42)
    tree.fit(X_train, y_train)
    predictions = tree.predict(X_test)

    print(header)
    print('Accuracy:', accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    print('Confusion Matrix:\n', confusion_matrix(y_test, predictions))
    return tree, predictions


# Entropy-based splits (labelled ID3 in this notebook)
dt_id3, y_pred_id3 = _fit_and_report_tree('ID3 (Entropy) Decision Tree Results:', 'entropy')

# Gini-based splits (CART)
dt_cart, y_pred_cart = _fit_and_report_tree('CART (Gini) Decision Tree Results:', 'gini')

## 4. Naive Bayes

Train and evaluate a Gaussian Naive Bayes classifier (`GaussianNB`), which models each feature as normally distributed within each class.

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training split (fit() returns the estimator itself)
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Compute the test-set metrics first, then report them together
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)
nb_confusion = confusion_matrix(y_test, y_pred_nb)

print('Naive Bayes Results:')
print('Accuracy:', nb_accuracy)
print(nb_report)
print('Confusion Matrix:\n', nb_confusion)

## 5. Support Vector Machine (SVM)

Train and evaluate a Support Vector Machine classifier with an RBF kernel.

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier trained on the training split
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train, y_train)

# Predict the held-out split and report accuracy, per-class metrics, and confusions
y_pred_svm = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)
svm_confusion = confusion_matrix(y_test, y_pred_svm)

print('Support Vector Machine Results:')
print('Accuracy:', svm_accuracy)
print(svm_report)
print('Confusion Matrix:\n', svm_confusion)

## 6. Model Comparison

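Compare the test-set accuracy of all classifiers in a single table and bar chart. Per-class precision, recall, and F1 are given in each model's classification report in the sections above.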

In [None]:
# Test-set accuracy of every model, keyed by display name
results = {}
results['ID3 Decision Tree'] = accuracy_score(y_test, y_pred_id3)
results['CART Decision Tree'] = accuracy_score(y_test, y_pred_cart)
results['Naive Bayes'] = accuracy_score(y_test, y_pred_nb)
results['SVM'] = accuracy_score(y_test, y_pred_svm)

# One row per model, in insertion order
results_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': list(results.values()),
})
print(results_df)

# Bar chart of the accuracies; the y-axis is fixed to the full [0, 1] range
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x='Model', y='Accuracy', data=results_df, ax=ax)
ax.set_title('Classifier Accuracy Comparison')
ax.set_ylim(0, 1)
plt.show()