In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Load the dataset.
# The path used to be hard-coded at the filesystem root only, which raises
# FileNotFoundError unless the CSV sits exactly there. The root location is
# still tried first (backward compatible); otherwise a file with the same name
# in the current working directory is used. If neither exists, the original
# path is handed to pandas so the error message is unchanged.
from pathlib import Path

_csv_candidates = [
    Path("/dermatology_database_1.csv"),
    Path("dermatology_database_1.csv"),
]
_csv_path = next((p for p in _csv_candidates if p.is_file()), _csv_candidates[0])
data = pd.read_csv(_csv_path)

# Convert 'age' to numeric; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
data['age'] = pd.to_numeric(data['age'], errors='coerce')

# Drop every row that has a missing value in any column (including the NaNs
# produced by the coercion above) and renumber the rows 0..n-1.
data = data.dropna().reset_index(drop=True)

# Split features and target variable: "class" is the label, every other
# column is a feature.
X = data.drop(columns=["class"])
y = data["class"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline: fit both classifiers on the untouched training split and score
# them on the held-out test split.

# Support vector classifier (library defaults, fixed seed)
svm_model = SVC(random_state=42).fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_preds)

# Random forest (library defaults, fixed seed)
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=rf_preds)

# Headline accuracies first, then the per-class precision/recall/F1 tables.
print("SVM Accuracy:", svm_accuracy)
print("Random Forest Accuracy:", rf_accuracy)
print(
    "\nSVM Classification Report:\n",
    classification_report(y_test, svm_preds),
)
print(
    "\nRandom Forest Classification Report:\n",
    classification_report(y_test, rf_preds),
)

# -----------------------------------
# Outlier Detection and Removal
# -----------------------------------
# A row counts as an outlier when at least one of its features lies more than
# 3 standard deviations away from that feature's mean.
z_scores = np.abs((X - X.mean()) / X.std())

# One boolean per row, aligned on X's index. Comparisons against NaN are
# False, so a zero-variance column (z-score NaN) never flags a row.
outlier_mask = (z_scores > 3).any(axis=1)

# Index labels of the flagged rows. Previously this was np.where(...)[0]:
# positional row numbers, repeated once per offending cell, that were then
# passed to the label-based .drop() -- correct only while the index happens
# to be 0..n-1. It now holds each flagged row's label exactly once.
outlier_indices = X.index[outlier_mask].to_numpy()

# Remove outliers: keep the rows that were not flagged. Selecting with the
# mask works for any index, not just a freshly reset one.
X_cleaned = X.loc[~outlier_mask]
y_cleaned = y.loc[~outlier_mask]

# Re-split the outlier-free data with the same proportions and seed that were
# used for the baseline split.
(X_train_clean, X_test_clean, y_train_clean, y_test_clean) = train_test_split(
    X_cleaned,
    y_cleaned,
    test_size=0.2,
    random_state=42,
)

# Support vector classifier, refitted on the cleaned training rows
svm_model_clean = SVC(random_state=42).fit(X_train_clean, y_train_clean)
svm_preds_clean = svm_model_clean.predict(X_test_clean)
svm_accuracy_clean = accuracy_score(y_true=y_test_clean, y_pred=svm_preds_clean)

# Random forest, refitted on the cleaned training rows
rf_model_clean = RandomForestClassifier(random_state=42).fit(
    X_train_clean, y_train_clean
)
rf_preds_clean = rf_model_clean.predict(X_test_clean)
rf_accuracy_clean = accuracy_score(y_true=y_test_clean, y_pred=rf_preds_clean)

# Report the accuracies measured on the cleaned test split.
print("\nAfter Removing Outliers:")
print("SVM Accuracy (Cleaned):", svm_accuracy_clean)
print("Random Forest Accuracy (Cleaned):", rf_accuracy_clean)

# -----------------------------------
# Plotting Histograms
# -----------------------------------
# Draw a 20-bin histogram for the columns of `data` on one large figure.
# Note that `data` is the frame after the missing-value rows were dropped,
# not the outlier-filtered X_cleaned.
data.hist(figsize=(20, 15), bins=20)

# Tidy the subplot spacing so titles and tick labels do not overlap, then
# display the figure.
plt.tight_layout()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '/dermatology_database_1.csv'