# Data Mining Project

This notebook is designed to work in Google Colab for data mining tasks.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FarnoodTavasoli/datamining_project/blob/main/data_mining_project.ipynb)

## Setup for Google Colab

This section sets up the environment when running on Google Colab.

In [None]:
# Detect whether the notebook is running on Google Colab.
# Only a failed import means "not on Colab"; any other exception is a real
# problem and is allowed to surface instead of being silently swallowed.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # Mount Google Drive so files stored there can be read from /content/drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully!")
else:
    print("Running locally")

In [None]:
# Import required libraries
import warnings
from math import pi

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Install an "ignore" filter for all warnings raised from here on
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator for reproducibility
np.random.seed(42)

# Plot appearance: seaborn "whitegrid" style and a wide default figure size
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## Data Loading and Exploration

Loading the Ionosphere dataset and performing initial exploration.

In [None]:
# Load the Ionosphere dataset.
# On Colab, update the Drive path to point to your uploaded files folder;
# otherwise the local relative path is used.
data_path = (
    '/content/drive/MyDrive/datamining_project/ionosphere.data'
    if IN_COLAB
    else 'files/ionosphere_5/ionosphere.data'
)

# 34 feature columns followed by the target column
column_names = [f'feature_{i}' for i in range(1, 35)]
column_names.append('class')

# The file is read without a header row; the names above label the columns
df = pd.read_csv(data_path, names=column_names, header=None)

print(f"Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"\nFirst few rows:")
df.head()

In [None]:
# Basic dataset information: size, dtypes, missing values and class balance
banner = "="*60
print(banner, "DATASET INFORMATION", banner, sep="\n")

print(f"\nNumber of instances: {df.shape[0]}")
# Every column except the target counts as a feature
print(f"Number of features: {df.shape[1] - 1}")

print(f"\nData types:", df.dtypes.value_counts(), sep="\n")

# Total number of missing cells across the whole frame
print(f"\nMissing values:", df.isnull().sum().sum(), sep="\n")

class_column = df['class']
print(f"\nClass distribution:", class_column.value_counts(), sep="\n")
print(f"\nClass proportions:", class_column.value_counts(normalize=True), sep="\n")

In [None]:
# Descriptive statistics, transposed so that each feature is one row
print("Statistical Summary of Features:")
df.describe().transpose()

In [None]:
# Visualize class distribution as a bar chart and a pie chart, side by side
class_counts = df['class'].value_counts()
class_palette = ['#2ecc71', '#e74c3c']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Left panel: absolute counts per class
class_counts.plot(kind='bar', ax=bar_ax, color=class_palette)
bar_ax.set_title('Class Distribution', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Class', fontsize=12)
bar_ax.set_ylabel('Count', fontsize=12)
# Keep the class labels horizontal
bar_ax.set_xticklabels(bar_ax.get_xticklabels(), rotation=0)

# Right panel: share of each class
class_counts.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%',
                  colors=class_palette, startangle=90)
pie_ax.set_title('Class Proportion', fontsize=14, fontweight='bold')
pie_ax.set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Visualize feature distributions (all 34 features) on a grid of histograms
num_features = 34
n_cols = 6
# Ceiling division: enough rows to hold every feature
n_rows = int(np.ceil(num_features / n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 3))
axes = axes.ravel()

feature_names = [f'feature_{i+1}' for i in range(num_features)]
for ax, feature_name in zip(axes, feature_names):
    ax.hist(df[feature_name], bins=30, alpha=0.7, color='steelblue', edgecolor='black')
    ax.set_title(f'{feature_name}', fontsize=9)
    ax.set_xlabel('Value', fontsize=8)
    ax.set_ylabel('Frequency', fontsize=8)

# The grid has more panels than features; blank out the leftovers
for spare_ax in axes[num_features:]:
    spare_ax.axis('off')

plt.suptitle('Distribution of All 34 Features', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap (all 34 features)
feature_columns = df.iloc[:, :34]  # first 34 columns, i.e. everything before 'class'
correlation_matrix = feature_columns.corr()

heatmap_options = dict(annot=False, cmap='coolwarm', center=0,
                       linewidths=0.3, cbar_kws={'label': 'Correlation'})

plt.figure(figsize=(18, 14))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap (All 34 Features)', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

## Data Preprocessing

Preparing the data for machine learning models.

In [None]:
# Split the frame into the feature matrix and the target column
y = df['class']
X = df.drop('class', axis=1)

# Encode the string class labels as integers
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y_encoded.shape}")
print(f"\nClass encoding:")
# classes_ is ordered by encoded value, so the position is the integer code
for i, class_name in enumerate(label_encoder.classes_):
    print(f"  {class_name} -> {i}")

In [None]:
# Split data into train and test sets (80-20 split).
# Stratifying on the encoded target keeps the class ratio similar in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print(f"\nTraining set class distribution:", pd.Series(y_train).value_counts(), sep="\n")
print(f"\nTest set class distribution:", pd.Series(y_test).value_counts(), sep="\n")

In [None]:
# Feature scaling (standardization).
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")
print(f"\nScaled training data shape: {X_train_scaled.shape}")
print(f"Scaled test data shape: {X_test_scaled.shape}")
# Show the top-left 5x5 corner of the scaled training matrix
print(f"\nSample scaled features (first 5):")
print(X_train_scaled[:5, :5])

## Model Training and Evaluation

### Customization Parameter: d = 2

We'll train multiple classifiers with customization parameter d=2 where applicable:
- **K-Nearest Neighbors (KNN)**: Using k=2 neighbors
- **Decision Tree**: With max_depth=2
- **Support Vector Machine (SVM)**: With degree=2 polynomial kernel
- **Neural Network (MLP)**: With 2 hidden layers

In [None]:
# Customization parameter read by the model cells that follow
# (KNN neighbours, tree depth, SVM polynomial degree)
d: int = 2
print(f"Customization parameter d = {d}")

### 1. K-Nearest Neighbors (KNN) with k=2

In [None]:
# Fit a k-nearest-neighbours classifier with k=d on the scaled training data
knn_model = KNeighborsClassifier(n_neighbors=d).fit(X_train_scaled, y_train)

# Predict labels for the scaled test set
y_pred_knn = knn_model.predict(X_test_scaled)

# Score the predictions with the four reporting metrics, in a fixed order
knn_accuracy, knn_precision, knn_recall, knn_f1 = (
    scorer(y_test, y_pred_knn)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n".join([
    f"KNN (k={d}) Performance:",
    f"  Accuracy:  {knn_accuracy:.4f}",
    f"  Precision: {knn_precision:.4f}",
    f"  Recall:    {knn_recall:.4f}",
    f"  F1-Score:  {knn_f1:.4f}",
    f"\nClassification Report:",
]))
print(classification_report(y_test, y_pred_knn, target_names=label_encoder.classes_))

### 2. Decision Tree with max_depth=2

In [None]:
# Fit a decision tree limited to depth d on the scaled training data
dt_model = DecisionTreeClassifier(max_depth=d, random_state=42).fit(X_train_scaled, y_train)

# Predict labels for the scaled test set
y_pred_dt = dt_model.predict(X_test_scaled)

# Score the predictions with the four reporting metrics, in a fixed order
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    scorer(y_test, y_pred_dt)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n".join([
    f"Decision Tree (max_depth={d}) Performance:",
    f"  Accuracy:  {dt_accuracy:.4f}",
    f"  Precision: {dt_precision:.4f}",
    f"  Recall:    {dt_recall:.4f}",
    f"  F1-Score:  {dt_f1:.4f}",
    f"\nClassification Report:",
]))
print(classification_report(y_test, y_pred_dt, target_names=label_encoder.classes_))

### 3. Support Vector Machine (SVM) with polynomial degree=2

In [None]:
# Fit an SVM with a polynomial kernel of degree d on the scaled training data
svm_model = SVC(kernel='poly', degree=d, random_state=42).fit(X_train_scaled, y_train)

# Predict labels for the scaled test set
y_pred_svm = svm_model.predict(X_test_scaled)

# Score the predictions with the four reporting metrics, in a fixed order
svm_accuracy, svm_precision, svm_recall, svm_f1 = (
    scorer(y_test, y_pred_svm)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n".join([
    f"SVM (polynomial degree={d}) Performance:",
    f"  Accuracy:  {svm_accuracy:.4f}",
    f"  Precision: {svm_precision:.4f}",
    f"  Recall:    {svm_recall:.4f}",
    f"  F1-Score:  {svm_f1:.4f}",
    f"\nClassification Report:",
]))
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

### 4. Multi-Layer Perceptron (MLP) with 2 hidden layers

In [None]:
# Train MLP with d hidden layers.
# Layer widths start at 100 neurons and halve with each additional layer
# (never dropping below 1), so d=2 gives (100, 50). Deriving the tuple from d
# keeps the network in step with the "{d} hidden layers" label printed below.
hidden_layer_sizes = tuple(max(100 // (2 ** layer), 1) for layer in range(d))
mlp_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, max_iter=1000, random_state=42)
mlp_model.fit(X_train_scaled, y_train)

# Predictions on the scaled test set
y_pred_mlp = mlp_model.predict(X_test_scaled)

# Evaluate
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
mlp_precision = precision_score(y_test, y_pred_mlp)
mlp_recall = recall_score(y_test, y_pred_mlp)
mlp_f1 = f1_score(y_test, y_pred_mlp)

print(f"MLP ({d} hidden layers) Performance:")
print(f"  Accuracy:  {mlp_accuracy:.4f}")
print(f"  Precision: {mlp_precision:.4f}")
print(f"  Recall:    {mlp_recall:.4f}")
print(f"  F1-Score:  {mlp_f1:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred_mlp, target_names=label_encoder.classes_))

## Model Comparison and Visualization

In [None]:
# Compile the test-set metrics of all four models into one table.
# Model labels are built from d so they stay correct if the customization
# parameter is changed (for d=2 they read exactly as before).
results = {
    'Model': [
        f'KNN (k={d})',
        f'Decision Tree (depth={d})',
        f'SVM (poly deg={d})',
        f'MLP ({d} layers)',
    ],
    'Accuracy': [knn_accuracy, dt_accuracy, svm_accuracy, mlp_accuracy],
    'Precision': [knn_precision, dt_precision, svm_precision, mlp_precision],
    'Recall': [knn_recall, dt_recall, svm_recall, mlp_recall],
    'F1-Score': [knn_f1, dt_f1, svm_f1, mlp_f1]
}

results_df = pd.DataFrame(results)
print("Model Performance Comparison:")
print("="*80)
print(results_df.to_string(index=False))
print("="*80)

In [None]:
# Visualize model comparison: one bar chart per metric in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']

# Numeric bar positions; tick locations are fixed explicitly before labelling,
# because set_xticklabels on its own does not pin labels to positions
x_pos = np.arange(len(results_df))

for idx, (ax, metric) in enumerate(zip(axes.ravel(), metrics)):
    bars = ax.bar(x_pos, results_df[metric], color=colors[idx], alpha=0.8, edgecolor='black')
    ax.set_title(f'{metric} Comparison', fontsize=14, fontweight='bold')
    ax.set_ylabel(metric, fontsize=12)
    # Head-room above 1.0 leaves space for the value labels
    ax.set_ylim(0, 1.1)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(results_df['Model'], rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)
    
    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.3f}',
                ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.suptitle(f'Model Performance Comparison (d={d})', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

In [None]:
# Radar chart for comprehensive comparison
# (`pi` comes from the import cell at the top of the notebook)
categories = metrics
N = len(categories)

# One evenly spaced angle per metric; the first angle is repeated at the end
# so each plotted polygon closes on itself
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))

# Plot each model
model_colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
for i, model in enumerate(results_df['Model']):
    # Select the metric columns by name so the values always line up with the
    # axis labels, regardless of the column order of results_df
    values = results_df[categories].iloc[i].tolist()
    values += values[:1]
    ax.plot(angles, values, 'o-', linewidth=2, label=model, color=model_colors[i])
    ax.fill(angles, values, alpha=0.15, color=model_colors[i])

# One labelled spoke per metric (the duplicated closing angle is dropped)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, size=12)
ax.set_ylim(0, 1)
ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'], size=10)
ax.grid(True)

plt.title(f'Model Performance Radar Chart (d={d})', size=16, fontweight='bold', pad=20)
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.tight_layout()
plt.show()

In [None]:
# Confusion matrices for all models, one heatmap per model in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Labels are built from d so they stay correct if the parameter is changed
models_predictions = [
    (f'KNN (k={d})', y_pred_knn),
    (f'Decision Tree (depth={d})', y_pred_dt),
    (f'SVM (poly deg={d})', y_pred_svm),
    (f'MLP ({d} layers)', y_pred_mlp)
]

for ax, (model_name, y_pred) in zip(axes.ravel(), models_predictions):
    cm = confusion_matrix(y_test, y_pred)
    # Tick labels use the original class names rather than the integer codes
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax, 
                xticklabels=label_encoder.classes_, 
                yticklabels=label_encoder.classes_,
                cbar_kws={'label': 'Count'})
    ax.set_title(f'Confusion Matrix - {model_name}', fontsize=12, fontweight='bold')
    ax.set_ylabel('True Label', fontsize=11)
    ax.set_xlabel('Predicted Label', fontsize=11)

plt.suptitle(f'Confusion Matrices for All Models (d={d})', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

In [None]:
# Identify the model with the highest test accuracy.
# idxmax() returns an index *label*, so the row is fetched with label-based
# .loc rather than positional .iloc.
best_model_idx = results_df['Accuracy'].idxmax()
best_model = results_df.loc[best_model_idx]

print("="*80)
print("BEST PERFORMING MODEL")
print("="*80)
print(f"Model: {best_model['Model']}")
print(f"Accuracy:  {best_model['Accuracy']:.4f}")
print(f"Precision: {best_model['Precision']:.4f}")
print(f"Recall:    {best_model['Recall']:.4f}")
print(f"F1-Score:  {best_model['F1-Score']:.4f}")
print("="*80)

## Cross-Validation Analysis

Perform k-fold cross-validation to assess model stability and generalization.

In [None]:
# Perform 5-fold cross-validation for all models on the training split.
# Labels are built from d so they stay correct if the parameter is changed.
models = [
    (f'KNN (k={d})', knn_model),
    (f'Decision Tree (depth={d})', dt_model),
    (f'SVM (poly deg={d})', svm_model),
    (f'MLP ({d} layers)', mlp_model)
]

cv_results = []

print("Cross-Validation Results (5-fold):")
print("="*80)

for model_name, model in models:
    # Scaling is part of the pipeline and the unscaled X_train is passed in,
    # so every fold fits its scaler on that fold's training part only.
    # Scaling the whole training set up front would leak validation-fold
    # statistics into training.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    cv_results.append({
        'Model': model_name,
        'Mean CV Score': scores.mean(),
        'Std CV Score': scores.std(),
        'Min Score': scores.min(),
        'Max Score': scores.max()
    })
    print(f"\n{model_name}:")
    # The +/- figure is two standard deviations of the fold scores
    print(f"  Mean Accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
    print(f"  Individual Fold Scores: {[f'{s:.4f}' for s in scores]}")

cv_results_df = pd.DataFrame(cv_results)
print("\n" + "="*80)
print("\nSummary:")
print(cv_results_df.to_string(index=False))
print("="*80)

In [None]:
# Visualize cross-validation results: mean accuracy per model with
# one-standard-deviation error bars
fig, ax = plt.subplots(figsize=(12, 6))

x_pos = np.arange(len(cv_results_df))
means = cv_results_df['Mean CV Score']
stds = cv_results_df['Std CV Score']

bars = ax.bar(x_pos, means, yerr=stds, capsize=5, alpha=0.8, 
              color=['#3498db', '#e74c3c', '#2ecc71', '#f39c12'], 
              edgecolor='black', linewidth=1.5)

ax.set_xlabel('Model', fontsize=12, fontweight='bold')
ax.set_ylabel('Cross-Validation Accuracy', fontsize=12, fontweight='bold')
ax.set_title(f'5-Fold Cross-Validation Results (d={d})', fontsize=14, fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels(cv_results_df['Model'], rotation=45, ha='right')
# Head-room above 1.0 leaves space for the value labels
ax.set_ylim(0, 1.1)
ax.grid(axis='y', alpha=0.3)

# Add "mean ± std" labels above the bars
for bar, mean, std in zip(bars, means, stds):
    ax.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
            f'{mean:.3f}\n±{std:.3f}',
            ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

## Conclusion

This data mining project analyzed the **Ionosphere dataset** using four different classification algorithms with customization parameter **d=2**:

1. **K-Nearest Neighbors (KNN)** with k=2 neighbors
2. **Decision Tree** with max_depth=2
3. **Support Vector Machine (SVM)** with polynomial kernel degree=2
4. **Multi-Layer Perceptron (MLP)** with 2 hidden layers

### Key Findings:
- The dataset contains 351 instances with 34 continuous features
- Binary classification: "good" vs "bad" radar returns
- All models were evaluated using accuracy, precision, recall, and F1-score
- 5-fold cross-validation on the training set was used to assess model stability and generalization capability

### Recommendations:
- The best-performing model (based on test accuracy) can be used for production
- Consider experimenting with different values of d to optimize performance
- Feature engineering and selection could further improve results
- Ensemble methods could be explored for better performance