# Data Mining Project

This notebook is designed to work in Google Colab for data mining tasks.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FarnoodTavasoli/datamining_project/blob/main/data_mining_project.ipynb)

## Setup for Google Colab

This section sets up the environment when running on Google Colab.

In [None]:
# Detect whether the notebook is running inside Google Colab.
# Only a failed import means "not on Colab"; a bare `except:` would also
# swallow unrelated errors (KeyboardInterrupt included) and silently report
# a local run.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # Mount Google Drive so the dataset stored there can be read later;
    # force_remount=True asks Colab to remount even if the path is already mounted.
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted successfully!")
else:
    print("Running locally")

In [None]:
# all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
from sklearn.model_selection import StratifiedKFold

import warnings
# Silence all warnings so cell output stays focused on results.
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random generator for reproducibility.
np.random.seed(seed=42)

# Default look for every figure: seaborn white grid, wide canvas.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## Data Loading and Exploration

Loading the Ionosphere dataset and performing initial exploration.

In [None]:
# Load the Ionosphere dataset.
# On Colab the file is read from the mounted Google Drive; otherwise from the
# project's local files/ directory.
data_path = (
    '/content/drive/MyDrive/datamining_project/ionosphere.data'
    if IN_COLAB
    else 'files/ionosphere_5/ionosphere.data'
)

# The file is read without a header row, so the columns are named here:
# feature_1 .. feature_34 followed by the 'class' label.
column_names = ['feature_%d' % idx for idx in range(1, 35)]
column_names.append('class')

df = pd.read_csv(data_path, names=column_names, header=None)

print("Dataset loaded successfully!")
df.head()

In [None]:
# Basic dataset information: size, missing values and class balance.
banner = "=" * 60
print(banner)
print("DATASET INFORMATION")
print(banner)
# "\n" replaces the invalid escape "\R", which printed a literal backslash
# instead of the blank line used before the other sections.
print(f"\nRows: {df.shape[0]}")
# Every column except the trailing 'class' label is a feature.
print(f"Features: {df.shape[1] - 1}")
print("\nMissing values:")
# Total count of null cells across the whole frame.
print(df.isnull().sum().sum())
print("\nClass distribution:")
print(df['class'].value_counts())

In [None]:
# Class balance shown two ways: absolute counts (bar) and shares (pie).
class_counts = df['class'].value_counts()
palette = ['#2ecc71', '#e74c3c']

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Left panel: number of samples per class.
class_counts.plot(kind='bar', ax=bar_ax, color=palette)
bar_ax.set_title('Class Distribution', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Class', fontsize=12)
bar_ax.set_ylabel('Count', fontsize=12)
# Keep the class tick labels horizontal.
bar_ax.set_xticklabels(bar_ax.get_xticklabels(), rotation=0)

# Right panel: share of each class, annotated with percentages.
class_counts.plot(
    kind='pie',
    ax=pie_ax,
    autopct='%1.1f%%',
    colors=palette,
    startangle=90,
)
pie_ax.set_title('Class Proportion', fontsize=14, fontweight='bold')
pie_ax.set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Histogram grid: one panel per raw feature column (feature_1 .. feature_34).
num_features = 34
n_cols = 6
# Ceiling division: enough rows to fit every feature.
n_rows = (num_features + n_cols - 1) // n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 3))
axes = axes.ravel()

for idx, ax in enumerate(axes):
    if idx < num_features:
        feature_name = f'feature_{idx + 1}'
        ax.hist(df[feature_name], bins=30, alpha=0.7, color='steelblue', edgecolor='black')
        ax.set_title(feature_name, fontsize=9)
        ax.set_xlabel('Value', fontsize=8)
        ax.set_ylabel('Frequency', fontsize=8)
    else:
        # Grid cells beyond the last feature stay blank.
        ax.axis('off')

plt.suptitle('Distribution of Features', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of pairwise correlations between the feature columns
# (the 'class' label is excluded).
correlation_matrix = df.drop(columns=['class']).corr()

heatmap_options = dict(
    annot=False,
    cmap='coolwarm',
    center=0,  # center the colormap on zero correlation
    linewidths=0.3,
    cbar_kws={'label': 'Correlation'},
)

plt.figure(figsize=(18, 14))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

## Data Preprocessing

Preparing the data for machine learning models.

In [None]:
# Split the frame into the label column and the feature matrix.
y = df['class']
X = df.drop(columns=['class'])

# Turn the string labels (g=good, b=bad) into integer codes.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

print(f"Features shape: {X.shape}")
print("\nClass encoding:")
# The position of a label in classes_ is the integer code assigned to it.
for code, label in enumerate(label_encoder.classes_):
    print(f"  {label} -> {code}")

In [None]:
# Drop features that carry no information because every non-null value in the
# column is the same.
constant_features = [name for name in X.columns if X[name].nunique() == 1]

if not constant_features:
    print("No constant features found. All features have variety.")
else:
    print(f"Found {len(constant_features)} constant feature(s) with no variety:")
    for name in constant_features:
        # Show the single value the column holds.
        print(f"  - {name}: {X[name].unique()[0]}")

    X = X.drop(columns=constant_features)
    print("\nConstant features dropped!")
    print(f"Features shape after removing constant features: {X.shape}")

print(f"\nRemaining features shape: {X.shape}")


In [None]:
# Hold out 20% of the samples as a test set. The split is stratified on the
# encoded labels and uses a fixed seed so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

print(f"Training set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")

# Per-class sample counts in each split.
for split_name, split_labels_encoded in (("Training", y_train), ("Test", y_test)):
    print(f"\n{split_name} set class distribution:")
    print(pd.Series(split_labels_encoded).value_counts())

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")
print(f"\nScaled training data shape: {X_train_scaled.shape}")
print(f"Scaled test data shape: {X_test_scaled.shape}")

## Preprocessing Results Visualization

Visualizing how the data has changed after preprocessing and scaling.

In [None]:
# Compare per-feature statistics before and after standardization (training split).
# Labels are derived from the real column names (feature_7 -> F7). Constant
# columns may have been dropped earlier, so a positional F{i+1} label could
# point at the wrong feature.
feature_labels = [str(col).replace('feature_', 'F') for col in X_train.columns]

comparison_stats = pd.DataFrame({
    'Feature': feature_labels,
    'Original Mean': X_train.mean().values,
    # ddof=0 so both Std columns use the same estimator: pandas defaults to
    # ddof=1 while the NumPy call on the scaled array defaults to ddof=0.
    'Original Std': X_train.std(ddof=0).values,
    'Scaled Mean': X_train_scaled.mean(axis=0),
    'Scaled Std': X_train_scaled.std(axis=0)
})

print("="*80)
print("DATA STATISTICS: ORIGINAL vs SCALED (All Features)")
print("="*80)
print(comparison_stats.round(4).to_string(index=False))
print("\n")

# Aggregate statistics over every cell of the scaled training matrix.
print("="*80)
print("SCALED TRAINING DATA SUMMARY")
print("="*80)
print(f"Overall Mean (should be ~0): {X_train_scaled.mean():.6f}")
print(f"Overall Std Dev (should be ~1): {X_train_scaled.std():.6f}")
print(f"Min value: {X_train_scaled.min():.4f}")
print(f"Max value: {X_train_scaled.max():.4f}")


In [None]:
# One-table recap of the preprocessing stage.
banner = "=" * 80
print(banner)
print("DATA PREPROCESSING SUMMARY")
print(banner)

# (metric, value) pairs in display order; values are pre-formatted strings.
# The class counts refer to the training split.
summary_rows = [
    ('Total Samples', str(len(df))),
    ('Training Samples', str(len(X_train))),
    ('Test Samples', str(len(X_test))),
    ('Number of Features', str(X_train.shape[1])),
    ('Class 0 (Bad)', str((y_train == 0).sum())),
    ('Class 1 (Good)', str((y_train == 1).sum())),
    ('Feature Scaling', "StandardScaler"),
    ('Scaled Data Mean', format(X_train_scaled.mean(), '.6f')),
    ('Scaled Data Std Dev', format(X_train_scaled.std(), '.6f')),
]

summary_data = {
    'Metric': [metric for metric, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))
print("\n")


In [None]:
# Two-panel figure: overall split sizes (left) and per-class counts in each
# split (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
size_ax, class_ax = axes

# --- Left panel: how many samples went to each split ---
split_labels = ['Train Set', 'Test Set']
split_sizes = [len(X_train), len(X_test)]
colors_split = ['#3498db', '#e74c3c']

size_ax.bar(split_labels, split_sizes, color=colors_split, edgecolor='black', linewidth=2)
size_ax.set_title('Train/Test Split Distribution', fontsize=12, fontweight='bold')
size_ax.set_ylabel('Number of Samples', fontsize=11)
# Write each count just above its bar.
for position, count in enumerate(split_sizes):
    size_ax.text(position, count + 5, str(count), ha='center', fontweight='bold', fontsize=11)

# --- Right panel: class counts, train and test bars side by side ---
train_class_dist = pd.Series(y_train).value_counts().sort_index()
test_class_dist = pd.Series(y_test).value_counts().sort_index()
class_names = ['Bad (0)', 'Good (1)']

x = np.arange(len(class_names))
width = 0.35

# Train bars sit half a bar-width left of each tick, test bars half a width right.
grouped_bars = zip(
    (x - width/2, x + width/2),
    (train_class_dist, test_class_dist),
    split_labels,
    colors_split,
)
for offsets, class_dist, split_name, bar_color in grouped_bars:
    class_ax.bar(offsets, class_dist.values, width, label=split_name, color=bar_color, edgecolor='black')

class_ax.set_title('Class Distribution in Train/Test Sets', fontsize=12, fontweight='bold')
class_ax.set_ylabel('Number of Samples', fontsize=11)
class_ax.set_xticks(x)
class_ax.set_xticklabels(class_names)
class_ax.legend()
class_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


In [None]:
# Boxplots of every feature before (left) and after (right) standardization.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

num_features = X_train.shape[1]
# Compact tick labels derived from the real column names (feature_7 -> F7).
# Constant columns may have been dropped earlier, so a positional F{i+1}
# label could point at the wrong feature.
feature_labels = [str(col).replace('feature_', 'F') for col in X_train.columns]

panels = [
    ('Original', [X_train.iloc[:, i] for i in range(num_features)]),
    ('Scaled', [X_train_scaled[:, i] for i in range(num_features)]),
]

for ax, (stage, feature_columns) in zip(axes, panels):
    ax.boxplot(feature_columns)
    # Tick labels are set explicitly instead of through boxplot(labels=...),
    # which Matplotlib 3.9 deprecated in favour of tick_labels; this form
    # works on both older and newer releases. Boxes sit at positions 1..N.
    ax.set_xticks(np.arange(1, num_features + 1))
    ax.set_xticklabels(feature_labels)
    ax.set_title(f'{stage} Features Boxplot (All {num_features} Features)', fontsize=12, fontweight='bold')
    ax.set_ylabel('Value')
    ax.set_xlabel('Features')
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Overlay the original and scaled histogram of every remaining feature.
num_features = X_train.shape[1]
# Subplot grid: one panel per feature, rows computed by ceiling division.
num_cols = 4
num_rows = (num_features + num_cols - 1) // num_cols

fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, 4*num_rows))
axes = axes.ravel()

for i, column in enumerate(X_train.columns):
    # Original values on the primary y-axis.
    axes[i].hist(X_train.iloc[:, i], bins=30, alpha=0.6, label='Original', color='steelblue', edgecolor='black')
    # Scaled values on a secondary y-axis sharing the same x-axis.
    ax2 = axes[i].twinx()
    ax2.hist(X_train_scaled[:, i], bins=30, alpha=0.6, label='Scaled', color='orange', edgecolor='black')

    # Title with the actual column name: constant columns may have been
    # dropped earlier, so a positional "Feature {i+1}" title could name the
    # wrong feature.
    axes[i].set_title(str(column), fontsize=10, fontweight='bold')
    axes[i].set_xlabel('Value', fontsize=9)
    axes[i].set_ylabel('Frequency (Original)', color='steelblue', fontsize=9)
    ax2.set_ylabel('Frequency (Scaled)', color='orange', fontsize=9)
    axes[i].tick_params(axis='y', labelcolor='steelblue')
    ax2.tick_params(axis='y', labelcolor='orange')
    axes[i].grid(True, alpha=0.3)

# Hide unused subplots
for i in range(num_features, len(axes)):
    axes[i].axis('off')

plt.suptitle(f'All {num_features} Features: Original vs Scaled Distributions', fontsize=14, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()


## Effect of Cross-Validation on the Decision Tree

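Compare four Decision Tree training strategies — plain fitting, 10-fold cross-validation, cost-complexity (CCP) pruning tuned on a hold-out validation split, and CCP pruning tuned with 10-fold cross-validation — and evaluate each of them on the test set.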

In [None]:
# Helper to evaluate a model on the test set
# TPR = Recall, FPR = FP / (FP + TN)
def evaluate_on_test(model, X_test, y_test):
    """Score a fitted binary classifier on held-out data.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: feature matrix passed to ``model.predict``.
        y_test: true labels, encoded as 0 (negative) and 1 (positive).

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall (TPR)', 'F1' and
        'FPR'. TPR = TP / (TP + FN) and FPR = FP / (FP + TN); each falls back
        to 0.0 when its denominator is zero.
    """
    y_pred = model.predict(X_test)
    # labels=[0, 1] pins the matrix to 2x2 even when only one class shows up
    # in y_test / y_pred; without it the matrix shrinks to 1x1 and the
    # four-way unpack below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    tpr = tp / (tp + fn) if (tp + fn) else 0.0
    fpr = fp / (fp + tn) if (fp + tn) else 0.0
    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall (TPR)': tpr,
        'F1': f1_score(y_test, y_pred),
        'FPR': fpr
    }

print("="*80)
print("DECISION TREE: 4 TRAINING STRATEGIES")
print("="*80)

# 1) Normal training (no CV, no pruning)
base_tree = DecisionTreeClassifier(random_state=42)
base_tree.fit(X_train, y_train)

# 2) Normal training with 10-fold CV (no pruning)
# The CV run only produces an accuracy estimate on the training data; the
# tree that is evaluated afterwards is refit on the full training set.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(DecisionTreeClassifier(random_state=42), X_train, y_train, cv=cv, scoring='accuracy')
cv_tree = DecisionTreeClassifier(random_state=42)
cv_tree.fit(X_train, y_train)

# 3) Normal training with CCP post-pruning (alpha chosen by simple validation on train)
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)
path = DecisionTreeClassifier(random_state=42).cost_complexity_pruning_path(X_train_sub, y_train_sub)
# Floating-point round-off can leave the smallest alphas slightly below zero,
# and DecisionTreeClassifier rejects a negative ccp_alpha; clamp them to 0.
ccp_alphas = np.maximum(path.ccp_alphas, 0.0)

# Validation accuracy of a tree pruned with each candidate alpha.
val_scores = []
for alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=42, ccp_alpha=alpha)
    clf.fit(X_train_sub, y_train_sub)
    val_scores.append(accuracy_score(y_val, clf.predict(X_val)))

# np.argmax returns the first maximum, i.e. the earliest alpha among ties.
best_alpha_val = ccp_alphas[int(np.argmax(val_scores))]
ccp_tree = DecisionTreeClassifier(random_state=42, ccp_alpha=best_alpha_val)
ccp_tree.fit(X_train, y_train)

# 4) 10-fold CV with CCP (alpha chosen by CV on train)
# Same candidate alphas as strategy 3, scored by mean 10-fold CV accuracy.
cv_alpha_scores = []
for alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=42, ccp_alpha=alpha)
    scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')
    cv_alpha_scores.append(scores.mean())

best_alpha_cv = ccp_alphas[int(np.argmax(cv_alpha_scores))]
ccp_cv_tree = DecisionTreeClassifier(random_state=42, ccp_alpha=best_alpha_cv)
ccp_cv_tree.fit(X_train, y_train)

# Compare all models on test set
# One row per strategy: (display name, CV accuracy on train, chosen alpha, fitted tree).
strategies = [
    ('Normal Training', None, None, base_tree),
    ('10-Fold CV Training', cv_scores.mean(), None, cv_tree),
    ('Normal + CCP Pruning', None, best_alpha_val, ccp_tree),
    ('10-Fold CV + CCP', max(cv_alpha_scores), best_alpha_cv, ccp_cv_tree),
]

results = [
    {
        'Model': model_name,
        'CV Accuracy (train)': cv_accuracy,
        'CCP Alpha': chosen_alpha,
        **evaluate_on_test(fitted_tree, X_test, y_test)
    }
    for model_name, cv_accuracy, chosen_alpha, fitted_tree in strategies
]

results_df = pd.DataFrame(results)

# Nice formatting
results_df['CV Accuracy (train)'] = results_df['CV Accuracy (train)'].round(4)
results_df['CCP Alpha'] = results_df['CCP Alpha'].apply(lambda x: None if x is None else round(float(x), 6))
for col in ['Accuracy', 'Precision', 'Recall (TPR)', 'F1', 'FPR']:
    results_df[col] = results_df[col].round(4)

print(results_df.to_string(index=False))

# Highlight best model by test accuracy
# NOTE(review): the winner is picked on the test set, so its reported test
# accuracy is an optimistic estimate.
best_model = results_df.loc[results_df['Accuracy'].idxmax()]
print("\nBest model by test accuracy:")
print(best_model.to_string())
