# Data Mining Project

This notebook loads and explores the Ionosphere dataset and prepares it for classification models. It can be run on Google Colab or locally.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/FarnoodTavasoli/datamining_project/blob/main/data_mining_project.ipynb)

## Setup for Google Colab

This section detects whether the notebook is running on Google Colab (mounting Google Drive if so) and imports the required libraries.

In [None]:
# Detect whether the notebook is running on Google Colab.
# The `google.colab` package is only importable inside a Colab runtime, so a
# failed import is used as the signal for "running locally". Only ImportError
# is caught: a bare `except:` would also swallow KeyboardInterrupt, SystemExit
# and unrelated errors and silently report a local run.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

if IN_COLAB:
    # Mount Google Drive so files stored there are reachable under /content/drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully!")
else:
    print("Running locally")

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: seed NumPy's global random number generator.
np.random.seed(42)

# Plot defaults: seaborn "whitegrid" style and a 12x6 default figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## Data Loading and Exploration

Loading the Ionosphere dataset and performing initial exploration.

In [None]:
# Locate the Ionosphere data file for the current environment.
# On Colab the file is read from the mounted Google Drive (edit the first path
# to match your own Drive folder); otherwise a relative local path is used.
data_path = (
    '/content/drive/MyDrive/datamining_project/ionosphere.data'
    if IN_COLAB
    else 'files/ionosphere_5/ionosphere.data'
)

# The file is read without a header row, so columns are named here:
# feature_1 ... feature_34 followed by the target column 'class'.
column_names = [*(f'feature_{idx}' for idx in range(1, 35)), 'class']

# Read the raw file into a DataFrame
df = pd.read_csv(data_path, names=column_names, header=None)

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head()

In [None]:
# Text overview of the dataset: size, dtypes, missing values, class balance.
banner = "=" * 60
print(banner, "DATASET INFORMATION", banner, sep="\n")

n_instances, n_columns = df.shape
print(f"\nNumber of instances: {n_instances}")
# Every column except the target counts as a feature
print(f"Number of features: {n_columns - 1}")

print("\nData types:", df.dtypes.value_counts(), sep="\n")

# Total number of missing cells across the whole frame
print("\nMissing values:", df.isna().sum().sum(), sep="\n")

class_column = df['class']
print("\nClass distribution:", class_column.value_counts(), sep="\n")
print("\nClass proportions:", class_column.value_counts(normalize=True), sep="\n")

In [None]:
# Descriptive statistics, transposed so each feature is a row.
print("Statistical Summary of Features:")
df.describe().transpose()

In [None]:
# Class balance shown two ways: absolute counts (bar) and shares (pie).
class_counts = df['class'].value_counts()
palette = ['#2ecc71', '#e74c3c']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Left panel: count per class, with horizontal tick labels
class_counts.plot.bar(ax=bar_ax, color=palette, rot=0)
bar_ax.set_title('Class Distribution', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Class', fontsize=12)
bar_ax.set_ylabel('Count', fontsize=12)

# Right panel: share per class; the automatic y-label is cleared
class_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%', colors=palette, startangle=90)
pie_ax.set_title('Class Proportion', fontsize=14, fontweight='bold')
pie_ax.set_ylabel('')

fig.tight_layout()
plt.show()

In [None]:
# One histogram per feature, laid out on a grid of 6 columns.
num_features = 34
n_cols = 6
n_rows = (num_features + n_cols - 1) // n_cols  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 3))
axes = axes.ravel()  # flatten the 2-D grid so panels can be walked in order

feature_names = [f'feature_{k}' for k in range(1, num_features + 1)]
for ax, feature_name in zip(axes, feature_names):
    ax.hist(df[feature_name], bins=30, alpha=0.7, color='steelblue', edgecolor='black')
    ax.set_title(feature_name, fontsize=9)
    ax.set_xlabel('Value', fontsize=8)
    ax.set_ylabel('Frequency', fontsize=8)

# The grid has more panels than features; blank out the leftovers
for spare_ax in axes[num_features:]:
    spare_ax.axis('off')

fig.suptitle('Distribution of All 34 Features', fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the first 34 columns (the feature columns),
# drawn as a heatmap centred on zero.
correlation_matrix = df.iloc[:, 0:34].corr()

fig, ax = plt.subplots(figsize=(18, 14))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=False,
    cmap='coolwarm',
    center=0,
    linewidths=0.3,
    cbar_kws={'label': 'Correlation'},
)
ax.set_title('Correlation Heatmap (All 34 Features)', fontsize=16, fontweight='bold', pad=20)
fig.tight_layout()
plt.show()

## Data Preprocessing

Preparing the data for machine learning models.

In [None]:
# Split the frame into the label column and the feature matrix.
y = df['class']
X = df.drop(columns='class')

# Turn the string class labels (g=good, b=bad) into integer codes;
# the fitted encoder is kept so the mapping can be inspected or reused.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y_encoded.shape}")
print("\nClass encoding:")
# A label's position in classes_ is the integer code assigned to it
for code, class_name in enumerate(label_encoder.classes_):
    print(f"  {class_name} -> {code}")

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# encoded labels and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

print(f"Training set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")

# Per-class counts in each partition
print("\nTraining set class distribution:", pd.Series(y_train).value_counts(), sep="\n")
print("\nTest set class distribution:", pd.Series(y_test).value_counts(), sep="\n")

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics are used.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")
print(f"\nScaled training data shape: {X_train_scaled.shape}")
print(f"Scaled test data shape: {X_test_scaled.shape}")
print("\nSample scaled features (first 5):")
# Top-left 5x5 corner of the scaled training matrix
print(X_train_scaled[:5, :5])