# Adult Dataset - Exploratory Data Analysis and Classification

This notebook downloads the Adult dataset from the UCI Machine Learning Repository, performs exploratory data analysis, and builds classification models to predict whether income exceeds $50K/year.

## 1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import warnings
# NOTE(review): this silences every warning category for the whole session, so
# warnings raised by later cells will not be displayed.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn's 'whitegrid' style and a default figure size that
# applies to any figure created without its own figsize argument
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Fetch dataset from UCI repository (id=2 selects the Adult dataset; requires network access)
print("Downloading Adult dataset from UCI repository...")
adult = fetch_ucirepo(id=2)

# Get features and target
X = adult.data.features
y = adult.data.targets

# Combine into single dataframe for EDA (pd.concat returns a new frame, so X and y stay untouched)
df = pd.concat([X, y], axis=1)

# Normalize the target labels. The repository copy is reported to mix
# '<=50K' / '>50K' with '<=50K.' / '>50K.' (trailing period). Left as-is that
# produces four target classes, which distorts the class-balance plots and
# breaks the binary ROC-AUC computations in the modelling section.
# Stripping is a no-op when the labels are already clean.
df['income'] = df['income'].str.strip().str.rstrip('.')

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")

## 2. Initial Data Exploration

In [None]:
# Peek at the first records of the combined frame
print("First 5 rows of the dataset:")
df.head(n=5)

In [None]:
# Dataset information: df.info() prints its summary (columns, non-null counts,
# dtypes) itself, so no print() wrapper or trailing expression is needed here
print("Dataset Info:")
df.info()

In [None]:
# Statistical summary; describe() is the cell's last expression so the
# resulting table is rendered as the cell output
print("Statistical Summary of Numerical Features:")
df.describe()

In [None]:
# Check for missing values.
# Unknown entries in the Adult data can show up as the literal placeholder '?'
# as well as real NaNs, so isnull() on the raw frame under-counts them.
# Map '?' to NaN on a temporary copy (df itself is not modified) before counting.
print("Missing Values:")
missing = df.replace('?', np.nan).isnull().sum()
# Show only the columns that actually have missing entries
missing[missing > 0]

In [None]:
# Count rows that exactly repeat an earlier row
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

In [None]:
# Class balance of the target: raw counts first, then shares in percent
income_counts = df['income'].value_counts()
income_percent = df['income'].value_counts(normalize=True).mul(100)

print("Target Variable Distribution:")
print(income_counts)
print("\nPercentage distribution:")
print(income_percent)

## 3. Exploratory Data Analysis (EDA)

### 3.1 Target Variable Visualization

In [None]:
# Bar chart with one bar per income class
fig, ax = plt.subplots(figsize=(8, 6))
df['income'].value_counts().plot(kind='bar', color=['skyblue', 'salmon'], ax=ax)
ax.set_title('Distribution of Income', fontsize=14, fontweight='bold')
ax.set_xlabel('Income', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Keep the class labels horizontal
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

### 3.2 Numerical Features Analysis

In [None]:
# Collect the int64/float64 column names, leaving out the target should it be numeric
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
numerical_cols = [name for name in numeric_frame.columns if name != 'income']

print(f"Numerical columns: {numerical_cols}")

In [None]:
# One histogram per numerical feature on a 3x2 grid (at most the first six columns)
fig, grid = plt.subplots(3, 2, figsize=(15, 12))
axes = grid.ravel()

for ax, col in zip(axes, numerical_cols[:6]):
    ax.hist(df[col], bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set(xlabel=col, ylabel='Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Compare each numerical feature across income classes with grouped box plots (2x3 grid)
fig, grid = plt.subplots(2, 3, figsize=(15, 10))
axes = grid.ravel()

for ax, col in zip(axes, numerical_cols[:6]):
    df.boxplot(column=col, by='income', ax=ax)
    ax.set_title(f'{col} by Income')
    ax.set(xlabel='Income', ylabel=col)

# Blank out the figure-level title before laying out the panels
plt.suptitle('')
plt.tight_layout()
plt.show()

### 3.3 Categorical Features Analysis

In [None]:
# Collect the object-dtype column names, leaving out the target
object_frame = df.select_dtypes(include=['object'])
categorical_cols = [name for name in object_frame.columns if name != 'income']

print(f"Categorical columns: {categorical_cols}")

In [None]:
# Frequency bar chart per categorical feature, laid out three panels per row
n_cols = 3
n_rows = -(-len(categorical_cols) // n_cols)  # ceiling division
fig, grid = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4))
axes = grid.ravel()

for ax, col in zip(axes, categorical_cols):
    df[col].value_counts().plot(kind='bar', ax=ax, color='steelblue')
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set(xlabel=col, ylabel='Count')
    ax.tick_params(axis='x', rotation=45)

# Panels beyond the last feature stay blank, so hide them
for spare_ax in axes[len(categorical_cols):]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Stacked bars: share of each income class within every level of four selected features
fig, grid = plt.subplots(2, 2, figsize=(15, 10))
axes = grid.ravel()

important_cats = ['workclass', 'education', 'marital-status', 'occupation']
for ax, col in zip(axes, important_cats):
    if col not in df.columns:
        # Leave the panel empty when the feature is absent
        continue
    # normalize='index' makes every row (feature level) sum to 1
    income_share = pd.crosstab(df[col], df['income'], normalize='index')
    income_share.plot(kind='bar', ax=ax, stacked=True)
    ax.set_title(f'Income Distribution by {col}', fontsize=12, fontweight='bold')
    ax.set(xlabel=col, ylabel='Proportion')
    ax.legend(title='Income')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

### 3.4 Correlation Analysis

In [None]:
# Pairwise correlations between the numerical features, drawn as an annotated heatmap
correlation_matrix = df[numerical_cols].corr()

plt.figure(figsize=(10, 8))
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Numerical Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 4. Data Preprocessing

In [None]:
# Build a cleaned frame for preprocessing; df itself is left untouched for the EDA cells.
# Unknown values in the Adult data can appear as the literal string '?' as well
# as NaN. dropna() alone would keep the '?' rows, and the label encoding step
# would then treat '?' as a genuine category. Convert the placeholder to NaN
# first so that both forms of missing data are dropped together.
# replace() and dropna() each return a new frame, so no explicit copy is needed.
df_processed = df.replace('?', np.nan).dropna()

print(f"Shape after handling missing values: {df_processed.shape}")

In [None]:
# Integer-encode every categorical feature, keeping each fitted encoder under its column name
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    df_processed[col] = encoder.fit_transform(df_processed[col])

# The target gets its own encoder so class names can be recovered for reports
le_target = LabelEncoder()
df_processed['income'] = le_target.fit_transform(df_processed['income'])

print("Encoding completed!")
print(f"Target classes: {le_target.classes_}")

In [None]:
# Separate the encoded target column from the feature matrix
target_column = 'income'
y = df_processed[target_column]
X = df_processed.drop(columns=[target_column])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Stratified 80/20 hold-out split with a fixed seed
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

In [None]:
# Standardize features: statistics are learned on the training split only
# and then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed!")

## 5. Model Training and Evaluation

### 5.1 Logistic Regression

In [None]:
# Fit logistic regression on the standardized training features
print("Training Logistic Regression model...")
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

# Hard labels, plus the probability column at index 1 of predict_proba for ROC-AUC
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

# Report hold-out metrics
print("\nLogistic Regression Results:")
for metric_label, metric_fn, metric_input in (
    ("Accuracy", accuracy_score, y_pred_lr),
    ("ROC-AUC Score", roc_auc_score, y_pred_proba_lr),
):
    print(f"{metric_label}: {metric_fn(y_test, metric_input):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr, target_names=le_target.classes_))

### 5.2 Random Forest Classifier

In [None]:
# Fit a 100-tree random forest on the unscaled training features, using all cores
print("Training Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(X_train, y_train)

# Hard labels, plus the probability column at index 1 of predict_proba for ROC-AUC
y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Report hold-out metrics
print("\nRandom Forest Results:")
for metric_label, metric_fn, metric_input in (
    ("Accuracy", accuracy_score, y_pred_rf),
    ("ROC-AUC Score", roc_auc_score, y_pred_proba_rf),
):
    print(f"{metric_label}: {metric_fn(y_test, metric_input):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf, target_names=le_target.classes_))

### 5.3 Gradient Boosting Classifier

In [None]:
# Fit a 100-stage gradient boosting classifier on the unscaled training features
print("Training Gradient Boosting model...")
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard labels, plus the probability column at index 1 of predict_proba for ROC-AUC
y_pred_gb = gb_model.predict(X_test)
y_pred_proba_gb = gb_model.predict_proba(X_test)[:, 1]

# Report hold-out metrics
print("\nGradient Boosting Results:")
for metric_label, metric_fn, metric_input in (
    ("Accuracy", accuracy_score, y_pred_gb),
    ("ROC-AUC Score", roc_auc_score, y_pred_proba_gb),
):
    print(f"{metric_label}: {metric_fn(y_test, metric_input):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gb, target_names=le_target.classes_))

## 6. Model Comparison

In [None]:
# Gather each model's hold-out predictions, then tabulate accuracy and ROC-AUC side by side
model_outputs = {
    'Logistic Regression': (y_pred_lr, y_pred_proba_lr),
    'Random Forest': (y_pred_rf, y_pred_proba_rf),
    'Gradient Boosting': (y_pred_gb, y_pred_proba_gb),
}

models_performance = pd.DataFrame({
    'Model': list(model_outputs),
    'Accuracy': [accuracy_score(y_test, labels) for labels, _ in model_outputs.values()],
    'ROC-AUC': [roc_auc_score(y_test, proba) for _, proba in model_outputs.values()],
})

print("Model Performance Comparison:")
print(models_performance)

In [None]:
# Visualize model comparison: one bar panel per metric.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
bar_colors = ['skyblue', 'lightgreen', 'salmon']

# (metric column, panel title, y-axis label, preferred y-range)
panels = [
    ('Accuracy', 'Model Accuracy Comparison', 'Accuracy', (0.7, 0.9)),
    ('ROC-AUC', 'Model ROC-AUC Comparison', 'ROC-AUC Score', (0.7, 0.95)),
]

for ax, (metric, title, ylabel, (floor, ceiling)) in zip(axes, panels):
    scores = models_performance[metric]
    ax.bar(models_performance['Model'], scores, color=bar_colors)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylabel(ylabel)

    # The zoomed-in range makes small differences visible, but fixed limits
    # would clip a bar above the ceiling and hide one at or below the floor.
    # Keep the preferred range when every score fits strictly inside it;
    # otherwise widen it, staying within [0, 1].
    lower = floor if scores.min() > floor else max(0.0, scores.min() - 0.05)
    upper = ceiling if scores.max() < ceiling else min(1.0, scores.max() + 0.02)
    ax.set_ylim([lower, upper])
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

### 6.1 Confusion Matrices

In [None]:
# Draw one confusion-matrix heatmap per model, side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

models_preds = [
    ('Logistic Regression', y_pred_lr),
    ('Random Forest', y_pred_rf),
    ('Gradient Boosting', y_pred_gb)
]
class_names = le_target.classes_

for ax, (model_name, y_pred) in zip(axes, models_preds):
    cm = confusion_matrix(y_test, y_pred)
    # fmt='d' prints the cell counts as integers
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.set_title(f'{model_name}\nConfusion Matrix', fontsize=12, fontweight='bold')
    ax.set(xlabel='Predicted', ylabel='Actual')

plt.tight_layout()
plt.show()

### 6.2 ROC Curves

In [None]:
# Overlay the ROC curve of every model on a single set of axes
models_proba = [
    ('Logistic Regression', y_pred_proba_lr),
    ('Random Forest', y_pred_proba_rf),
    ('Gradient Boosting', y_pred_proba_gb)
]

fig, ax = plt.subplots(figsize=(10, 8))

for model_name, y_pred_proba in models_proba:
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc = roc_auc_score(y_test, y_pred_proba)
    ax.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.4f})', linewidth=2)

# Diagonal reference line labelled as the random classifier
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier', linewidth=2)
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves Comparison', fontsize=14, fontweight='bold')
ax.legend(loc='lower right')
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

### 6.3 Feature Importance (Random Forest)

In [None]:
# Rank features by the random forest's importance scores, largest first
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# Horizontal bars for (up to) the 15 highest-ranked features
top_features = feature_importance.head(15)
plt.figure(figsize=(10, 8))
plt.barh(top_features['feature'], top_features['importance'], color='steelblue')
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('Top 15 Feature Importance (Random Forest)', fontsize=14, fontweight='bold')
# Flip the axis so the most important feature sits at the top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

## 7. Summary and Conclusions

### Key Findings:

1. **Dataset**: The Adult dataset contains demographic and employment information to predict whether an individual earns more than $50K/year.

2. **Data Characteristics**: The dataset is imbalanced with more individuals earning <=50K than >50K.

3. **Model Performance**: All three models achieved good performance:
   - Logistic Regression provides a good baseline
   - Random Forest and Gradient Boosting typically achieve higher accuracy
   - The best model can be selected based on the specific requirements (accuracy vs interpretability)

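4. **Important Features**: Features like education, age, capital-gain, and occupation are typically among the most important predictors; see the Random Forest feature-importance output in Section 6.3 for the ranking produced by this run.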

5. **Next Steps**: 
   - Hyperparameter tuning for better performance
   - Handle class imbalance with techniques like SMOTE
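   - Try additional ensemble methods (e.g. stacking or voting across the models above); Random Forest and Gradient Boosting are already ensembles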
   - Deploy the best model