In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# COMPAS Data Exploration Notebook\n",
    "## AI Ethics Assignment - Exploratory Data Analysis\n",
    "\n",
    "This notebook explores the COMPAS recidivism dataset to understand its structure, distributions, and characteristics before bias analysis."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from pathlib import Path\n",
    "\n",
    "# Plot defaults applied to the figures drawn below\n",
    "sns.set_style('whitegrid')\n",
    "plt.rcParams.update({'figure.figsize': (12, 6)})\n",
    "\n",
    "print('✓ Libraries imported')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the COMPAS CSV; the path is relative to the working directory\n",
    "csv_path = Path('data/compas_raw/compas.csv')\n",
    "df = pd.read_csv(csv_path)\n",
    "\n",
    "# Report the size of the loaded frame and where it came from\n",
    "n_rows, n_cols = df.shape\n",
    "print(f'Dataset shape: {n_rows:,} rows × {n_cols} columns')\n",
    "print(f'File: {csv_path}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first five rows of the frame\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List every column together with the dtype pandas assigned to it\n",
    "column_types = df.dtypes\n",
    "print('Column Names and Types:')\n",
    "print(column_types)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count null entries per column, then add them up for a grand total\n",
    "missing_per_column = df.isna().sum()\n",
    "print('Missing Values:')\n",
    "print(missing_per_column)\n",
    "print(f'\\nTotal missing: {missing_per_column.sum()}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demographic Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Race distribution: absolute counts and share of all rows\n",
    "# (race_counts is reused by the bar chart in the next cell)\n",
    "race_counts = df['race'].value_counts()\n",
    "race_share = race_counts / len(df) * 100\n",
    "\n",
    "print('Race Distribution:')\n",
    "print(race_counts)\n",
    "print('\\nPercentages:')\n",
    "print(race_share)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bar chart of the race counts computed in the previous cell\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "race_counts.plot(kind='bar', ax=ax, color='steelblue', alpha=0.7, edgecolor='black')\n",
    "ax.set_title('COMPAS Dataset: Race Distribution', fontsize=14, fontweight='bold')\n",
    "ax.set_ylabel('Count')\n",
    "ax.set_xlabel('Race')\n",
    "# Slant the category labels so long names do not overlap\n",
    "plt.xticks(rotation=45, ha='right')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of rows for each value of the sex column\n",
    "sex_counts = df['sex'].value_counts()\n",
    "print('Sex Distribution:')\n",
    "print(sex_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics of the age column\n",
    "age_summary = df['age'].describe()\n",
    "print('Age Statistics:')\n",
    "print(age_summary)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Recidivism Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The outcome label may be stored under either of two column names;\n",
    "# take the first one present (recid_col stays None when neither exists)\n",
    "candidate_cols = ('two_year_recidivism', 'two_year_recid')\n",
    "recid_col = next((name for name in candidate_cols if name in df.columns), None)\n",
    "\n",
    "if recid_col is None:\n",
    "    print('No recidivism column found!')\n",
    "else:\n",
    "    outcome = df[recid_col]\n",
    "    print(f'Using column: {recid_col}')\n",
    "    print('\\nRecidivism Distribution:')\n",
    "    print(outcome.value_counts())\n",
    "    print(f'\\nRecidivism Rate: {outcome.mean():.1%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-race sum, row count and mean of the outcome column\n",
    "if recid_col:\n",
    "    column_labels = {'sum': 'Reoffended', 'count': 'Total', 'mean': 'Rate'}\n",
    "    recid_by_race = (\n",
    "        df.groupby('race')[recid_col]\n",
    "        .agg(['sum', 'count', 'mean'])\n",
    "        .rename(columns=column_labels)\n",
    "    )\n",
    "    print('Recidivism Rate by Race:')\n",
    "    print(recid_by_race)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recidivism by race: outcome counts (left panel) and mean outcome (right panel)\n",
    "if recid_col:\n",
    "    fig, (count_ax, rate_ax) = plt.subplots(1, 2, figsize=(14, 5))\n",
    "\n",
    "    # Left: rows per race, one bar per value of the outcome column\n",
    "    outcome_counts = df.groupby(['race', recid_col]).size().unstack()\n",
    "    outcome_counts.plot(\n",
    "        kind='bar', ax=count_ax, color=['green', 'red'], alpha=0.7, edgecolor='black'\n",
    "    )\n",
    "    count_ax.set_title('Recidivism Count by Race', fontweight='bold')\n",
    "    count_ax.set_ylabel('Count')\n",
    "    count_ax.set_xlabel('Race')\n",
    "    count_ax.legend(['No Recidivism', 'Recidivism'], title='Outcome')\n",
    "    count_ax.tick_params(axis='x', rotation=45)\n",
    "\n",
    "    # Right: mean of the outcome column within each race\n",
    "    recid_rate_by_race = df.groupby('race')[recid_col].mean()\n",
    "    recid_rate_by_race.plot(kind='bar', ax=rate_ax, color='coral', alpha=0.7, edgecolor='black')\n",
    "    rate_ax.set_title('Recidivism Rate by Race', fontweight='bold')\n",
    "    rate_ax.set_ylabel('Recidivism Rate')\n",
    "    rate_ax.set_xlabel('Race')\n",
    "    rate_ax.tick_params(axis='x', rotation=45)\n",
    "\n",
    "    fig.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## COMPAS Risk Scores Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics and per-value frequencies of the decile score\n",
    "decile_scores = df['decile_score']\n",
    "\n",
    "print('Decile Score Statistics:')\n",
    "print(decile_scores.describe())\n",
    "print('\\nValue counts:')\n",
    "# sort_index() lists the scores in ascending order rather than by frequency\n",
    "print(decile_scores.value_counts().sort_index())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize score distribution: decile histogram (left) and risk categories (right)\n",
    "fig, (hist_ax, cat_ax) = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "df['decile_score'].hist(bins=10, ax=hist_ax, color='steelblue', alpha=0.7, edgecolor='black')\n",
    "hist_ax.set_title('Decile Score Distribution', fontweight='bold')\n",
    "hist_ax.set_xlabel('Decile Score')\n",
    "hist_ax.set_ylabel('Frequency')\n",
    "\n",
    "# value_counts() orders categories by frequency, so assigning colours by\n",
    "# position would tie green/orange/red to whichever labels happen to be most\n",
    "# common. Pin the bar order and look each colour up by label instead.\n",
    "# NOTE(review): assumes score_text uses the labels Low / Medium / High - confirm.\n",
    "# Any other label is kept, placed after the known ones and drawn in gray.\n",
    "risk_colors = {'Low': 'green', 'Medium': 'orange', 'High': 'red'}\n",
    "score_text_counts = df['score_text'].value_counts()\n",
    "known_labels = [label for label in risk_colors if label in score_text_counts.index]\n",
    "other_labels = [label for label in score_text_counts.index if label not in risk_colors]\n",
    "score_text_counts = score_text_counts.loc[known_labels + other_labels]\n",
    "bar_colors = [risk_colors.get(label, 'gray') for label in score_text_counts.index]\n",
    "\n",
    "score_text_counts.plot(kind='bar', ax=cat_ax, color=bar_colors, alpha=0.7, edgecolor='black')\n",
    "cat_ax.set_title('Risk Category Distribution', fontweight='bold')\n",
    "cat_ax.set_xlabel('Risk Category')\n",
    "cat_ax.set_ylabel('Count')\n",
    "cat_ax.tick_params(axis='x', rotation=0)\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean, standard deviation, minimum and maximum decile score within each race\n",
    "score_stats_by_race = df.groupby('race')['decile_score'].agg(['mean', 'std', 'min', 'max'])\n",
    "print('Average Decile Score by Race:')\n",
    "print(score_stats_by_race)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boxplot: score by race\n",
    "# Draw on an explicit Axes. Calling plt.figure() first and then\n",
    "# df.boxplot(by=...) without ax= makes pandas open a second figure,\n",
    "# which leaves the first one behind as an empty plot in the output.\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "df.boxplot(column='decile_score', by='race', ax=ax)\n",
    "ax.set_title('Decile Score Distribution by Race', fontweight='bold')\n",
    "fig.suptitle('')  # Remove the 'Boxplot grouped by ...' title pandas adds\n",
    "ax.set_xlabel('Race')\n",
    "ax.set_ylabel('Decile Score')\n",
    "plt.xticks(rotation=45, ha='right')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Key Insights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text summary of the figures computed above.\n",
    "# Relies on df and recid_col from the earlier cells.\n",
    "print('='*70)\n",
    "print('KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS')\n",
    "print('='*70)\n",
    "\n",
    "print('\\n1. DATASET OVERVIEW')\n",
    "print(f'   Total records: {len(df):,}')\n",
    "print(f'   Total features: {len(df.columns)}')\n",
    "\n",
    "print('\\n2. DEMOGRAPHIC DISTRIBUTION')\n",
    "# Share of all rows per race, most frequent first\n",
    "race_pct = df['race'].value_counts() / len(df) * 100\n",
    "for race, pct in race_pct.items():\n",
    "    print(f'   {race}: {pct:.1f}%')\n",
    "\n",
    "if recid_col:\n",
    "    print('\\n3. RECIDIVISM OVERVIEW')\n",
    "    print(f'   Overall recidivism rate: {df[recid_col].mean():.1%}')\n",
    "    \n",
    "    print('\\n4. RECIDIVISM BY RACE')\n",
    "    # One grouped pass instead of re-filtering the whole frame per race.\n",
    "    # sort=False keeps the races in order of first appearance, and groupby\n",
    "    # leaves out rows with a missing race, which the equality filter could\n",
    "    # never match and therefore reported as 'nan: nan%'.\n",
    "    rate_by_race = df.groupby('race', sort=False)[recid_col].mean()\n",
    "    for race, rate in rate_by_race.items():\n",
    "        print(f'   {race}: {rate:.1%}')\n",
    "\n",
    "print('\\n5. COMPAS SCORES')\n",
    "print(f'   Average decile score: {df[\"decile_score\"].mean():.2f}')\n",
    "print(f'   Score range: {df[\"decile_score\"].min()} to {df[\"decile_score\"].max()}')\n",
    "\n",
    "print('\\n' + '='*70)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}