# 🧪 Exploratory Data Analysis: Gastric Cancer Dataset

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np
import os

# Read the gastric cancer CSV into a DataFrame.
# NOTE: the path is relative, so it resolves against the current working
# directory of the kernel.
DATA_PATH = '../data/gastric_big.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Show the first five rows as the cell output for a quick sanity check.
df.head(n=5)


## 📊 Basic Info & Statistics


In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts,
# dtypes and memory usage. Useful for spotting missing values at a glance.
df.info()


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
# With default arguments pandas summarises only the numeric columns when the
# frame contains any.
df.describe()


## 📌 Class Distribution


In [None]:
# Bar chart of how many rows fall into each Diagnosis class.
#
# `hue` is assigned explicitly: seaborn 0.13 deprecated passing `palette`
# without `hue` and will stop supporting it in a later release. Using the same
# column for `x` and `hue` with `dodge=False` keeps one full-width bar per
# class, coloured from the palette exactly as before.
ax = sns.countplot(
    data=df, x="Diagnosis", hue="Diagnosis", palette="Set2", dodge=False
)

# The hue legend would only repeat the x-axis tick labels; drop it if seaborn
# created one (whether it does depends on the seaborn version).
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()

plt.title("Class Distribution")
plt.ylabel("Count")
plt.xlabel("Diagnosis")
plt.show()


## 🧬 Feature Distributions


In [None]:
# All columns except the target. The target is excluded by name rather than by
# position (`df.columns[:-1]`), so the selection stays correct even if
# "Diagnosis" is not the last column of the CSV.
features = df.columns.drop("Diagnosis")

# One histogram per feature on a shared figure. DataFrame.hist only draws
# numeric columns, so any non-numeric feature is skipped here.
df[features].hist(figsize=(16, 10), bins=30, edgecolor='black')
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()


## 🔍 Correlation Heatmap


In [None]:
# Pairwise correlation between the numeric columns, drawn as an annotated
# heatmap (values shown with two decimals, diverging colour map).
corr_matrix = df.corr(numeric_only=True)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Heatmap")
plt.show()


## 📦 Boxplot per Feature vs. Diagnosis


In [None]:
# One boxplot per feature, split by Diagnosis class, each on its own figure.
# `features` is the collection of feature column names defined in an earlier
# cell.
for feature in features:
    plt.figure(figsize=(6, 4))
    # `hue` is assigned explicitly: seaborn 0.13 deprecated passing `palette`
    # without `hue`. Using the same column for `x` and `hue` with
    # `dodge=False` keeps a single full-width box per class, coloured from
    # the palette as before.
    ax = sns.boxplot(
        data=df,
        x="Diagnosis",
        y=feature,
        hue="Diagnosis",
        palette="Set3",
        dodge=False,
    )
    # The hue legend would only duplicate the x-axis labels; remove it if
    # seaborn created one (version dependent).
    hue_legend = ax.get_legend()
    if hue_legend is not None:
        hue_legend.remove()
    plt.title(f"{feature} vs. Diagnosis")
    plt.show()


## ✨ Interactive Plot: Age vs. Glucose


In [None]:
# Interactive scatter of Age against Glucose, coloured by Diagnosis.
#
# Only a random subset is plotted to keep the figure responsive. The sample
# size is capped at the number of rows because DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement).
# A fixed random_state makes the plotted subset reproducible across runs.
sample_size = min(5000, len(df))
fig = px.scatter(
    df.sample(n=sample_size, random_state=42), x="Age", y="Glucose", color="Diagnosis",
    title="Age vs Glucose by Diagnosis",
    opacity=0.6
)
fig.show()
