# Downloading and Analyzing the Dataset with Visualizations

## Imports

In [None]:
import logging
import os
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Configure the root logger to emit INFO and above, then create the
# logger used by the rest of this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
# Log the installed version of each distribution listed below.
# NOTE: "importlib-metadata" is the PyPI backport distribution; the stdlib
# `importlib.metadata` module used for the lookup is not a distribution of
# its own, so that entry only resolves when the backport is installed.
packages = ["pandas", "seaborn", "matplotlib", "importlib-metadata"]
for package in packages:
    try:
        # Keep the try body to the single call that is expected to fail.
        package_version = version(package)
    except PackageNotFoundError as e:
        # Best effort: a distribution that is not installed is reported and
        # skipped; any other (unexpected) error is allowed to surface.
        logger.warning("Could not get version for package %s: %s", package, e)
    else:
        logger.info("%s version: %s", package, package_version)

INFO:__main__:pandas version: 2.3.1
INFO:__main__:seaborn version: 0.13.2
INFO:__main__:matplotlib version: 3.10.5
INFO:__main__:importlib-metadata version: 8.7.0


## Download the data from Hugging Face into a DataFrame
 - Download data
 - Save the raw data file to a local directory

In [None]:
# Where the raw copy of the dataset is stored locally:
#   <DATA_ROOT>/<RAW_DATA_DIR_NAME>/<DATA_RAW_FILE_NAME>
DATA_ROOT = Path("../Data")
RAW_DATA_DIR_NAME = "Data"
DATA_RAW_FILE_NAME = "ENTER DATA FILE NAME (e.g., data.csv)"

# Full path of the raw data file, built from the three parts above.
DATA_PATH = DATA_ROOT.joinpath(RAW_DATA_DIR_NAME, DATA_RAW_FILE_NAME)

In [None]:
# Location of the source CSV; replace the placeholder before running the
# cell that reads it with pd.read_csv.
DATASET_URL = "ENTER DATASET URL (e.g., https://example.com/data.csv)"

In [45]:
# Read the CSV located at DATASET_URL into a DataFrame.
df = pd.read_csv(DATASET_URL)

In [None]:
# Ensure the raw-data directory (and any missing parents) exists; do nothing
# if it is already there.
(DATA_ROOT / RAW_DATA_DIR_NAME).mkdir(parents=True, exist_ok=True)

In [None]:
df.to_csv(DATA_PATH, index=False)   # Write the raw data to DATA_PATH; index=False leaves the row index out of the file

## Data Exploration

In [None]:
# Preview the first rows of the DataFrame (head() returns 5 rows by default).
df.head()

In [None]:
df.size # total number of cells (rows * columns), not the number of rows

In [None]:
# Dimensions of the DataFrame as (number of rows, number of columns).
df.shape

In [None]:
# Number of missing (null/NaN) values in each column.
df.isnull().sum()

## Interpreting the Columns:

 - feature 1: Description of feature 1.  (DATA TYPE)
 - feature 2: Description of feature 2. (DATA TYPE)

## Visualizing Data

### Countplots

In [None]:
# Bar chart of how many rows fall on each distinct value of the 'age' column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x=df['age'], ax=ax)  # equivalent to sns.countplot(x='age', data=df, ax=ax)
ax.set_title('Age Count Plot')
plt.show()

In [None]:
# Row counts per 'age' value, with bars split by the 'smoker' column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x=df['age'], hue=df['smoker'], ax=ax)
ax.set_title('Age Count Plot with Smoker Hue')
plt.show()

In [None]:
# Row counts per 'region' value, with bars split by the 'smoker' column.
plt.figure(figsize=(8, 4))
sns.countplot(data=df, x='region', hue='smoker')
# The title names the actual hue variable (smoker), not the x variable.
plt.title('Region Count Plot with Smoker Hue')
plt.show()

In [None]:
# Row counts per 'children' value, with bars split by the 'smoker' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='children', hue='smoker', ax=ax)
ax.set_title('Children Count Plot with Smoker Hue')
plt.show()

### Scatterplots

In [None]:
# 'charges' plotted against 'age', with points colored by the 'sex' column.
sns.scatterplot(data=df, x='age', y='charges', hue='sex')

In [None]:
# 'charges' plotted against 'bmi', with points colored by the 'smoker' column.
sns.scatterplot(data=df, x='bmi', y='charges', hue='smoker')

In [None]:
# 'bmi' plotted against 'age', with points colored by the 'sex' column.
sns.scatterplot(data=df, x='age', y='bmi', hue='sex')

In [None]:
# 'children' plotted against 'age', with points colored by the 'smoker' column.
sns.scatterplot(data=df, x='age', y='children', hue='smoker')

### Distribution Plots

In [None]:
# Distribution of the 'bmi' column; kde=True overlays a kernel density
# estimate curve on the histogram that displot draws by default.
sns.displot(df['bmi'], kde=True)

In [None]:
# Histogram of the 'age' column using 30 bins, with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='age', bins=30, kde=True, ax=ax)
ax.set_title('Age Distribution Plot')
plt.show()

### Jointplots

In [None]:
# Joint view of 'age' vs 'charges' with marginal distributions on each axis,
# grouped and colored by the 'smoker' column.
sns.jointplot(data=df, x='age', y='charges', hue='smoker')

### Describe

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); for a DataFrame
# with mixed dtypes, describe() covers only the numeric columns by default.
df.describe()

### End