In [23]:
# HW03 - Python Fundamentals
import sys, os
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
from pathlib import Path
from src.config import data_dir
import time

In [25]:
# create arrays
x = np.arange(100000)

# elementwise operations
# NOTE: the first result used to be bound to `sum`, which replaced the built-in
# sum() for every later cell run in the same kernel; it now has its own name.
x_plus_one = x + 1
product = x * 2
# sqrt(x**2) is |x| element-wise, returned as floats. Despite the name, nothing
# here samples a distribution.
normal_dis = np.sqrt(x**2)

# Compare loop vs vectorized execution
import time
def timeit(fn):
    """Call ``fn`` once with no arguments and measure how long it takes.

    Returns a ``(result, seconds)`` tuple: whatever ``fn`` returned, and the
    ``time.perf_counter()`` difference taken around that single call.
    """
    started = time.perf_counter()
    result = fn()
    elapsed = time.perf_counter() - started
    return result, elapsed

# loop sum
def loop_sum(values=None):
    """Add up a sequence one element at a time with an explicit Python loop.

    This is the slow baseline of the loop-vs-vectorized comparison, so the
    index-based loop is kept on purpose.

    Parameters
    ----------
    values : sequence, optional
        Indexable items to add up. When omitted, the notebook-level array ``x``
        is used, so the zero-argument call ``timeit(loop_sum)`` keeps working.

    Returns
    -------
    The running total; ``0`` when the sequence is empty.
    """
    # Fall back to the notebook global only when no explicit input is given.
    data = x if values is None else values
    s = 0
    for i in range(len(data)):
        s += data[i]
    return s

# vectorized sum
def vec_sum(values=None):
    """Add up a sequence with a single vectorized ``np.sum`` call.

    Parameters
    ----------
    values : array-like, optional
        Items to add up. When omitted, the notebook-level array ``x`` is used,
        so the zero-argument call ``timeit(vec_sum)`` keeps working.

    Returns
    -------
    The total as computed by ``np.sum``.
    """
    # Fall back to the notebook global only when no explicit input is given.
    data = x if values is None else values
    return np.sum(data)

# Time a single run of each implementation; timeit() returns (result, seconds),
# so element [1] is the elapsed time. The loop version is timed first.
time_loop, time_vectorized = (timeit(fn)[1] for fn in (loop_sum, vec_sum))
print(f"Loop: {time_loop:.4f}s  |  Vectorized: {time_vectorized:.4f}s")


Loop: 0.0196s  |  Vectorized: 0.0001s


In [47]:
# Dataset loading
# NOTE(review): this path is relative to the current working directory; `data_dir`
# is imported from src.config at the top of the notebook but is not used in this
# cell — confirm which location is intended.
raw_data_path = Path("data") / "starter_data.csv"
raw_data_path.parent.mkdir(parents=True, exist_ok=True)

# Only when the CSV is missing: synthesize a small starter dataset (fixed seed,
# with a category column) and write it to disk.
if not raw_data_path.exists():
    rng = np.random.default_rng(20)
    n = 100
    # The draws happen in this order (category, value, amount); keep it so the
    # seeded stream yields the same columns.
    categories = rng.choice(list("ABCDE"), size=n, p=[0.2] * 5)
    values = rng.normal(loc=20, scale=5, size=n).round(2)
    amounts = rng.lognormal(mean=10, sigma=1, size=n).round(0)
    df_gen = pd.DataFrame({
        "id": np.arange(1, n + 1),
        "category": categories,
        "value": values,
        "amount": amounts,
    })
    df_gen.to_csv(raw_data_path, index=False)

# `df` is always read back from the CSV, whether or not it was just generated.
df = pd.read_csv(raw_data_path)
print(raw_data_path)
df.info()
df.head()


data/starter_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        100 non-null    int64  
 1   category  100 non-null    object 
 2   value     100 non-null    float64
 3   amount    100 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 3.3+ KB


Unnamed: 0,id,category,value,amount
0,1,B,24.44,32245.0
1,2,C,32.35,7292.0
2,3,A,16.66,13583.0
3,4,C,22.45,86464.0
4,5,C,19.42,3437.0


In [29]:
# Summary Statistics
# describe(): statistics for the numeric columns, transposed so each original
# column becomes one row.
summary = df.select_dtypes(include="number").describe().T
# A bare `summary` in the middle of a cell is evaluated and then discarded (only
# the cell's last expression is rendered), so show it explicitly. `display` is
# provided by the IPython kernel without an import.
display(summary)

# groupby(): per-category row count, mean/std of `value`, and total `amount`,
# ordered from the largest group to the smallest.
grouped = (
    df.groupby("category", as_index=False)
      .agg(count=("id","count"),
           value_mean=("value","mean"),
           value_std=("value","std"),
           amount_sum=("amount","sum"))
      # A stable sort keeps categories with equal counts in their grouped
      # order, so the table is deterministic when counts tie.
      .sort_values("count", ascending=False, kind="stable")
)
grouped

Unnamed: 0,category,count,value_mean,value_std,amount_sum
1,B,26,19.968462,5.000289,523991.0
0,A,20,18.041,4.312683,595274.0
2,C,20,20.215,6.416221,699249.0
3,D,18,19.131111,4.189408,586308.0
4,E,16,18.566875,3.880211,376302.0


In [49]:
# Save Outputs
# Everything produced by this cell is written under data/processed (relative to
# the current working directory); the folder is created on demand.
processed_direction = Path("data/processed")
processed_direction.mkdir(parents=True, exist_ok=True)

summary_output, grouped_output, plot_output = (
    processed_direction / file_name
    for file_name in (
        "summary.csv",
        "grouped_by_category.csv",
        "category_counts_plot.png",
    )
)

# `summary` keeps its index in the CSV; `grouped` is written without its index.
summary.to_csv(summary_output)
grouped.to_csv(grouped_output, index=False)

# simple plot: bar chart of rows per category, categories in index order
category_counts = df["category"].value_counts().sort_index()
ax = category_counts.plot(kind="bar", title="Category counts")
fig = ax.figure
fig.tight_layout()
fig.savefig(plot_output)
# NOTE(review): clf() empties the figure but leaves it open, which is why the
# cell renders "<Figure size 640x480 with 0 Axes>"; closing it instead would
# need matplotlib.pyplot, which this notebook does not import.
fig.clf()

summary_output, grouped_output, plot_output

(PosixPath('data/processed/summary.csv'),
 PosixPath('data/processed/grouped_by_category.csv'),
 PosixPath('data/processed/category_counts_plot.png'))

<Figure size 640x480 with 0 Axes>

In [51]:
# Call the project helper from src/utils on the loaded frame and show the first rows.
# NOTE(review): the helper's body is not visible here; judging by the rendered output it
# presumably returns a describe()-style table (count/mean/std/... per numeric column) — confirm.
from src.utils import get_summary_stats
get_summary_stats(df).head()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,100.0,50.5,29.011492,1.0,25.75,50.5,75.25,100.0
value,100.0,19.2573,4.876093,8.16,15.58,19.185,22.7475,32.35
amount,100.0,27811.24,27765.573384,1866.0,11213.5,19478.5,32601.0,174302.0
