In [1]:
"""
Homework Sheet — Stage 03: Python Fundamentals
This script implements all required steps in ONE place.
- NumPy operations
- Data loading & inspection
- Summary statistics & groupby
- Save outputs
- Utility function (in src/utils.py)
- Bonus: basic plot saved to data/processed/
"""

import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# --- Ensure folders exist ---
# "data/processed" receives the CSV/plot outputs, "src" receives the generated
# helper module. exist_ok=True keeps repeated runs from failing.
for folder in ("data/processed", "src"):
    os.makedirs(folder, exist_ok=True)

# =========================
# 1. NumPy Operations
# =========================
# Compare the cost of squaring one million integers with a single vectorized
# NumPy expression against an element-by-element Python loop.
print("=== NumPy Operations ===")
# Pin the dtype: the platform-default integer is only 32 bits wide on some
# setups (e.g. Windows with NumPy < 2), where 1_000_000 ** 2 silently wraps.
arr = np.arange(1, 1_000_001, dtype=np.int64)  # 1 to 1,000,000

# Elementwise operation (vectorized)
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
squared_vec = arr ** 2
end = time.perf_counter()
print(f"Vectorized squaring took {end - start:.4f} sec")

# Elementwise operation (loop)
# Deliberately slow: each element is squared by the interpreter, one at a time.
start = time.perf_counter()
squared_loop = [x ** 2 for x in arr]
end = time.perf_counter()
print(f"Loop squaring took {end - start:.4f} sec\n")

# =========================
# 2. Dataset Loading
# =========================
# Load the starter CSV into `df`; when the file is missing, write a small
# dummy dataset to the same path first so the rest of the script can run.
print("=== Dataset Loading ===")
data_path = "data/starter_data.csv"
if not os.path.exists(data_path):
    # Create dummy dataset if file not present
    # Make sure the parent folder exists so to_csv() cannot fail on a
    # missing directory.
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    df_dummy = pd.DataFrame({
        "Category": ["A", "B", "A", "B", "C", "A", "C"],
        "Value1": [10, 20, 30, 40, 50, 60, 70],
        "Value2": [5, 15, 25, 35, 45, 55, 65]
    })
    df_dummy.to_csv(data_path, index=False)
    print(f"No dataset found → created dummy CSV at {data_path}")

df = pd.read_csv(data_path)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head(), "\n")

# =========================
# 3. Summary Statistics
# =========================
print("=== Summary Statistics ===")
# include="all" makes describe() report on every column, text ones included.
summary = df.describe(include="all")
print(summary, "\n")

print("=== Groupby Aggregation (by Category) ===")
# Named aggregation: output column = (source column, reducer).
# Per category this yields the mean of Value1 and the sum of Value2.
per_category = df.groupby("Category")
grouped = per_category.agg(
    Value1=("Value1", "mean"),
    Value2=("Value2", "sum"),
)
print(grouped, "\n")

# =========================
# 4. Save Outputs
# =========================
# Persist both tables as CSV; the index labels are written as the first column.
summary_out = "data/processed/summary.csv"
summary.to_csv(summary_out)
print(f"Summary stats saved to {summary_out}")

grouped_out = "data/processed/grouped.csv"
grouped.to_csv(grouped_out)
print(f"Groupby stats saved to {grouped_out}")

# Bonus: simple plot
# Bar chart of the per-category mean of Value1, drawn on an explicit
# Figure/Axes pair instead of pyplot's implicit "current figure".
mean_by_category = df.groupby("Category")["Value1"].mean()
fig, ax = plt.subplots(figsize=(6, 4))
mean_by_category.plot(kind="bar", color="skyblue", ax=ax)
ax.set_title("Average Value1 by Category")
ax.set_ylabel("Mean of Value1")
fig.tight_layout()
plot_path = "data/processed/plot.png"
fig.savefig(plot_path)
plt.close(fig)  # release the figure once it is on disk
print(f"Plot saved to {plot_path}\n")

# =========================
# 5. Reusable Function
# =========================
# Source text of the helper module; it is written to src/utils.py below and
# then imported back to demonstrate the function on `df`.
utils_code = '''\
from typing import Optional, Tuple

import pandas as pd


def get_summary_stats(
    df: pd.DataFrame, group_col: Optional[str] = None
) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]:
    """
    Return summary statistics and an optional groupby aggregation.

    Parameters
    ----------
    df : pd.DataFrame
        Table to summarise.
    group_col : str, optional
        Column to group by. When omitted, empty, or not a column of ``df``,
        no grouping is performed.

    Returns
    -------
    tuple
        ``(summary, grouped)`` where ``summary`` is ``df.describe(include="all")``
        and ``grouped`` is the per-group mean of the numeric columns, or
        ``None`` when no grouping was performed.
    """
    summary = df.describe(include="all")
    grouped = None
    if group_col and group_col in df.columns:
        # numeric_only=True: text columns have no mean, and recent pandas
        # versions raise a TypeError instead of silently dropping them.
        grouped = df.groupby(group_col).mean(numeric_only=True)
    return summary, grouped
'''

# Explicit encoding so the written file does not depend on the platform default.
with open("src/utils.py", "w", encoding="utf-8") as f:
    f.write(utils_code)

print("✅ src/utils.py created with reusable get_summary_stats() function")

# Import & test reusable function
# The module file was created after the interpreter started, so the import
# system's cached directory listings may not include it yet; drop those caches
# before importing.
import importlib
importlib.invalidate_caches()
from src.utils import get_summary_stats
summary2, grouped2 = get_summary_stats(df, group_col="Category")
print("Reusable function output (summary):")
print(summary2.head())
print("\nReusable function output (grouped):")
print(grouped2)

=== NumPy Operations ===
Vectorized squaring took 0.0011 sec
Loop squaring took 0.0988 sec

=== Dataset Loading ===
No dataset found → created dummy CSV at data/starter_data.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  7 non-null      object
 1   Value1    7 non-null      int64 
 2   Value2    7 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 300.0+ bytes
None
  Category  Value1  Value2
0        A      10       5
1        B      20      15
2        A      30      25
3        B      40      35
4        C      50      45 

=== Summary Statistics ===
       Category     Value1     Value2
count         7   7.000000   7.000000
unique        3        NaN        NaN
top           A        NaN        NaN
freq          3        NaN        NaN
mean        NaN  40.000000  35.000000
std         NaN  21.602469  21.602469
min         NaN