# STEP 1: NumPy array operations
## Create arrays and compare the performance of loop-based and vectorized operations

In [4]:
import numpy as np

# Integer array holding the values 0 through 9
arr = np.arange(10)

# Ten samples drawn uniformly from the interval [0, 1)
random_arr = np.random.rand(10)

# Element-wise arithmetic on the whole array at once:
# shift every value up by 5, and square every value
arr_plus_5 = np.add(arr, 5)
arr_squared = np.square(arr)

# Element-wise product of two arrays of equal length
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
product = np.multiply(arr1, arr2)  # -> [4, 10, 18]

import time

# Benchmark: add 5 to one million integers, first element by element in
# Python, then with a single vectorized NumPy expression.
# time.perf_counter() is used for the measurements because it is a monotonic,
# high-resolution clock meant for timing short intervals; time.time() reports
# wall-clock time, which can be coarse and can jump if the system clock changes.
n_elements = 1_000_000

# Use Python Loops
start = time.perf_counter()
result_loop = [x + 5 for x in range(n_elements)]
loop_time = time.perf_counter() - start

# Use vectorization operations
# (array creation is timed as well, mirroring the range() call timed above;
# note that this rebinds `arr` to the one-million-element array)
start = time.perf_counter()
arr = np.arange(n_elements)
result_vec = arr + 5
vec_time = time.perf_counter() - start

print(f"Loop time consumption: {loop_time:.5f} seconds")
print(f"Vectorization time consumption: {vec_time:.5f} seconds")

Loop time consumption: 0.05253 seconds
Vectorization time consumption: 0.00400 seconds


# STEP 2: Load the dataset
## Load the CSV file and run an initial check of the data

In [1]:
import pandas as pd

# Load the CSV file from the data directory at the same level as the notebook.
# A relative path keeps the notebook runnable on other machines; an absolute
# path under one user's home directory only works on that computer.
# NOTE(review): assumes the kernel's working directory is the notebook's own
# folder - adjust the path if the notebook is started from elsewhere.
df = pd.read_csv('data/starter_data.csv')

# View basic data information
df.info()

# Display the first 5 lines
# (printed explicitly: a bare expression in the middle of a cell is discarded,
# only the last expression of a cell is shown automatically)
print(df.head())

# Check for Missing Values
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  10 non-null     object
 1   value     10 non-null     int64 
 2   date      10 non-null     object
dtypes: int64(1), object(2)
memory usage: 372.0+ bytes


category    0
value       0
date        0
dtype: int64

# STEP 3: Compute statistics
## Calculate descriptive statistics and grouped aggregations

In [2]:
# Summary statistics for numeric and object (text) columns
def get_summary_stats(df):
    """Obtain the statistical summary of the data frame.

    Numeric columns and object columns are described separately with
    ``DataFrame.describe`` and the two result tables are joined side by
    side, numeric columns first.

    A dtype group that selects no columns is skipped, because ``describe``
    raises ``ValueError`` when the requested dtype matches nothing. This lets
    the function work on frames that are purely numeric or purely textual.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per statistic (count, mean, ..., unique, top, freq) and one
        column per described column of ``df``. An empty DataFrame is returned
        when ``df`` has neither numeric nor object columns.
    """
    summaries = [
        df.describe(include=dtype_group)
        for dtype_group in ('number', 'object')
        # shape[1] counts selected columns; a frame with zero rows but
        # matching columns can still be described
        if df.select_dtypes(include=dtype_group).shape[1] > 0
    ]
    if not summaries:
        return pd.DataFrame()
    return pd.concat(summaries, axis=1)

# Whole-table summary covering every described column
stats = get_summary_stats(df)
print("=== Overall statistics ===")
print(stats)

# Per-category breakdown; skipped when the grouping column is absent
has_category = 'category' in df.columns
if has_category:
    print("\n=== Group and statistically analyze by category ===")
    # Column name -> list of aggregations computed for each category
    aggregations = {
        'value': ['mean', 'median', 'std', 'count'],
        'date': ['min', 'max'],
    }
    grouped_stats = df.groupby('category').agg(aggregations)
    print(grouped_stats)

=== Overall statistics ===
            value category        date
count   10.000000       10          10
mean    17.600000      NaN         NaN
std      7.381659      NaN         NaN
min     10.000000      NaN         NaN
25%     12.250000      NaN         NaN
50%     14.500000      NaN         NaN
75%     23.250000      NaN         NaN
max     30.000000      NaN         NaN
unique        NaN        3          10
top           NaN        A  2025-08-01
freq          NaN        4           1

=== Group and statistically analyze by category ===
              value                               date            
               mean median       std count         min         max
category                                                          
A         11.500000   11.5  1.290994     4  2025-08-01  2025-08-10
B         15.666667   15.0  2.081666     3  2025-08-02  2025-08-08
C         27.666667   28.0  2.516611     3  2025-08-05  2025-08-09


# STEP 4: Save the results
## Save the statistical results and visual charts

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os

import pandas as pd
from pathlib import Path

def save_stats_to_csv(stats, grouped_stats, filename="data/processed/summary.csv"):
    """
    Merge the statistical results and save them as a CSV file

    The overall statistics, a '---' separator row and the flattened group
    statistics are stacked on top of each other and written without the
    row index. A preview of the first 8 rows is printed after saving.

    Parameter
        stats: Overall statistics, e.g. generated by describe(); its index
            (named or unnamed) becomes the 'metric' column
        grouped_stats: Grouping statistics, e.g. generated by groupby();
            multi-level column labels are joined with '_' (empty levels are
            dropped), single-level labels are kept unchanged
        filename: Output file path; missing parent directories are created

    Return
        pathlib.Path of the written file, or None when saving failed.
        Failures are reported with print() instead of being raised, so the
        return value is the only way for a caller to detect them.
    """
    try:
        # Create an output directory
        output_path = Path(filename)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Convert Group Statistics Format (Multi-level Index to Single-level).
        # Only tuple labels are joined: joining a plain string label would
        # split it into characters ('mean' -> 'm_e_a_n'). Levels are passed
        # through str() so non-string levels (e.g. integers) do not fail.
        grouped_flat = grouped_stats.copy()
        grouped_flat.columns = [
            '_'.join(str(level) for level in label if str(level) != '').strip()
            if isinstance(label, tuple) else label
            for label in grouped_flat.columns
        ]
        grouped_flat = grouped_flat.reset_index()

        # reset_index() names the new column after the index, or 'index'
        # when the index is unnamed; rename whichever it is to 'metric'
        index_label = stats.index.name if stats.index.name is not None else 'index'
        stats_flat = stats.reset_index().rename(columns={index_label: 'metric'})

        # Merge the statistical results
        combined_stats = pd.concat([
            stats_flat,
            pd.DataFrame([['---', '---', '---']], columns=['metric', 'value', 'category']),
            grouped_flat
        ])

        # Save to CSV
        combined_stats.to_csv(output_path, index=False)
        print(f"The statistical results have been saved to: {output_path}")

        # Print the preview of the saved content
        print("\nSave content preview:")
        print(combined_stats.head(8))

        return output_path

    except Exception as e:
        # Best effort by design: report the problem and keep the notebook
        # running; None tells the caller that nothing usable was written
        print(f"Save failed: {str(e)}")
        return None

# Usage example ---------------------------------------------------------------
# Hand-entered stand-ins for the statistical results.
# NOTE(review): these assignments rebind `stats` and `grouped_stats`; if the
# computed results from the statistics step should be saved instead, drop the
# two literal frames below - confirm which is intended.
summary_labels = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'unique', 'top']
stats = pd.DataFrame(
    {
        'value': [10.0, 17.6, 7.38, 10.0, 12.25, 14.5, 23.25, 30.0, None, None],
        'category': [None] * 10,
        'date': [None] * 10,
    },
    index=summary_labels,
)

# One row per category; tuple keys give two-level (column, aggregation) labels
category_index = pd.Index(['A', 'B', 'C'], name='category')
grouped_values = {
    ('value', 'mean'): [11.5, 15.67, 27.67],
    ('value', 'median'): [11.5, 15.0, 28.0],
    ('value', 'std'): [1.29, 2.08, 2.52],
    ('value', 'count'): [4, 3, 3],
    ('date', 'min'): ['2025-08-01', '2025-08-02', '2025-08-05'],
    ('date', 'max'): ['2025-08-10', '2025-08-08', '2025-08-09'],
}
grouped_stats = pd.DataFrame(grouped_values, index=category_index)

# Write both tables to the default CSV location
save_stats_to_csv(stats, grouped_stats)

def save_basic_plots(df, save_dir='data/processed', column='value'):
    """
    Draw and save a histogram of one column of the data

    The image is written to '<save_dir>/basic_plots.png' at 300 dpi and the
    figure is closed afterwards, also when drawing or saving fails.

    Parameter
        df: DataFrame containing data
        save_dir: Output directory path (created if it does not exist)
        column: Name of the column to plot; defaults to 'value'

    Return
        pathlib.Path of the saved image

    Raise
        KeyError: if `column` is not a column of df
    """
    # Look the column up first so a wrong name fails before any directory
    # or figure has been created
    values = df[column]

    # Make sure the directory exists
    output_path = Path(save_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    plot_path = output_path / 'basic_plots.png'

    # Explicit figure/axes instead of pyplot's implicit "current figure",
    # so exactly this figure is drawn on, saved and closed
    fig, ax = plt.subplots(figsize=(12, 5))
    try:
        # Histogram of Numerical Distribution
        values.plot(kind='hist',
                    ax=ax,
                    bins=8,
                    color='skyblue',
                    edgecolor='black',
                    alpha=0.7)
        ax.set_title('Value Distribution\n(All Categories)')
        ax.set_xlabel('Value Range')
        ax.set_ylabel('Frequency')

        # Adjust the layout and save
        fig.tight_layout()
        fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even if plotting or saving raised
        plt.close(fig)

    print(f"the plot has been saved to: {plot_path}")
    return plot_path

# Usage example ---------------------------------------------------------------
# Small hand-written sample standing in for an already loaded DataFrame.
# NOTE(review): this rebinds `df`; the dates here use the 'YYYY/M/D' form.
data = {
    'category': list('ABABCCABCA'),
    'value': [10, 15, 12, 18, 25, 30, 11, 14, 28, 13],
    # Ten consecutive days: '2025/8/1' through '2025/8/10'
    'date': [f'2025/8/{day}' for day in range(1, 11)],
}
df = pd.DataFrame(data)

# Draw the histogram and remember where the image was written
chart_path = save_basic_plots(df)

The statistical results have been saved to: data\processed\summary.csv

Save content preview:
  metric  value category  date  value_mean  value_median  value_std  \
0  count   10.0     None  None         NaN           NaN        NaN   
1   mean   17.6     None  None         NaN           NaN        NaN   
2    std   7.38     None  None         NaN           NaN        NaN   
3    min   10.0     None  None         NaN           NaN        NaN   
4    25%  12.25     None  None         NaN           NaN        NaN   
5    50%   14.5     None  None         NaN           NaN        NaN   
6    75%  23.25     None  None         NaN           NaN        NaN   
7    max   30.0     None  None         NaN           NaN        NaN   

   value_count date_min date_max  
0          NaN      NaN      NaN  
1          NaN      NaN      NaN  
2          NaN      NaN      NaN  
3          NaN      NaN      NaN  
4          NaN      NaN      NaN  
5          NaN      NaN      NaN  
6          NaN      N