In [2]:
# ============================================================================
# AISOC ML WORKFLOW TUTORIAL - COMPLETE CODE
# Machine Learning with Python & Scikit-Learn - 50 Minutes
# ============================================================================

# Pre-Class Setup
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os

print("🚀 AISOC ML WORKFLOW TUTORIAL")
print("=" * 50)

# ----------------------------------------------------------------------------
# SLIDE 1: WELCOME & OVERVIEW (6 minutes)
# ----------------------------------------------------------------------------
print("\n=== SLIDE 1: WELCOME & OVERVIEW ===")

# Load Amazon sales data from the current user's Downloads folder.
# os.path.expanduser('~') resolves the home directory on any platform, whereas
# os.getlogin() requires a controlling terminal and can raise OSError when the
# code runs inside a notebook kernel or another non-interactive process.
file_path = os.path.join(os.path.expanduser('~'), 'Downloads', 'amazon_sale_report.csv')

try:
    # low_memory=False lets pandas infer each column's dtype from the whole
    # file instead of chunk by chunk (the recorded run of this cell emitted a
    # warning on this call; presumably the mixed-type DtypeWarning).
    sales_data = pd.read_csv(file_path, low_memory=False)
    print(f"✅ Today's mission: {sales_data.shape[0]} sales records")
    print("\n📊 First look at our data:")
    print(sales_data.head())
    print(f"\n🔍 Column names: {list(sales_data.columns)}")
except FileNotFoundError:
    print("❌ amazon_sale_report.csv not found in Downloads folder")
    print("Creating sample data for demonstration...")
    # Create sample data. A seeded generator keeps the fallback dataset (and
    # therefore every downstream metric) identical from run to run.
    sample_size = 100
    rng = np.random.default_rng(42)
    categories = ['Electronics', 'Books', 'Clothing', 'Home', 'Sports']
    sales_data = pd.DataFrame({
        'Date': pd.date_range('2023-01-01', periods=sample_size, freq='D'),
        'Category': rng.choice(categories, sample_size),
        'Amount': rng.normal(1000, 300, sample_size),  # Using 'Amount' instead of 'Sales'
        'Units': rng.integers(1, 50, sample_size),  # upper bound is exclusive: 1..49
    })
    print(f"✅ Sample data created: {len(sales_data)} records")

# ----------------------------------------------------------------------------
# SLIDE 2: THE ML WORKFLOW UNIVERSE (6 minutes)
# ----------------------------------------------------------------------------
print("\n=== SLIDE 2: THE ML WORKFLOW UNIVERSE ===")
print("=== THE THREE PILLARS ===")

# The three pillars of an ML system; only the data pillar carries a live value.
pillar_lines = (
    f"1. DATA: {len(sales_data)} sales records",
    "2. MODEL: Coming soon!",
    "3. CODE: The system that connects everything",
)
print(*pillar_lines, sep="\n")

# Rough share of project time spent in each phase.
print("\n📋 Three Phases:")
phase_lines = (
    "• Phase 1: Data Engineering (60-80% of time)",
    "• Phase 2: Model Engineering (15-25% of time)",
    "• Phase 3: Code Engineering (10-20% of time)",
)
print(*phase_lines, sep="\n")

# ----------------------------------------------------------------------------
# SLIDE 3: PHASE 1 - DATA ENGINEERING (8 minutes)
# ----------------------------------------------------------------------------
print("\n=== SLIDE 3: PHASE 1 - DATA ENGINEERING ===")

# Step 1: Explore the data
print("\n🔍 What do we have?")
print(f"Shape: {sales_data.shape}")
print(f"Columns: {list(sales_data.columns)}")

# Step 2: Check for problems (total missing cells and fully duplicated rows)
missing_total = sales_data.isnull().sum().sum()
duplicate_total = sales_data.duplicated().sum()
print("\n🔍 Data Quality Check:")
print(f"Missing values: {missing_total}")
print(f"Duplicates: {duplicate_total}")

# Step 3: Basic cleaning
# The first column whose name mentions 'date' or 'time' is treated as the
# date column; date_col stays None when no such column exists.
date_col = next(
    (
        name
        for name in sales_data.columns
        if any(token in name.lower() for token in ('date', 'time'))
    ),
    None,
)

if date_col:
    sales_data[date_col] = pd.to_datetime(sales_data[date_col])
    print(f"✅ Converted '{date_col}' to datetime")

# Remove every row that has a missing value in any column
sales_data = sales_data.dropna()
print("✅ Data cleaned and ready!")

# ----------------------------------------------------------------------------
# SLIDE 4: FEATURE ENGINEERING - THE SECRET SAUCE (8 minutes)
# ----------------------------------------------------------------------------
print("\n=== SLIDE 4: FEATURE ENGINEERING ===")

# Find target column (sales/amount/revenue): the first candidate name, in
# priority order, that exists in the data.
possible_targets = ['Sales', 'Amount', 'Revenue', 'Total', 'sales', 'amount', 'revenue', 'total']
target_col = next((name for name in possible_targets if name in sales_data.columns), None)

if target_col is None:
    # Fall back to the first numeric column as target
    numeric_cols = sales_data.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        # Fail here with a clear message instead of carrying target_col=None
        # into the modelling step, where it would surface as an opaque KeyError.
        raise ValueError(
            "No target column found: expected one of "
            f"{possible_targets} or at least one numeric column"
        )
    target_col = numeric_cols[0]
    print(f"⚠️ Using '{target_col}' as target variable")

print(f"✅ Target variable: '{target_col}'")

# Create business-smart features if we have a date column
if date_col:
    sales_data['Month'] = sales_data[date_col].dt.month
    sales_data['Is_Holiday_Season'] = sales_data['Month'].isin([11, 12]).astype(int)
    sales_data['Is_Weekend'] = sales_data[date_col].dt.dayofweek.isin([5, 6]).astype(int)
    print("✅ Created time-based features")

# Encode categories. The date and target columns are never encoded: replacing
# the target with dummy columns would make it impossible to select later.
categorical_cols = [
    col
    for col in sales_data.select_dtypes(include=['object']).columns
    if col not in (date_col, target_col)
]

for col in categorical_cols:
    if sales_data[col].nunique() <= 20:  # Only if reasonable number of categories
        # dtype=int is required: recent pandas versions emit bool dummy columns
        # by default, and the later select_dtypes(include=[np.number]) filter
        # does not match bool, so the encoded features would be dropped silently.
        sales_data = pd.get_dummies(sales_data, columns=[col], drop_first=True, dtype=int)
        print(f"✅ Encoded '{col}' categories")

print(f"After engineering: {len(sales_data.columns)} features")
print("✅ Features ready for modeling!")

# ----------------------------------------------------------------------------
# SLIDE 5: MODEL SELECTION & TRAINING (8 minutes)
# ----------------------------------------------------------------------------
print("\n=== SLIDE 5: MODEL SELECTION & TRAINING ===")

# Prepare data - the target is always excluded from the features, and so is
# the date column when one was detected and is still present.
has_date_column = bool(date_col) and date_col in sales_data.columns
columns_to_drop = [target_col] + ([date_col] if has_date_column else [])

y = sales_data[target_col]
# Keep only numeric columns for modeling
X = sales_data.drop(columns=columns_to_drop).select_dtypes(include=[np.number])

n_samples, n_features = X.shape
print(f"Features for modeling: {list(X.columns)}")
print(f"Training on {n_samples} samples with {n_features} features")

# 80/20 split with a fixed seed so the comparison is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline first: plain linear regression
simple_model = LinearRegression().fit(X_train, y_train)
print("✅ Simple model trained")

# Then the more flexible model: a 100-tree random forest
complex_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
print("✅ Complex model trained")

# ----------------------------------------------------------------------------
# SLIDE 6: MODEL EVALUATION & PERFORMANCE (8 minutes)
# ----------------------------------------------------------------------------
print("\n=== SLIDE 6: MODEL EVALUATION & PERFORMANCE ===")

# Score both models on the held-out test split
simple_pred = simple_model.predict(X_test)
complex_pred = complex_model.predict(X_test)

# RMSE is the square root of the mean squared error; R² comes from r2_score
simple_rmse, complex_rmse = (
    np.sqrt(mean_squared_error(y_test, pred)) for pred in (simple_pred, complex_pred)
)
simple_r2, complex_r2 = (
    r2_score(y_test, pred) for pred in (simple_pred, complex_pred)
)

print("=== PERFORMANCE COMPARISON ===")
print(f"Simple Model - RMSE: ${simple_rmse:.2f}, R²: {simple_r2:.3f}")
print(f"Complex Model - RMSE: ${complex_rmse:.2f}, R²: {complex_r2:.3f}")

# Declare winner: the complex model must be strictly better, ties go to simple
complex_wins = complex_rmse < simple_rmse
winner_name = "Complex" if complex_wins else "Simple"
print(f"🏆 Winner: {winner_name} Model")
best_model, best_rmse = (
    (complex_model, complex_rmse) if complex_wins else (simple_model, simple_rmse)
)

# ----------------------------------------------------------------------------
# SLIDE 7: PRODUCTION DEPLOYMENT & MONITORING (8 minutes)
# ----------------------------------------------------------------------------
print("\n=== SLIDE 7: PRODUCTION DEPLOYMENT & MONITORING ===")

# Persist the winning model to disk
model_path = 'sales_model.pkl'
joblib.dump(best_model, model_path)
print("✅ Model saved for production")

# Simulate production use: reload the saved model and score the first three
# rows of the test set
loaded_model = joblib.load(model_path)
sample_data = X_test.head(3)
predictions = loaded_model.predict(sample_data)

print("Production predictions:")
for customer_number, predicted_amount in enumerate(predictions, start=1):
    print(f"  Customer {customer_number}: ${predicted_amount:.2f}")

# Simple monitoring
def check_performance(new_rmse, baseline_rmse, degradation_factor=1.15):
    """Compare a freshly measured RMSE against the baseline and report the result.

    Args:
        new_rmse: RMSE measured on recent data.
        baseline_rmse: RMSE recorded when the model was accepted.
        degradation_factor: Multiplier applied to ``baseline_rmse`` to get the
            alert threshold. The default of 1.15 raises an alert once the new
            RMSE is more than 15% worse than the baseline.

    Returns:
        bool: True when the alert fired (retraining recommended), otherwise
        False. The verdict is also printed.
    """
    needs_retrain = new_rmse > baseline_rmse * degradation_factor
    if needs_retrain:
        print("⚠️ Performance Alert! Retrain needed")
    else:
        print("✅ Model performing well")
    return needs_retrain

# Monitor the model that was actually deployed: score the reloaded model and
# compare it with the baseline recorded at selection time. Passing simple_rmse
# here would raise a false "retrain" alert whenever the random forest beat the
# linear baseline by more than 15%, even though the deployed model is healthy.
production_rmse = np.sqrt(mean_squared_error(y_test, loaded_model.predict(X_test)))
check_performance(production_rmse, best_rmse)

# ----------------------------------------------------------------------------
# SLIDE 8: BEST PRACTICES & NEXT STEPS (6 minutes)
# ----------------------------------------------------------------------------
print("\n=== SLIDE 8: BEST PRACTICES & NEXT STEPS ===")
print("=== THREE GOLDEN RULES ===")

# Each rule is a headline followed by its two supporting bullet points.
golden_rules = (
    ("🥇 Rule 1: Make Sure You Have the Data",
     ("Quality over quantity", "Does data contain the signal?")),
    ("🥈 Rule 2: Start Simple, Add Complexity",
     ("Always establish baseline first", "Simple often surprises you")),
    ("🥉 Rule 3: Document Everything",
     ("Test data quality", "Track decisions and trade-offs")),
)
for rule_index, (headline, bullet_points) in enumerate(golden_rules):
    if rule_index:
        print()  # blank line between rules, none before the first
    print(headline)
    for bullet in bullet_points:
        print(f"   - {bullet}")

# Session recap
print("\n🎯 Today's Success:")
print(f"   📊 Processed {len(sales_data)} records")
print("   🤖 Built working ML model")
print("   🚀 Ready for production!")

print("\n🌟 Thank you for an amazing session!")
print("Now go build something incredible! 🚀")

# ----------------------------------------------------------------------------
# STUDENT TAKEAWAY CODE
# ----------------------------------------------------------------------------
print("\n" + "=" * 50)
print("STUDENT TAKEAWAY CODE")
print("=" * 50)

# Template of a standalone script handed to students. It is an f-string so the
# target column detected in this session is baked in; doubled braces ({{ }})
# are literal braces in the printed script.
# The script must import numpy itself because it calls np.sqrt and np.number,
# and it requests integer dummy columns so that the numeric-only feature
# filter does not discard the encoded categories (recent pandas versions
# produce bool dummies by default).
takeaway_code = f'''
# Complete ML Workflow - Student Version
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# 1. Load data
data = pd.read_csv(r'C:\\Users\\YourName\\Downloads\\amazon_sale_report.csv')

# 2. Feature engineering
if 'Date' in data.columns:
    data['Month'] = pd.to_datetime(data['Date']).dt.month
    data['Is_Holiday'] = data['Month'].isin([11, 12]).astype(int)

# Encode categories
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if col != 'Date' and data[col].nunique() <= 20:
        data = pd.get_dummies(data, columns=[col], drop_first=True, dtype=int)

# 3. Train models
target_col = '{target_col}'  # Adjust based on your data
X = data.drop([target_col, 'Date'], axis=1, errors='ignore')
X = X.select_dtypes(include=[np.number])
y = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor()
model.fit(X_train, y_train)

# 4. Evaluate
predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Model RMSE: ${{rmse:.2f}}")

# 5. Save for production
joblib.dump(model, 'my_sales_model.pkl')
'''

print(takeaway_code)
print("=" * 50)
print("🎉 TUTORIAL COMPLETE!")

🚀 AISOC ML WORKFLOW TUTORIAL

=== SLIDE 1: WELCOME & OVERVIEW ===


  sales_data = pd.read_csv(file_path)


✅ Today's mission: 128975 sales records

📊 First look at our data:
   index             Order ID      Date                        Status  \
0      0  405-8078784-5731545  04-30-22                     Cancelled   
1      1  171-9198151-1101146  04-30-22  Shipped - Delivered to Buyer   
2      2  404-0687676-7273146  04-30-22                       Shipped   
3      3  403-9615377-8133951  04-30-22                     Cancelled   
4      4  407-1069790-7240320  04-30-22                       Shipped   

  Fulfilment Sales Channel  ship-service-level    Style              SKU  \
0   Merchant      Amazon.in           Standard   SET389   SET389-KR-NP-S   
1   Merchant      Amazon.in           Standard  JNE3781  JNE3781-KR-XXXL   
2     Amazon      Amazon.in          Expedited  JNE3371    JNE3371-KR-XL   
3   Merchant      Amazon.in           Standard    J0341       J0341-DR-L   
4     Amazon      Amazon.in          Expedited  JNE3671  JNE3671-TU-XXXL   

        Category  ... currency  Amoun