# Machine Learning Model Training

## Week 5: Build, Train, and Evaluate Your First ML Models

In this notebook, we'll use scikit-learn to build, train, and evaluate regression and classification models on a small synthetic dataset.

---

## Setup: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, classification_report

print("Libraries imported successfully!")

## Load & Prepare Data

In [None]:
# Build a reproducible synthetic employee dataset.
# NOTE: the random draws below must stay in this exact order (Age, YearsExperience,
# Education, Department, salary noise) so the seeded stream yields the same data.
np.random.seed(42)
n_samples = 100

df = pd.DataFrame()
df['Age'] = np.random.uniform(22, 65, n_samples)
df['YearsExperience'] = np.random.uniform(0, 40, n_samples)
df['Education'] = np.random.choice(['HighSchool', 'Bachelor', 'Master'], n_samples)
df['Department'] = np.random.choice(['Sales', 'IT', 'HR'], n_samples)

# Regression target: linear in experience and age, plus Gaussian noise
df['Salary'] = (
    30000
    + df['YearsExperience'] * 2000
    + df['Age'] * 500
    + np.random.normal(0, 5000, n_samples)
)
# Classification target: 1 when the salary is strictly above the median, else 0
df['HighEarner'] = df['Salary'].gt(df['Salary'].median()).astype(int)

print("Dataset shape: {}".format(df.shape))
print("\nFirst few rows:")
print(df.head())
print("\nData types:")
print(df.dtypes)

## Part 1: REGRESSION - Predicting Salary

### Step 1: Prepare Features

In [None]:
# Regression inputs: two numeric predictor columns and the Salary target
X_regression = df.loc[:, ['Age', 'YearsExperience']]
y_regression = df['Salary']

print("Features shape: {}".format(X_regression.shape))
print("Target shape: {}".format(y_regression.shape))
print("\nFeature statistics:")
print(X_regression.describe())

### Step 2: Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_regression,
    y_regression,
    test_size=0.2,
    random_state=42,
)

print("Training set size: {}".format(X_train.shape[0]))
print("Test set size: {}".format(X_test.shape[0]))

### Step 3: Scale Features

In [None]:
# Standardize features (important for many algorithms).
# The scaler is fitted on the training split only; the same fitted
# transformation is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Features scaled successfully!")
# mean(axis=0) returns one value per feature (an ndarray). NumPy arrays do not
# accept a float format spec such as ':.4f' (it raises TypeError), so each
# per-feature mean is formatted individually.
train_means = ", ".join(f"{feature_mean:.4f}" for feature_mean in X_train_scaled.mean(axis=0))
print(f"Train mean: [{train_means}]")

### Step 4: Train Linear Regression Model

In [None]:
# Fit a linear regression on the scaled training features
# (fit() returns the estimator itself, so construction and fitting can be chained)
lr_model = LinearRegression().fit(X_train_scaled, y_train)

print("Linear Regression Model Trained!")
print("\nModel Coefficients:")
# Coefficients follow the feature column order: Age first, then YearsExperience
for label, weight in zip(("Age", "Years Experience"), lr_model.coef_[:2]):
    print("  {}: {:,.0f}".format(label, weight))
print("Intercept: {:,.0f}".format(lr_model.intercept_))

### Step 5: Make Predictions

In [None]:
# Generate salary predictions for the held-out (scaled) test features
y_pred = lr_model.predict(X_test_scaled)

# Show a handful of predictions next to the true values for a quick sanity check
print("First 5 predictions: {}".format(y_pred[:5]))
print("First 5 actual values: {}".format(y_test.to_numpy()[:5]))

### Step 6: Evaluate Model

In [None]:
# Calculate regression metrics on the held-out test set
mse = mean_squared_error(y_test, y_pred)   # mean of squared errors -> units are dollars squared
rmse = np.sqrt(mse)                        # square root brings the error back to dollars
r2 = r2_score(y_test, y_pred)

print("="*50)
print("LINEAR REGRESSION EVALUATION")
print("="*50)
# MSE is expressed in squared dollars, so it is not printed with a "$" prefix
print(f"Mean Squared Error (MSE): {mse:,.2f} ($²)")
print(f"Root Mean Squared Error (RMSE): ${rmse:,.2f}")
print(f"R² Score: {r2:.4f}")
print(f"\nInterpretation:")
# RMSE is a root-mean-square, not a mean absolute error, so describe it as a
# typical error magnitude rather than the plain average miss
print(f"  - RMSE: Typical prediction error is about ${rmse:,.0f}")
print(f"  - R²: Model explains {r2*100:.1f}% of salary variance")

### Step 7: Visualize Results

In [None]:
# Scatter of predicted against actual salaries with a dashed y = x reference line;
# points on the line are perfect predictions
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(y_test, y_pred, alpha=0.6, color='blue')

diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', lw=2)

ax.set_title('Actual vs Predicted Salary (Linear Regression)', fontsize=14)
ax.set_xlabel('Actual Salary ($)')
ax.set_ylabel('Predicted Salary ($)')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

---

## Part 2: CLASSIFICATION - Predicting High Earners

### Step 1: Prepare Features

In [None]:
# Classification inputs: the same two numeric predictors, with the binary HighEarner target
X_classification = df.loc[:, ['Age', 'YearsExperience']]
y_classification = df['HighEarner']

# Check class balance, as absolute counts and as percentages
print("Class distribution:")
print(y_classification.value_counts())
print("\nClass percentages:")
print(y_classification.value_counts(normalize=True).mul(100))

### Step 2: Train-Test Split & Feature Scaling

In [None]:
# Stratified 80/20 split: stratify=y_classification is passed so the split
# preserves the class proportions in both partitions
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_classification,
    y_classification,
    test_size=0.2,
    random_state=42,
    stratify=y_classification,
)

# Fit the scaler on the training partition only, then reuse it for the test partition
scaler_clf = StandardScaler().fit(X_train_clf)
X_train_clf_scaled = scaler_clf.transform(X_train_clf)
X_test_clf_scaled = scaler_clf.transform(X_test_clf)

print("Training set: {} samples".format(len(X_train_clf)))
print("Test set: {} samples".format(len(X_test_clf)))

### Step 3: Train Decision Tree

In [None]:
# Fit a depth-limited decision tree (fixed seed for reproducibility)
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42).fit(
    X_train_clf_scaled, y_train_clf
)

# Predict class labels for the held-out test rows
y_pred_dt = dt_model.predict(X_test_clf_scaled)

print("Decision Tree Model Trained!")
print("Feature Importance:")
# Importances are reported in the same order as the feature columns
for feature, importance in zip(X_classification.columns, dt_model.feature_importances_):
    print("  {}: {:.4f}".format(feature, importance))

### Step 4: Evaluate Decision Tree

In [None]:
# Score the decision tree predictions with the four classification metrics
accuracy_dt, precision_dt, recall_dt, f1_dt = (
    metric(y_test_clf, y_pred_dt)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("=" * 50, "DECISION TREE EVALUATION", "=" * 50, sep="\n")
print("Accuracy: {0:.4f} ({1:.1f}%)".format(accuracy_dt, accuracy_dt * 100))
print("Precision: {:.4f}".format(precision_dt))
print("Recall: {:.4f}".format(recall_dt))
print("F1-Score: {:.4f}".format(f1_dt))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_clf, y_pred_dt))

### Step 5: Train & Evaluate Random Forest

In [None]:
# Fit a 100-tree random forest (depth capped at 10, fixed seed for reproducibility)
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42).fit(
    X_train_clf_scaled, y_train_clf
)

# Predict class labels for the held-out test rows
y_pred_rf = rf_model.predict(X_test_clf_scaled)

# Score the predictions with the same four metrics used for the decision tree
accuracy_rf, precision_rf, recall_rf, f1_rf = (
    metric(y_test_clf, y_pred_rf)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("=" * 50, "RANDOM FOREST EVALUATION", "=" * 50, sep="\n")
print("Accuracy: {0:.4f} ({1:.1f}%)".format(accuracy_rf, accuracy_rf * 100))
print("Precision: {:.4f}".format(precision_rf))
print("Recall: {:.4f}".format(recall_rf))
print("F1-Score: {:.4f}".format(f1_rf))

### Step 6: Model Comparison

In [None]:
# Collect the test-set metrics of both classifiers side by side
comparison_df = pd.DataFrame({
    'Model': ['Decision Tree', 'Random Forest'],
    'Accuracy': [accuracy_dt, accuracy_rf],
    'Precision': [precision_dt, precision_rf],
    'Recall': [recall_dt, recall_rf],
    'F1-Score': [f1_dt, f1_rf]
})

print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)
print(comparison_df.to_string(index=False))

# Pick the winner by test accuracy. Equal accuracies are reported as a tie
# instead of silently declaring the decision tree the best model.
if accuracy_rf > accuracy_dt:
    print("\n✓ BEST MODEL: Random Forest")
elif accuracy_dt > accuracy_rf:
    print("\n✓ BEST MODEL: Decision Tree")
else:
    print("\n✓ BEST MODEL: Tie (both models have the same test accuracy)")

## Summary & Next Steps

### What We Accomplished:
✅ Built a linear regression model to predict salary
✅ Evaluated regression with R² and RMSE
✅ Built decision tree classifier
✅ Built random forest classifier
✅ Evaluated classification with Accuracy, Precision, Recall, F1
✅ Compared multiple models

### Key Takeaways:
1. Always split data into train and test sets
2. Scale features before training (fit the scaler on the training set only)
3. Choose appropriate metrics for your problem
4. Compare multiple models before selecting the best one
5. Interpret results in business context

### Next Week:
Week 6: MLOps & Model Improvement - Hyperparameter tuning, cross-validation, and deployment prep!