# 1. Import Libraries
This notebook provides a generic template for data analysis and modeling. Adjust steps as needed for your specific dataset and task (regression or classification).

In [None]:
# Core libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report, mean_squared_error, r2_score)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import joblib

# Pandas display: show up to 100 columns and allow 120-character-wide output
display_settings = {'display.max_columns': 100, 'display.width': 120}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# Seaborn theme applied to every plot drawn later in the notebook
sns.set_theme(style='whitegrid', context='notebook')

print('Libraries imported.')

# 2. Load Dataset
Load a dataset from CSV (or adapt to read from a database / API). Update `DATA_PATH` as needed.

In [None]:
# Location of the input CSV. Override it through the DATA_PATH environment
# variable, or replace this cell with database / API loading logic.
DATA_PATH = os.getenv('DATA_PATH', 'data_extracts/stocks/stocks_sample.csv')

if os.path.exists(DATA_PATH):
    df = pd.read_csv(DATA_PATH)
    print('Loaded dataset:', DATA_PATH)
else:
    # Fall back to an empty frame so the later cells can detect it and skip.
    print(f"WARNING: {DATA_PATH} not found. Provide a valid CSV path.")
    df = pd.DataFrame()

print('Shape:', df.shape)
df.head()

# 3. Inspect Raw Data
Basic structure, dtypes, and initial statistics.

In [None]:
if not df.empty:
    # First rows, column dtypes, and summary statistics of the numeric columns
    display(df.head())
    print('\nDtypes:')
    print(df.dtypes)
    print('\nDescribe (numeric):')
    display(df.describe().T)

    # Only object-typed columns with fewer than 50 distinct values are
    # summarised; for each one the 10 most frequent values are printed.
    cat_cols = [
        col_name
        for col_name, col_values in df.items()
        if col_values.dtype == 'object' and col_values.nunique() < 50
    ]
    for col_name in cat_cols:
        print(f'\nValue counts for {col_name}:')
        print(df[col_name].value_counts().head(10))
else:
    print('DataFrame empty; skip inspection.')

# 4. Data Cleaning
Remove duplicate rows, handle missing values, and flag simple outliers.

In [None]:
if not df.empty:
    # Drop exact duplicate rows and report how many were removed.
    rows_before = len(df)
    df = df.drop_duplicates()
    print(f'Removed {rows_before - len(df)} duplicate rows.')

    # Drop every column whose share of missing values exceeds 40%.
    missing_ratio = df.isna().mean()
    cols_drop = missing_ratio[missing_ratio > 0.4].index.tolist()
    if cols_drop:
        print('Dropping high-missing columns:', cols_drop)
        df = df.drop(columns=cols_drop)

    # Impute whatever is still missing in numeric columns with the column median.
    num_cols = df.select_dtypes(include=[np.number]).columns
    col_medians = df[num_cols].median()
    df[num_cols] = df[num_cols].fillna(col_medians)

    # Count (but do not remove) rows where any numeric column has |z-score| > 4.
    from scipy import stats
    zscores = np.abs(stats.zscore(df[num_cols], nan_policy='omit'))
    if isinstance(zscores, np.ndarray):
        extreme_mask = (zscores > 4).any(axis=1)
        print('Extreme outlier rows:', extreme_mask.sum())

    print('Post-cleaning shape:', df.shape)
else:
    print('Skip cleaning (empty df).')

# 5. Feature Engineering
Create/encode features and prepare data for modeling.

In [None]:
target_column = None  # Set to your target column name if supervised learning

# Defined up front so the names exist even when df is empty and the
# branch below is skipped.
categorical_cols = []
numeric_cols = []

if df.empty:
    print('Skip feature engineering (empty df).')
else:
    # Example: create simple numeric ratios if price columns present
    if {'high','low','close'}.issubset(df.columns):
        df['hl_range'] = df['high'] - df['low']
        # Dividing by a zero 'high' would produce inf, which scikit-learn
        # estimators reject. Mask zeros to NaN instead; the train/test split
        # cell fills NaN features with 0.
        nonzero_high = df['high'].where(df['high'] != 0)
        df['close_to_high_pct'] = (df['high'] - df['close']) / nonzero_high * 100

    # Identify categorical and numeric columns.
    # Numeric uses the same select_dtypes('number') rule as the train/test
    # split cell, so bool/datetime columns are not mislabelled as numeric.
    categorical_cols = [c for c in df.columns if df[c].dtype == 'object']
    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()

    # The target is never a feature, whichever group it falls into.
    if target_column and target_column in categorical_cols:
        categorical_cols.remove(target_column)
    if target_column and target_column in numeric_cols:
        numeric_cols.remove(target_column)

    print('Categorical:', categorical_cols)
    print('Numeric:', numeric_cols[:10], '...')


# 6. Train / Test Split
We treat the task as classification when the target has few unique values (at most 15) and as regression otherwise, then split the data into train and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Modeling only makes sense when a target is configured AND present in df.
# The same condition guards the baseline and grid-search cells.
is_classification = False
target_available = bool(target_column) and target_column in df.columns

if target_available:
    target_series = df[target_column]
    # Heuristic: classification if the target has <= 15 unique values
    is_classification = target_series.nunique() <= 15
elif target_column:
    print(f"target_column '{target_column}' not found in df; subsequent modeling sections will be skipped.")
else:
    print('No target_column set; subsequent modeling sections will be skipped unless you define one.')

if target_available:
    # Features: every numeric column except the target, remaining NaNs set to 0.
    feature_cols = [c for c in df.columns if c != target_column]
    X = df[feature_cols].select_dtypes(include=['number']).fillna(0)
    y = df[target_column]
    # Rows are shuffled only when the index is NOT monotonically increasing;
    # with an ordered index the split keeps row order (presumably to respect
    # time ordering -- confirm for your dataset).
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        shuffle=not bool(df.index.is_monotonic_increasing),
        random_state=42,
    )
    print('Train shape:', X_train.shape, 'Test shape:', X_test.shape, 'Classification?' , is_classification)


# 7. Baseline Model
Simple baseline (LinearRegression or LogisticRegression).

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error
import numpy as np

# Fit a simple reference model (logistic or linear regression) and record
# its test-set metrics in `baseline_metrics`.
baseline_model = None
baseline_metrics = {}

if target_column and target_column in df.columns:
    if is_classification:
        # Object (string) targets are mapped to integer category codes.
        # The train/test portions are selected by index LABEL (.loc): the
        # index is no longer 0..n-1 once duplicate rows have been dropped,
        # so positional .iloc would pick the wrong rows or go out of bounds.
        if y.dtype == 'object':
            y_enc = y.astype('category').cat.codes
            y_train_fit = y_enc.loc[y_train.index]
            y_test_eval = y_enc.loc[y_test.index]
        else:
            y_enc = y
            y_train_fit, y_test_eval = y_train, y_test

        baseline_model = LogisticRegression(max_iter=500)
        baseline_model.fit(X_train, y_train_fit)
        y_pred = baseline_model.predict(X_test)
        baseline_metrics['accuracy'] = accuracy_score(y_test_eval, y_pred)
        baseline_metrics['f1_macro'] = f1_score(y_test_eval, y_pred, average='macro')
    else:
        baseline_model = LinearRegression()
        baseline_model.fit(X_train, y_train)
        y_pred = baseline_model.predict(X_test)
        baseline_metrics['mae'] = mean_absolute_error(y_test, y_pred)
        # RMSE via sqrt(MSE): the `squared=False` keyword is deprecated and
        # has been removed from recent scikit-learn releases.
        baseline_metrics['rmse'] = float(np.sqrt(mean_squared_error(y_test, y_pred)))

print('Baseline metrics:', baseline_metrics if baseline_metrics else 'No target / modeling skipped')


# 8. Advanced Model & Hyperparameter Tuning
Grid search over RandomForest for improved performance.

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Imported here so this cell does not depend on the baseline cell having run
# (mean_absolute_error is not part of the notebook's top import cell).
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error

# Tune a random forest with a small 3-fold grid search and record the
# refit best estimator (`gs_best`) and its test-set metrics (`gs_metrics`).
gs_best = None
gs_metrics = {}

if target_column and target_column in df.columns:
    if is_classification:
        model = RandomForestClassifier(random_state=42, n_jobs=-1)
        param_grid = {'n_estimators': [50, 100], 'max_depth': [5, None]}
    else:
        model = RandomForestRegressor(random_state=42, n_jobs=-1)
        param_grid = {'n_estimators': [100, 200], 'max_depth': [5, None]}

    grid = GridSearchCV(
        model,
        param_grid,
        cv=3,
        scoring='accuracy' if is_classification else 'neg_root_mean_squared_error',
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)
    gs_best = grid.best_estimator_
    y_pred = gs_best.predict(X_test)
    if is_classification:
        gs_metrics['accuracy'] = accuracy_score(y_test, y_pred)
        gs_metrics['f1_macro'] = f1_score(y_test, y_pred, average='macro')
    else:
        # RMSE via sqrt(MSE): the `squared=False` keyword is deprecated and
        # has been removed from recent scikit-learn releases.
        gs_metrics['rmse'] = float(np.sqrt(mean_squared_error(y_test, y_pred)))
        gs_metrics['mae'] = mean_absolute_error(y_test, y_pred)

# `grid` is only read when metrics exist, i.e. when the search actually ran.
print('GridSearch best params:' , (grid.best_params_ if gs_metrics else 'N/A'))
print('GridSearch metrics:', gs_metrics if gs_metrics else 'Skipped')


# 9. Visualization of Results
Feature importances, residuals (regression) or confusion matrix (classification).

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

if gs_best is None:
    print('Skip visualization (no tuned model).')
else:
    # Both diagnostic plots start from the tuned model's test-set predictions.
    y_pred = gs_best.predict(X_test)

    if is_classification:
        # Confusion matrix of true vs. predicted labels, drawn as a heatmap
        cm = confusion_matrix(y_test, y_pred)
        fig, ax = plt.subplots(figsize=(5, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title('Confusion Matrix')
        plt.show()
    else:
        # Residuals (actual - predicted) against predictions, with a zero line
        residuals = y_test - y_pred
        fig, ax = plt.subplots(figsize=(5, 4))
        sns.scatterplot(x=y_pred, y=residuals, s=20, ax=ax)
        ax.axhline(0, color='red', linestyle='--')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Residuals')
        ax.set_title('Residual Plot')
        plt.show()

    # Top 20 feature importances, when the estimator exposes them
    if hasattr(gs_best, 'feature_importances_'):
        importances = gs_best.feature_importances_
        fi_df = pd.DataFrame({'feature': X_train.columns, 'importance': importances})
        fi_df = fi_df.sort_values('importance', ascending=False).head(20)
        fig, ax = plt.subplots(figsize=(6, 6))
        sns.barplot(data=fi_df, x='importance', y='feature', ax=ax)
        ax.set_title('Top Feature Importances')
        plt.show()


# 10. Save Model & Artifacts
Persist the tuned model and optionally export processed data.

In [None]:
import joblib, os, datetime as dt

# Everything this cell writes goes under ./artifacts (created if missing).
artifacts_dir = 'artifacts'
os.makedirs(artifacts_dir, exist_ok=True)

# UTC timestamp shared by the model and data file names.
# datetime.utcnow() is deprecated since Python 3.12; the timezone-aware
# now(timezone.utc) produces the same formatted string.
timestamp = dt.datetime.now(dt.timezone.utc).strftime('%Y%m%d_%H%M%S')
if gs_best is not None:
    model_path = os.path.join(artifacts_dir, f'model_{timestamp}.joblib')
    joblib.dump(gs_best, model_path)
    print('Saved model to', model_path)
else:
    print('No tuned model to save.')

# The processed frame is exported with its index whenever it is non-empty.
processed_path = os.path.join(artifacts_dir, f'processed_{timestamp}.csv')
if not df.empty:
    df.to_csv(processed_path, index=True)
    print('Saved processed data to', processed_path)


# 11. Next Steps
- Refine feature engineering (technical indicators, rolling stats)
- Integrate with live pipeline outputs
- Add model monitoring (drift, performance decay)
- Automate retraining schedule
