# Step 3: Data Preprocessing & Feature Engineering

## Goal
Prepare BIST stock data for machine learning models by:
- Cleaning and handling missing values
- Engineering technical indicators and features
- Creating target variables for prediction
- Scaling and normalizing features
- Splitting data for training/validation

## Focus: BIST-100 Stock Analysis
We'll work with the available stock data to create features for trend prediction, volatility forecasting, and price movement classification.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import warnings
# Silence library warnings so the cell output stays readable.
warnings.filterwarnings('ignore')

# Plot defaults applied to every figure drawn later in the notebook.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Locate the project root: one level up when the working directory is
# notebooks/, otherwise the working directory itself.
current_dir = Path().resolve()
project_root = current_dir.parent if current_dir.name == "notebooks" else current_dir

data_raw_dir = project_root / "data" / "raw"
data_processed_dir = project_root / "data" / "processed"
reports_dir = project_root / "reports"

# Output directories are created up front from their absolute string paths.
data_processed_dir_abs = str(data_processed_dir.resolve())
reports_dir_abs = str(reports_dir.resolve())
for output_dir in (data_processed_dir_abs, reports_dir_abs):
    os.makedirs(output_dir, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
print("‚úÖ Preprocessing setup complete!")
print(f"   Project root: {project_root}")
print(f"   Raw data dir: {data_raw_dir}")
print(f"   Processed data dir (absolute): {data_processed_dir_abs}")
print(f"   Raw data exists: {data_raw_dir.exists()}")
print(f"   Stock file exists: {(data_raw_dir / 'bist_stock_prices.csv').exists()}")
print(f"   Processed dir created: {os.path.exists(data_processed_dir_abs)}")

✅ Preprocessing setup complete!


## 1. Load Stock Data

In [None]:
# Load the raw BIST price file and reduce it to a single, date-ordered series.
# Defines `df`, which every later cell depends on.
stock_file = data_raw_dir / "bist_stock_prices.csv"

print(f"üìÇ Looking for file: {stock_file}")
print(f"   File exists: {stock_file.exists()}")

if not stock_file.exists():
    # Print the diagnostics, then stop: continuing would leave `df` undefined
    # and make every later cell fail with an unrelated NameError.
    print("‚ùå Error: bist_stock_prices.csv not found!")
    print(f"   Expected location: {stock_file}")
    print(f"   Current working directory: {Path.cwd()}")
    print(f"   Project root: {project_root}")
    print(f"   Raw data directory: {data_raw_dir}")
    print(f"   Please ensure the file exists at the expected location.")
    raise FileNotFoundError(f"bist_stock_prices.csv not found at {stock_file}")

print(f"üìä Loading data from: {stock_file}")
df = pd.read_csv(stock_file)
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

print(f"\n   Initial dataset shape: {df.shape}")

if 'Ticker' in df.columns:
    unique_tickers = df['Ticker'].unique()
    print(f"   Available tickers: {unique_tickers}")

    # Prefer the BIST-100 index when the file holds several tickers.
    if 'XU100.IS' in unique_tickers:
        df = df[df['Ticker'] == 'XU100.IS'].copy().reset_index(drop=True)
        print(f"‚úÖ Loaded BIST-100 index data (XU100.IS)")
    else:
        # Fall back to the ticker with the most rows; value_counts() sorts
        # descending, so the first index entry is the largest group.
        ticker_counts = df['Ticker'].value_counts()
        selected_ticker = ticker_counts.index[0]
        df = df[df['Ticker'] == selected_ticker].copy().reset_index(drop=True)
        print(f"‚ö†Ô∏è  BIST-100 (XU100.IS) not found in data.")
        print(f"   Available tickers: {', '.join(unique_tickers[:5])}...")
        print(f"   Using ticker with most data: {selected_ticker} ({ticker_counts[selected_ticker]} records)")
else:
    print("‚úÖ Loaded data (no ticker column - single stock/index)")

print(f"\n   Final dataset shape: {df.shape}")
print(f"   Date range: {df['Date'].min().date()} to {df['Date'].max().date()}")
print(f"   Columns: {df.columns.tolist()}")

✅ Loaded data for AKBNK.IS
   Dataset shape: (6609, 9)
   Date range: 2000-05-10 to 2026-01-16
   Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits', 'Ticker']


## 2. Data Cleaning & Missing Values

In [3]:
# Report missing values per column and fill gaps in the price/volume columns.
print("="*60)
print("MISSING VALUES CHECK")
print("="*60)
missing_count = df.isnull().sum()
missing_pct = (missing_count / len(df)) * 100
missing_df = pd.DataFrame({
    'Column': missing_count.index,
    'Missing Count': missing_count.values,
    'Missing %': missing_pct.values
})
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)

if len(missing_df) > 0:
    print("\n‚ö†Ô∏è  Missing values found:")
    display(missing_df)

    # Prices: carry the last known value forward, then back-fill any gap left
    # at the very start of the series.
    # .ffill()/.bfill() replace fillna(method=...), which is deprecated since
    # pandas 2.1 (the warning is hidden by the notebook's warnings filter).
    price_cols = ['Open', 'High', 'Low', 'Close']
    for col in price_cols:
        if col in df.columns:
            df[col] = df[col].ffill().bfill()

    # Volume: back-fill first, then forward-fill any gap left at the end.
    if 'Volume' in df.columns:
        df['Volume'] = df['Volume'].bfill().ffill()

    print("\n‚úÖ Missing values handled (forward/backward fill)")
else:
    print("\n‚úÖ No missing values found!")

MISSING VALUES CHECK

✅ No missing values found!


## 3. Feature Engineering - Technical Indicators

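This section derives technical indicators from the selected price series (the BIST-100 index when available, otherwise the ticker with the most rows): simple and exponential moving averages, MACD, RSI, Bollinger Bands, ATR, price and volume ratios, momentum, lagged values, and rolling statistics.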

In [4]:
# Build the feature frame on a date-ordered copy so that every rolling and
# exponentially weighted window below follows chronological order.
df_features = df.copy().sort_values('Date').reset_index(drop=True)

print("üìä Calculating technical indicators...")

close_px = df_features['Close']

# === MOVING AVERAGES ===
# Simple moving averages of the close over several look-back windows.
for sma_window in (5, 10, 20, 50, 200):
    df_features[f'SMA_{sma_window}'] = close_px.rolling(window=sma_window).mean()

# Exponential moving averages (recursive form, adjust=False).
for ema_span in (12, 26, 50):
    df_features[f'EMA_{ema_span}'] = close_px.ewm(span=ema_span, adjust=False).mean()

# Crossover spreads: faster average minus slower average.
for spread_name, fast_col, slow_col in (
    ('SMA_Cross_5_20', 'SMA_5', 'SMA_20'),
    ('SMA_Cross_20_50', 'SMA_20', 'SMA_50'),
    ('EMA_Cross', 'EMA_12', 'EMA_26'),
):
    df_features[spread_name] = df_features[fast_col] - df_features[slow_col]

# === MACD ===
# MACD line (same spread as EMA_Cross), its 9-span EMA signal line, and the
# histogram of the difference between the two.
macd_line = df_features['EMA_12'] - df_features['EMA_26']
macd_signal = macd_line.ewm(span=9, adjust=False).mean()
df_features['MACD'] = macd_line
df_features['MACD_Signal'] = macd_signal
df_features['MACD_Histogram'] = macd_line - macd_signal
# === RSI ===
def calculate_rsi(prices, period=14):
    """Return the Relative Strength Index of *prices* as a Series.

    Gains and losses are averaged with a simple rolling mean over *period*
    price changes.

    Args:
        prices: pandas Series of prices in chronological order.
        period: number of price changes averaged per RSI value.

    Returns:
        Series aligned with *prices*. The first *period* entries are NaN
        because a full window of real price changes is not yet available.
        A window with gains but no losses yields 100; a window with no
        price movement at all yields NaN.
    """
    delta = prices.diff()
    # clip() keeps the undefined first difference as NaN, so the first window
    # is not diluted by a fabricated zero change.
    gains = delta.clip(lower=0)
    losses = (-delta).clip(lower=0)
    avg_gain = gains.rolling(window=period).mean()
    avg_loss = losses.rolling(window=period).mean()
    # Algebraically equal to 100 - 100 / (1 + avg_gain / avg_loss), but does
    # not pass through an infinite ratio when avg_loss is zero.
    return 100 * avg_gain / (avg_gain + avg_loss)

# Column aliases used throughout this cell; none of these columns is modified.
px_open = df_features['Open']
px_high = df_features['High']
px_low = df_features['Low']
px_close = df_features['Close']

# 14-period RSI plus binary flags for readings above 70 / below 30.
rsi_14 = calculate_rsi(px_close, period=14)
df_features['RSI'] = rsi_14
df_features['RSI_Overbought'] = (rsi_14 > 70).astype(int)
df_features['RSI_Oversold'] = (rsi_14 < 30).astype(int)

# === BOLLINGER BANDS ===
# 20-row mean with bands two rolling standard deviations either side.
bb_rolling = px_close.rolling(window=20)
df_features['BB_Middle'] = bb_rolling.mean()
bb_std = bb_rolling.std()
df_features['BB_Upper'] = df_features['BB_Middle'] + (bb_std * 2)
df_features['BB_Lower'] = df_features['BB_Middle'] - (bb_std * 2)
df_features['BB_Width'] = df_features['BB_Upper'] - df_features['BB_Lower']
# Position of the close inside the band: 0 at the lower band, 1 at the upper.
df_features['BB_Position'] = (px_close - df_features['BB_Lower']) / (df_features['BB_Upper'] - df_features['BB_Lower'])
# Squeeze flag: band width below 80% of its own 20-row average.
bb_width_avg = df_features['BB_Width'].rolling(20).mean()
df_features['BB_Squeeze'] = (df_features['BB_Width'] < bb_width_avg * 0.8).astype(int)

# === ATR (Volatility) ===
# True range is the row-wise maximum of the three distances below; the
# intermediate columns are kept in the frame.
prev_close = px_close.shift()
df_features['High_Low'] = px_high - px_low
df_features['High_Close'] = abs(px_high - prev_close)
df_features['Low_Close'] = abs(px_low - prev_close)
df_features['True_Range'] = df_features[['High_Low', 'High_Close', 'Low_Close']].max(axis=1)
for atr_window in (14, 21):
    df_features[f'ATR_{atr_window}'] = df_features['True_Range'].rolling(window=atr_window).mean()

# === PRICE-BASED FEATURES ===
df_features['Price_Change'] = px_close.diff()
df_features['Price_Change_Pct'] = px_close.pct_change() * 100
df_features['High_Low_Pct'] = ((px_high - px_low) / px_close) * 100
df_features['Open_Close_Pct'] = ((px_close - px_open) / px_open) * 100
# Where the close sits inside the day's range (undefined when High == Low).
df_features['Price_Position'] = (px_close - px_low) / (px_high - px_low)

# === VOLUME INDICATORS ===
if 'Volume' in df_features.columns:
    volume = df_features['Volume']
    df_features['Volume_SMA_20'] = volume.rolling(window=20).mean()
    df_features['Volume_Ratio'] = volume / df_features['Volume_SMA_20']
    df_features['Volume_Change'] = volume.pct_change()
    df_features['Price_Volume_Trend'] = df_features['Price_Change_Pct'] * df_features['Volume_Ratio']

# === MOMENTUM INDICATORS ===
# Percentage change of the close over 5, 10 and 20 rows.
for momentum_span in (5, 10, 20):
    df_features[f'Momentum_{momentum_span}'] = px_close.pct_change(periods=momentum_span) * 100

# === LAG FEATURES ===
# Past closes and past daily returns, shifted so each row sees only history.
for lag in [1, 2, 3, 5, 10]:
    df_features[f'Close_Lag_{lag}'] = px_close.shift(lag)
    df_features[f'Return_Lag_{lag}'] = df_features['Price_Change_Pct'].shift(lag)

# === ROLLING STATISTICS ===
for window in [5, 10, 20]:
    rolling_close = px_close.rolling(window=window)
    df_features[f'Rolling_Std_{window}'] = rolling_close.std()
    df_features[f'Rolling_Mean_{window}'] = rolling_close.mean()
    df_features[f'Rolling_Max_{window}'] = rolling_close.max()
    df_features[f'Rolling_Min_{window}'] = rolling_close.min()

print(f"‚úÖ Technical indicators calculated!")
print(f"   Total features: {len(df_features.columns)}")
print(f"   New features added: {len(df_features.columns) - len(df.columns)}")

📊 Calculating technical indicators...
✅ Technical indicators calculated!
   Total features: 72
   New features added: 63


In [5]:
# Build the prediction targets. Each one looks forward in time via a negative
# shift, so the last rows of the frame have no target value.
close_today = df_features['Close']

# 1. Next-day return as a fraction and as a percentage (regression targets).
next_day_return = close_today.shift(-1) / close_today - 1
df_features['Target_Return'] = next_day_return
df_features['Target_Return_Pct'] = next_day_return * 100

# 2. Next-day direction: 1 when the return is positive, otherwise 0.
#    A missing return compares False and is therefore also labelled 0.
df_features['Target_Direction'] = (next_day_return > 0).astype(int)

# 3. Next-day volatility: tomorrow's ATR_14 value.
df_features['Target_Volatility'] = df_features['ATR_14'].shift(-1)

# 4. Four-class movement label from the percentage return:
#    0 = strong down (<= -1), 1 = down (<= 0), 2 = up (<= 1), 3 = strong up.
return_bins = [-np.inf, -1, 0, 1, np.inf]
df_features['Target_Class'] = pd.cut(
    df_features['Target_Return_Pct'],
    bins=return_bins,
    labels=[0, 1, 2, 3]
).astype(float)

# 5. Multi-step targets: future close and percentage return per horizon.
for horizon in [5, 10, 20]:
    future_close = close_today.shift(-horizon)
    df_features[f'Target_Price_{horizon}d'] = future_close
    df_features[f'Target_Return_{horizon}d'] = (future_close / close_today - 1) * 100

for summary_line in (
    "‚úÖ Target variables created:",
    "   - Target_Return: Next day return (regression)",
    "   - Target_Direction: Next day direction (binary classification)",
    "   - Target_Volatility: Next day volatility (regression)",
    "   - Target_Class: Multi-class movement (4 classes)",
    "   - Target_Return_5d, 10d, 20d: Multi-step ahead returns",
):
    print(summary_line)

✅ Target variables created:
   - Target_Return: Next day return (regression)
   - Target_Direction: Next day direction (binary classification)
   - Target_Volatility: Next day volatility (regression)
   - Target_Class: Multi-class movement (4 classes)
   - Target_Return_5d, 10d, 20d: Multi-step ahead returns


## 5. Handle Missing Values & Final Cleaning

In [None]:
# Drop every row that is incomplete: rolling-window warm-up rows, rows with
# no future target, and rows where a division produced +/- infinity.
initial_rows = len(df_features)
df_features = (
    df_features
    .replace([np.inf, -np.inf], np.nan)  # treat infinities as missing
    .dropna()
    .reset_index(drop=True)
)
final_rows = len(df_features)
removed_rows = initial_rows - final_rows

print(f"üìä Data cleaning:")
print(f"   Initial rows: {initial_rows:,}")
print(f"   Final rows: {final_rows:,}")
print(f"   Removed: {removed_rows:,} rows ({removed_rows/initial_rows*100:.2f}%)")
print(f"   Date range: {df_features['Date'].min().date()} to {df_features['Date'].max().date()}")

# Sanity check: nothing should be missing after the drop above.
null_counts = df_features.isnull().sum()
remaining_missing = null_counts.sum()
if remaining_missing > 0:
    print(f"\n‚ö†Ô∏è  Warning: {remaining_missing} missing values still present")
    print(null_counts[null_counts > 0])
else:
    print(f"\n‚úÖ No missing values remaining!")

📊 Data cleaning:
   Initial rows: 6,609
   Final rows: 6,181
   Removed: 428 rows (6.48%)
   Date range: 2001-02-13 to 2025-12-18

✅ No missing values remaining!


## 6. Feature Selection & Preparation

In [7]:
# Separate the frame into model inputs and prediction targets.
# Identifier columns are never model inputs.
exclude_cols = ['Date'] + (['Ticker'] if 'Ticker' in df_features.columns else [])

# Every column with the Target_ prefix is a prediction target.
target_cols = [name for name in df_features.columns if name.startswith('Target_')]

# Whatever remains is a feature, kept in the frame's column order.
non_feature_cols = set(exclude_cols) | set(target_cols)
feature_cols = [name for name in df_features.columns if name not in non_feature_cols]

print(f"üìä Feature Selection:")
print(f"   Total columns: {len(df_features.columns)}")
print(f"   Feature columns: {len(feature_cols)}")
print(f"   Target columns: {len(target_cols)}")
print(f"\n   Sample features: {', '.join(feature_cols[:10])}...")
print(f"\n   Targets: {', '.join(target_cols)}")

# Independent copies, so later edits cannot write back into df_features.
X = df_features.loc[:, feature_cols].copy()
y_return = df_features['Target_Return'].copy()
y_direction = df_features['Target_Direction'].copy()
y_volatility = df_features['Target_Volatility'].copy()
y_class = df_features['Target_Class'].copy()

print(f"\n‚úÖ Feature matrices created:")
print(f"   X shape: {X.shape}")
print(f"   y_return shape: {y_return.shape}")
print(f"   y_direction shape: {y_direction.shape}")

📊 Feature Selection:
   Total columns: 83
   Feature columns: 70
   Target columns: 11

   Sample features: Open, High, Low, Close, Volume, Dividends, Stock Splits, SMA_5, SMA_10, SMA_20...

   Targets: Target_Return, Target_Return_Pct, Target_Direction, Target_Volatility, Target_Class, Target_Price_5d, Target_Return_5d, Target_Price_10d, Target_Return_10d, Target_Price_20d, Target_Return_20d

✅ Feature matrices created:
   X shape: (6181, 70)
   y_return shape: (6181,)
   y_direction shape: (6181,)


## 7. Feature Scaling

In [8]:
# Standardize the features (zero mean, unit variance on the training period).
#
# The scaler is fitted on the chronologically first 80% of rows only and then
# applied to every row. Fitting on the whole series would leak test-period
# means and variances into the training features. The fraction must match
# the ratio used by the chronological train/test split.
SCALER_FIT_FRACTION = 0.8
n_scaler_fit_rows = int(len(X) * SCALER_FIT_FRACTION)

scaler = StandardScaler()
scaler.fit(X.iloc[:n_scaler_fit_rows])

# X_scaled keeps X's columns and index so the targets stay aligned with it.
X_scaled = pd.DataFrame(
    scaler.transform(X),
    columns=X.columns,
    index=X.index
)

print("‚úÖ Features scaled using StandardScaler")
print(f"   Scaled X shape: {X_scaled.shape}")
print(f"   Scaler fitted on first {n_scaler_fit_rows:,} rows (training period only)")
print(f"\n   Sample statistics (first 5 features):")
print(X_scaled.iloc[:, :5].describe().T[['mean', 'std', 'min', 'max']])

ValueError: Input X contains infinity or a value too large for dtype('float64').

## 8. Train-Test Split (Time Series Aware)

In [None]:
# Chronological split: the first 80% of rows train the model and the last
# 20% test it. Rows are never shuffled, so the test set is strictly later.
split_idx = int(len(X_scaled) * 0.8)
train_rows = slice(None, split_idx)
test_rows = slice(split_idx, None)

X_train, X_test = X_scaled.iloc[train_rows].copy(), X_scaled.iloc[test_rows].copy()

y_train_return, y_test_return = y_return.iloc[train_rows].copy(), y_return.iloc[test_rows].copy()
y_train_direction, y_test_direction = y_direction.iloc[train_rows].copy(), y_direction.iloc[test_rows].copy()
y_train_volatility, y_test_volatility = y_volatility.iloc[train_rows].copy(), y_volatility.iloc[test_rows].copy()
y_train_class, y_test_class = y_class.iloc[train_rows].copy(), y_class.iloc[test_rows].copy()

# Dates covered by each part, used only for the report below.
train_dates = df_features['Date'].iloc[train_rows]
test_dates = df_features['Date'].iloc[test_rows]


def _report_split(header, features, dates):
    """Print the size, date range and share of one part of the split."""
    print(header)
    print(f"   Samples: {len(features):,}")
    print(f"   Date range: {dates.min().date()} to {dates.max().date()}")
    print(f"   Percentage: {len(features)/len(X_scaled)*100:.1f}%")


print("="*60)
print("TRAIN-TEST SPLIT")
print("="*60)
_report_split(f"\nüìä Training Set:", X_train, train_dates)
_report_split(f"\nüìä Test Set:", X_test, test_dates)

print(f"\n‚úÖ Data split complete!")

## 9. Save Processed Data

In [None]:
# Write the engineered dataset, the train/test splits and the feature
# metadata to the processed-data directory as CSV files.
def _processed_path(filename):
    """Return the absolute path of *filename* inside the processed-data directory."""
    return os.path.join(data_processed_dir_abs, filename)


def _feature_type(col):
    """Classify a feature column by name; the first matching rule wins."""
    if any(x in col for x in ['SMA', 'EMA', 'RSI', 'MACD', 'BB', 'ATR']):
        return 'Technical'
    if col in ['Open', 'High', 'Low', 'Close']:
        return 'Price'
    if 'Volume' in col:
        return 'Volume'
    return 'Derived'


print("üíæ Saving processed data...")
print(f"   Saving to: {data_processed_dir_abs}")

# Re-create the directory in case it was removed since the setup cell ran.
os.makedirs(data_processed_dir_abs, exist_ok=True)

# Full feature-engineered dataset.
full_dataset_path = _processed_path("bist_features_full.csv")
df_features.to_csv(full_dataset_path, index=False)
print(f"   ‚úÖ Full dataset: {full_dataset_path}")

# Scaled feature matrices.
X_train_path = _processed_path("X_train.csv")
X_test_path = _processed_path("X_test.csv")
for feature_frame, destination in ((X_train, X_train_path), (X_test, X_test_path)):
    feature_frame.to_csv(destination, index=False)
print(f"   ‚úÖ Feature matrices: X_train.csv, X_test.csv")

# Targets, one frame per split, with identical column layout.
targets_train = pd.DataFrame({
    'Target_Return': y_train_return,
    'Target_Direction': y_train_direction,
    'Target_Volatility': y_train_volatility,
    'Target_Class': y_train_class
})
targets_test = pd.DataFrame({
    'Target_Return': y_test_return,
    'Target_Direction': y_test_direction,
    'Target_Volatility': y_test_volatility,
    'Target_Class': y_test_class
})

y_train_path = _processed_path("y_train.csv")
y_test_path = _processed_path("y_test.csv")
for target_frame, destination in ((targets_train, y_train_path), (targets_test, y_test_path)):
    target_frame.to_csv(destination, index=False)
print(f"   ‚úÖ Target variables: y_train.csv, y_test.csv")

# Feature names with a coarse type label, for reference.
feature_info = pd.DataFrame({
    'Feature_Name': feature_cols,
    'Feature_Type': [_feature_type(col) for col in feature_cols]
})
feature_info_path = _processed_path("feature_info.csv")
feature_info.to_csv(feature_info_path, index=False)
print(f"   ‚úÖ Feature info: feature_info.csv")

# Confirm each expected file exists on disk and report its size.
print(f"\nüìÇ Verifying saved files:")
saved_files = ["bist_features_full.csv", "X_train.csv", "X_test.csv", "y_train.csv", "y_test.csv", "feature_info.csv"]
for file in saved_files:
    file_path = _processed_path(file)
    if not os.path.exists(file_path):
        print(f"   ‚ùå {file} - NOT FOUND!")
        continue
    file_size = os.path.getsize(file_path) / 1024  # bytes -> KB
    print(f"   ‚úÖ {file} ({file_size:.2f} KB)")

print(f"\n‚úÖ All processed data saved to: {data_processed_dir_abs}")

## 10. Summary & Next Steps

In [None]:
# Print a recap of the run: row counts, feature categories, targets, files.
print("="*60)
print("PREPROCESSING SUMMARY")
print("="*60)

print(f"\nüìä Dataset Statistics:")
print(f"   Original rows: {initial_rows:,}")
print(f"   Processed rows: {len(df_features):,}")
print(f"   Features created: {len(feature_cols)}")
print(f"   Target variables: {len(target_cols)}")

print(f"\nüìà Feature Categories:")
# Name-based membership rules. A feature may match more than one category.
category_rules = {
    'Moving Averages': lambda col: 'SMA' in col or 'EMA' in col,
    'Momentum': lambda col: 'Momentum' in col or 'RSI' in col or 'MACD' in col,
    'Volatility': lambda col: 'ATR' in col or 'Std' in col or 'BB' in col,
    'Volume': lambda col: 'Volume' in col,
    'Price': lambda col: col in ['Open', 'High', 'Low', 'Close'],
    'Lags': lambda col: 'Lag' in col,
}
feature_categories = {
    name: [col for col in feature_cols if rule(col)]
    for name, rule in category_rules.items()
}
# 'Other' collects every feature that matched none of the rules above.
categorized_cols = {col for members in feature_categories.values() for col in members}
feature_categories['Other'] = [col for col in feature_cols if col not in categorized_cols]

for category, features in feature_categories.items():
    if features:
        print(f"   {category}: {len(features)} features")

print(f"\nüéØ Target Variables:")
print(f"   - Target_Return: Next day return (regression)")
print(f"   - Target_Direction: Next day direction (binary classification)")
print(f"   - Target_Volatility: Next day volatility (regression)")
print(f"   - Target_Class: Multi-class movement (4 classes)")

print(f"\nüì¶ Data Files Saved:")
print(f"   - bist_features_full.csv: Complete feature-engineered dataset")
print(f"   - X_train.csv, X_test.csv: Scaled feature matrices")
print(f"   - y_train.csv, y_test.csv: Target variables")
print(f"   - feature_info.csv: Feature metadata")

print("\n" + "="*60)
print("‚úÖ PREPROCESSING COMPLETE!")
print("="*60)
print("\nüìã Next Steps:")
print("   1. Review feature distributions and correlations")
print("   2. Train machine learning models (regression & classification)")
print("   3. Evaluate model performance on test set")
print("   4. Feature importance analysis")
print("   5. Model tuning and optimization")
print("\nüí° All processed data is ready for model training!")