# Feature Engineering for Strawberry Price Prediction

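This notebook walks through the feature engineering pipeline for predicting strawberry prices: loading the raw data, splitting it into training and test sets, inspecting missing values, preprocessing and scaling the features, checking their correlation with price, and exporting the processed datasets.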

In [None]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
from src.fct_eda import *
from src.fct_feature_eng import *
from src.parameter import get_dict_params

## 1. Load and Split Data

In [None]:
# Fetch the shared parameter dictionary; this notebook only needs the list of
# weather column names from it.
dict_params = get_dict_params()
weather_cols = dict_params['weather_cols']

# Read the raw CSV and pass it to the project's train/test splitter.
data = pd.read_csv('data/raw/senior_ds_test.csv')
train_data, test_data = split_train_test(data)

# Report the (rows, columns) of each split as a quick sanity check.
print("Training set shape:", train_data.shape)
print("Testing set shape:", test_data.shape)

## 2. Analyze Missing Values

In [None]:
# Per-column count of missing entries in the weather columns of the raw training split.
print("Missing values before cleaning:")
print(train_data[weather_cols].isna().sum())

# Run both splits through the project's missing-value handler (training first, then test).
# NOTE(review): train_clean / test_clean are not used by any later cell in this
# notebook -- the preprocessing step starts again from train_data / test_data.
train_clean, test_clean = (
    handle_missing_values(split) for split in (train_data, test_data)
)

# Compare row counts before and after cleaning for each split.
print(
    "\nDataset shapes after cleaning:",
    f"Training set: {train_clean.shape} (removed {train_data.shape[0] - train_clean.shape[0]} rows)",
    f"Testing set: {test_clean.shape} (removed {test_data.shape[0] - test_clean.shape[0]} rows)",
    sep="\n",
)

## 3. Feature Engineering Process

In [None]:
# Run the preprocessing pipeline on each split; the training split is processed
# first with is_training=True, the test split afterwards with is_training=False.
# NOTE(review): the inputs are the raw splits, not train_clean / test_clean --
# presumably preprocessing() handles missing values itself; confirm.
train_processed, test_processed = (
    preprocessing(train_data, is_training=True),
    preprocessing(test_data, is_training=False),
)

In [None]:
# Apply the project's scaler to each processed split (training first, then test).
# NOTE(review): scale_df is called independently per split -- if it fits its
# scaling statistics on whatever frame it receives, train and test end up on
# different scales; confirm against its implementation.
train_scaled, test_scaled = map(scale_df, (train_processed, test_processed))

In [None]:
# Earliest and latest start_date present in each processed split.
print(
    "\nTraining data date range:",
    f"Start: {train_processed['start_date'].min()}, End: {train_processed['start_date'].max()}",
    sep="\n",
)
print(
    "\nTesting data date range:",
    f"Start: {test_processed['start_date'].min()}, End: {test_processed['start_date'].max()}",
    sep="\n",
)

## 4. Analyze Generated Features

In [None]:
# Bucket the processed columns into categories by plain substring matching on
# the column name, so a column is listed whenever its name contains the token.

# Calendar-derived columns.
print("Temporal Features:")
temporal_cols = list(filter(
    lambda name: any(token in name for token in ['year', 'month', 'week', 'season']),
    train_processed.columns,
))
print(temporal_cols)

# Columns whose name contains one of the configured weather column names.
print("\nWeather Features:")
weather_cols2 = list(filter(
    lambda name: any(base in name for base in weather_cols),
    train_processed.columns,
))
print(weather_cols2)

# Columns whose name mentions the price.
print("\nPrice Features:")
price_cols = [name for name in train_processed.columns if 'price' in name]
print(price_cols)

## 5. Feature Importance Analysis

In [None]:
# Correlation of every float-typed column with the target, sorted from the most
# positive to the most negative value (the target's own entry is included).
correlations = train_processed.select_dtypes(include='float').corr()['price'].sort_values(ascending=False)

# Remove the target by label instead of skipping the first position: a feature
# that ties with the target at 1.0 could otherwise sort ahead of 'price', which
# would put the target itself on the chart and push a real feature out.
top_correlations = correlations.drop('price').head(20)

# Plot the 20 most positively correlated features. The ranking is on the signed
# value, so strongly negative correlations do not appear here.
plt.figure(figsize=(12, 6))
top_correlations.plot(kind='bar')
plt.title('Top 20 Feature Correlations with Price')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 6. Export Results

In [None]:
# Make sure the output folder exists so the export also works on a fresh
# checkout; to_csv does not create missing directories.
processed_dir = Path('data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# Write the scaled and unscaled versions of both splits. The DataFrame index is
# written as the first CSV column (to_csv default).
train_scaled.to_csv(processed_dir / 'train_scaled.csv')
train_processed.to_csv(processed_dir / 'train_processed.csv')
test_scaled.to_csv(processed_dir / 'test_scaled.csv')
test_processed.to_csv(processed_dir / 'test_processed.csv')

## 7. Key Findings and Next Steps

1. Data Cleaning:
   - Removed rows with missing values to ensure data quality
   - Excluded summer period (weeks 24-49) due to lack of price data

2. Temporal Patterns:
   - Strong seasonal effects captured through cyclical encoding
   - Weekly patterns important for short-term predictions

3. Weather Impacts:
   - Temperature and solar radiation show significant correlations
   - Weather interactions provide additional insights

4. Price Dynamics:
   - Price momentum and volatility capture market behavior
   - Relative price features help normalize across seasons

5. Feature Importance:
   - Seasonal indicators among top predictors
   - Weather features show varying importance

Next Steps:
1. Feature selection based on correlation analysis
2. Consider polynomial features for weather interactions
3. Experiment with different lag windows
4. Validate feature importance with model-based methods