# Feature Engineering for Strawberry Price Prediction

This notebook demonstrates the feature engineering process for predicting strawberry prices.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from src.fct_eda import *
from src.fct_feature_eng import *
from src.parameter import get_dict_params

## 1. Load and Split Data

In [None]:
# Read the raw dataset from disk.
df = pd.read_csv('data/raw/senior_ds_test.csv')

# Pull the project configuration and unpack the entries this notebook uses.
dict_params = get_dict_params()
weather_cols, test_start_year = (
    dict_params[param_name] for param_name in ('weather_cols', 'test_start_year')
)

# Partition the rows into training and testing sets.
train_df, test_df = split_train_test(df)

# Report the size of each partition.
for shape_label, split_frame in (
    ("Training set shape:", train_df),
    ("Testing set shape:", test_df),
):
    print(shape_label, split_frame.shape)

Training set shape: (443, 15)
Testing set shape: (78, 15)


## 2. Feature Engineering Process

In [None]:
# Run the feature-engineering pipeline on both splits: the training split
# first (is_training=True), then the test split (is_training=False).
train_processed, test_processed = (
    preprocessing(train_df, is_training=True),
    preprocessing(test_df, is_training=False),
)

  df['price_momentum'] = df['price'].pct_change()
  df['price_momentum'] = df['price'].pct_change()


In [None]:
# Scale features.
# The recorded run of this cell aborted with
# "ValueError: Input X contains infinity or a value too large for
# dtype('float64')", so the processed frames contain +/-inf values
# (presumably from ratio features such as `price_momentum`, which is built
# with `pct_change()` — verify in `preprocessing`). Map them to NaN on a copy
# before scaling; `train_processed` / `test_processed` are left untouched.
# NOTE(review): assumes `scale_df` tolerates NaN inputs — TODO confirm.
# NOTE(review): each split is passed to `scale_df` separately; confirm that
# the test split is scaled with statistics learned from the training split.
non_finite_values = [np.inf, -np.inf]
train_scaled = scale_df(train_processed.replace(non_finite_values, np.nan))
test_scaled = scale_df(test_processed.replace(non_finite_values, np.nan))

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [None]:
# Show the first and last `start_date` covered by each processed split.
print("\nTraining data date range:")
train_start_dates = train_processed['start_date']
print(f"Start: {train_start_dates.min()}, End: {train_start_dates.max()}")

print("\nTesting data date range:")
test_start_dates = test_processed['start_date']
print(f"Start: {test_start_dates.min()}, End: {test_start_dates.max()}")


Training data date range:


NameError: name 'train_processed' is not defined

## 3. Analyze Generated Features

In [None]:
def _columns_containing(columns, keywords):
    """Return the column names that contain at least one of the keywords.

    Parameters
    ----------
    columns : iterable of str
        Candidate column names, scanned in order.
    keywords : iterable of str
        Substrings to look for; a column is kept if any of them occurs in it.

    Returns
    -------
    list of str
        Matching column names, in their original order.
    """
    return [col for col in columns if any(key in col for key in keywords)]


# Display feature categories, grouping the engineered columns by name.
feature_names = list(train_processed.columns)

print("Temporal Features:")
temporal_cols = _columns_containing(feature_names, ['year', 'month', 'week', 'season'])
print(temporal_cols)

print("\nWeather Features:")
# Stored under its own name so the `weather_cols` list loaded from the project
# parameters is not overwritten when this cell runs.
weather_feature_cols = _columns_containing(
    feature_names, ['temp', 'wind', 'cloud', 'precip', 'solar']
)
print(weather_feature_cols)

print("\nPrice Features:")
price_cols = _columns_containing(feature_names, ['price'])
print(price_cols)

## 4. Feature Importance Analysis

In [None]:
# Calculate correlations with price.
# Only numeric/boolean columns are correlated: the processed frame also holds
# columns such as `start_date` (presumably non-numeric — verify), and
# `DataFrame.corr()` raises on non-numeric columns in pandas >= 2.0 instead of
# silently skipping them.
numeric_features = train_processed.select_dtypes(include=['number', 'bool'])
correlations = numeric_features.corr()['price'].sort_values(ascending=False)

# Plot top 20 correlations. The target is removed by label rather than by
# position, so the chart stays correct even if another feature ties with
# `price` at a correlation of 1.0.
top_correlations = correlations.drop('price').head(20)

fig, ax = plt.subplots(figsize=(12, 6))
top_correlations.plot(kind='bar', ax=ax)
ax.set_title('Top 20 Feature Correlations with Price')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 5. Seasonal Pattern Analysis

In [None]:
# Seasonal patterns: a 2x2 grid relating price to calendar and weather features.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One entry per panel: (plotting function, x-axis column, target axes, title).
seasonal_panels = [
    (sns.boxplot, 'month', axes[0, 0], 'Price Distribution by Month'),
    (sns.boxplot, 'season', axes[0, 1], 'Price Distribution by Season'),
    (sns.scatterplot, 'temp', axes[1, 0], 'Price vs Temperature'),
    (sns.scatterplot, 'solar_cloud', axes[1, 1], 'Price vs Effective Solar Radiation'),
]

for draw_panel, x_column, panel_ax, panel_title in seasonal_panels:
    draw_panel(data=train_processed, x=x_column, y='price', ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 6. Price Dynamics Analysis

In [None]:
# Price dynamics: histograms of the engineered price features on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (column, title) pairs, listed in row-major panel order.
price_histograms = [
    ('price_momentum', 'Price Momentum Distribution'),
    ('price_volatility', 'Price Volatility Distribution'),
    ('price_rel_4w_avg', 'Price Relative to 4-week Average'),
    ('price_rel_seasonal', 'Price Relative to Seasonal Average'),
]

# `axes.flat` walks the grid row by row, matching the order above.
for hist_ax, (hist_column, hist_title) in zip(axes.flat, price_histograms):
    sns.histplot(data=train_processed, x=hist_column, ax=hist_ax)
    hist_ax.set_title(hist_title)

plt.tight_layout()
plt.show()

## 7. Key Findings and Next Steps

1. Temporal Patterns:
   - Strong seasonal effects captured through cyclical encoding
   - Weekly patterns important for short-term predictions

2. Weather Impacts:
   - Temperature and solar radiation show significant correlations
   - Weather interactions provide additional insights

3. Price Dynamics:
   - Price momentum and volatility capture market behavior
   - Relative price features help normalize across seasons

4. Feature Importance:
   - Seasonal indicators among top predictors
   - Weather features show varying importance

Next Steps:
1. Feature selection based on correlation analysis
2. Consider polynomial features for weather interactions
3. Experiment with different lag windows
4. Validate feature importance with model-based methods