In [None]:
import io
import logging
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Configure output locations and logging.
# logging.basicConfig(filename=...) opens the log file right away and
# plt.savefig() does not create missing folders, so on a fresh checkout a
# missing ../logs or ../images directory raises FileNotFoundError. Create
# both up front so the notebook survives Restart & Run All.
LOG_FILE = Path('../logs/pipeline.log')
IMAGE_DIR = Path('../images')
for output_dir in (LOG_FILE.parent, IMAGE_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)


In [None]:
# Load data
def load_data(filepath):
    """Read the hyperspectral CSV at ``filepath`` into a DataFrame.

    Parameters
    ----------
    filepath : path or buffer accepted by ``pandas.read_csv``
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file. The frame's shape is written to
        the log at INFO level before it is returned.
    """
    frame = pd.read_csv(filepath)
    loaded_shape = frame.shape
    logging.info(f"Data loaded successfully with shape: {loaded_shape}")
    return frame

In [None]:

# Load dataset
# Use a relative path, resolved from the working directory in the same way as
# the '../logs' and '../images' outputs in this notebook, instead of an
# absolute E:\ path that only exists on one machine.
# NOTE(review): assumes the notebook runs one level below the project root
# that contains data/MLE-Assignment.csv - confirm before merging.
data_path = '../data/MLE-Assignment.csv'
df = load_data(data_path)


In [None]:
# Display basic info
# DataFrame.info() prints its summary and returns None, so logging its return
# value records the literal text "None". Capture the summary in a text buffer
# and log that text instead.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
info_text = info_buffer.getvalue()

logging.info("Dataset Information:")
logging.info(info_text)

# Keep the summary visible in the notebook output, as df.info() did before.
print(info_text)

In [None]:

# Check for missing values
# Count nulls per column, then log only the columns that have at least one.
missing_values = df.isna().sum()
has_missing = missing_values > 0
logging.info("\nMissing values per column:")
logging.info(missing_values.loc[has_missing])

In [None]:
# Distribution of DON concentration
# Histogram (30 bins) with a KDE overlay, drawn on an explicit Axes and
# written to disk before being shown.
fig_hist, ax_hist = plt.subplots(figsize=(8, 6))
sns.histplot(
    data=df,
    x='vomitoxin_ppb',
    bins=30,
    kde=True,
    color='blue',
    ax=ax_hist,
)
ax_hist.set_title("Distribution of DON Concentration")
ax_hist.set_xlabel("DON Concentration (ppb)")
ax_hist.set_ylabel("Frequency")
fig_hist.savefig('../images/don_distribution.png')
plt.show()

In [None]:
# Boxplot to identify outliers
# Horizontal boxplot of the target column, drawn on an explicit Axes and
# written to disk before being shown.
fig_box, ax_box = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='vomitoxin_ppb', color='orange', ax=ax_box)
ax_box.set_title("Boxplot of DON Concentration")
ax_box.set_xlabel("DON Concentration (ppb)")
fig_box.savefig('../images/don_boxplot.png')
plt.show()

In [None]:

# Correlation heatmap (for a subset of bands)
# Columns at positions 1..20 are used; position 0 is skipped.
# NOTE(review): presumably column 0 is a sample identifier and the following
# columns are wavelength bands - confirm against the CSV schema.
band_corr = df.iloc[:, 1:21].corr()

fig_corr, ax_corr = plt.subplots(figsize=(10, 8))
sns.heatmap(band_corr, cmap='coolwarm', annot=False, ax=ax_corr)
ax_corr.set_title("Correlation Heatmap of First 20 Wavelength Bands")
fig_corr.savefig('../images/correlation_heatmap.png')
plt.show()