In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Notebook-wide display configuration: never truncate the column list, allow
# wide console output, and apply the 'seaborn-v0_8' matplotlib style sheet.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
plt.style.use('seaborn-v0_8')

# Location of the raw CSV, relative to the notebook's working directory.
data_path = "../data/raw data/dambulla_daily_vegetable_prices_2010_2025 (1).csv"

# Read with pandas' default encoding first; only when that attempt raises a
# UnicodeDecodeError is the file re-read as latin-1. Any other error propagates.
try:
    df = pd.read_csv(data_path)
except UnicodeDecodeError:
    df = pd.read_csv(data_path, encoding='latin-1')
    print("Data loaded successfully with latin-1 encoding")
else:
    print("Data loaded successfully with default encoding")

# Size report: (rows, columns) and the deep memory footprint in mebibytes.
print(f"Data shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
# Structural preview of the loaded frame: head, info report, column names and
# dtypes, each followed by an 80-character separator line.

# Display first few rows
print("First 10 rows of the dataset:")
print(df.head(10))
print("\n" + "="*80 + "\n")

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called bare: wrapping it in print() appends a stray "None" line.
print("Data Information:")
df.info()
print("\n" + "="*80 + "\n")

# Check column names
print("Column names:")
print(df.columns.tolist())
print("\n" + "="*80 + "\n")

# Check data types
print("Data types:")
print(df.dtypes)
print("\n" + "="*80 + "\n")

In [None]:
# Summary statistics as produced by DataFrame.describe() with its defaults.
print("Descriptive Statistics for Numerical Columns:")
print(df.describe())
print("\n" + "="*80 + "\n")

# For every object-dtype column: number of distinct values and the five most
# frequent values with their counts.
print("Descriptive Statistics for Categorical Columns:")
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    column_values = df[col]
    print(f"\n{col}:")
    print(f"Unique values: {column_values.nunique()}")
    top_five = column_values.value_counts().head().to_dict()
    print(f"Top 5 values: {top_five}")

print("\n" + "="*80 + "\n")

In [None]:
# Per-column null counts and their share of the row count, followed by the
# overall totals across every cell of the frame.
print("Missing Values Analysis:")
missing_data = df.isnull().sum()
missing_percentage = (missing_data / len(df)) * 100

# Two-column report; only columns that actually contain nulls are printed.
missing_df = pd.DataFrame({
    'Missing_Count': missing_data,
    'Percentage': missing_percentage
})
has_missing = missing_df['Missing_Count'] > 0
print(missing_df.loc[has_missing])

# The grand total is the sum of the per-column counts computed above;
# df.size is rows * columns, i.e. the total number of cells.
print(f"\nTotal missing values: {missing_data.sum()}")
print(f"Percentage of missing values overall: {(missing_data.sum() / df.size) * 100:.2f}%")

print("\n" + "="*80 + "\n")

In [None]:
def _plot_count_bar(counts, title, xlabel, color, figsize):
    """Draw and show one bar chart of record counts.

    Parameters:
        counts: Series whose index supplies the bar labels and whose values
            supply the bar heights.
        title: Figure title.
        xlabel: Label for the x axis (the y axis is always 'Count').
        color: Fill colour of the bars.
        figsize: (width, height) passed to plt.figure.
    """
    plt.figure(figsize=figsize)
    counts.plot(kind='bar', color=color, edgecolor='black')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()


# Convert Date to datetime (dates are expected in '2010-01-01' format)
print("Original Date dtype:", df['Date'].dtype)
print("First few Date values:")
print(df['Date'].head())

# errors='coerce' turns values that do not match the format into NaT instead
# of raising, so bad rows are counted below rather than aborting the cell.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')
print(f"\nAfter conversion - Date dtype: {df['Date'].dtype}")

# Every NaT is counted here, including values that were already missing
# before the conversion.
null_dates = df['Date'].isnull().sum()
print(f"Failed date conversions: {null_dates}")
if null_dates > 0:
    print("Sample of rows with invalid dates:")
    print(df[df['Date'].isnull()].head())

# Analyze date range
first_date = df['Date'].min()
last_date = df['Date'].max()
print(f"\nDate range: {first_date} to {last_date}")
# +1 because both the first and the last date are days covered by the data
# (the daily date_range built further down has exactly this many entries).
print(f"Total days in dataset: {(last_date - first_date).days + 1} days")

# Create temporal features
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['DayOfWeek'] = df['Date'].dt.day_name()
df['MonthName'] = df['Date'].dt.month_name()

print("\nTemporal Distribution:")
print("\nYearly distribution:")
year_counts = df['Year'].value_counts().sort_index()
print(year_counts)

# Plot yearly distribution
_plot_count_bar(year_counts, 'Number of Records by Year', 'Year',
                color='skyblue', figsize=(10, 5))

print("\nMonthly distribution (across all years):")
month_counts = df['Month'].value_counts().sort_index()
print(month_counts)

# Map month numbers to names for better readability
month_names = {
    1: 'January', 2: 'February', 3: 'March', 4: 'April',
    5: 'May', 6: 'June', 7: 'July', 8: 'August',
    9: 'September', 10: 'October', 11: 'November', 12: 'December'
}
print("\nMonthly distribution with names:")
for month_num, count in month_counts.items():
    print(f"{month_names.get(month_num, 'Unknown')} ({month_num}): {count}")

# Plot monthly distribution. rename() returns a relabelled copy (labels not
# in the mapping are kept as-is) instead of editing the index in place.
month_counts = month_counts.rename(index=month_names)
_plot_count_bar(month_counts, 'Number of Records by Month (Across All Years)', 'Month',
                color='lightcoral', figsize=(12, 5))

print("\nDay of week distribution:")
day_counts = df['DayOfWeek'].value_counts()
print(day_counts)

# Reorder days for better visualization. fill_value=0 keeps the counts
# integral and shows a weekday with no records as 0 rather than NaN.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_counts = day_counts.reindex(day_order, fill_value=0)

# Plot day of week distribution
_plot_count_bar(day_counts, 'Number of Records by Day of Week', 'Day of Week',
                color='lightgreen', figsize=(8, 5))

# Additional temporal insights
print("\nAdditional Temporal Insights:")
print(f"\nEarliest date: {first_date}")
print(f"Latest date: {last_date}")
print(f"Total unique dates: {df['Date'].nunique()}")

# Check for date gaps: every calendar day between the first and last date
# that has no record at all.
date_range = pd.date_range(start=first_date, end=last_date, freq='D')
missing_dates = date_range.difference(df['Date'])
print(f"Missing dates in the range: {len(missing_dates)}")
if len(missing_dates) > 0:
    print("Sample of missing dates:")
    print(missing_dates[:10])

# Monthly records per year
print("\nMonthly records per year (pivot table):")
year_month_pivot = pd.crosstab(df['Year'], df['Month'])
print(year_month_pivot)

# Heatmap of records by year and month
plt.figure(figsize=(12, 8))
sns.heatmap(year_month_pivot, annot=True, fmt='d', cmap='YlOrRd',
            linewidths=0.5, linecolor='gray')
plt.title('Number of Records by Year and Month')
plt.xlabel('Month')
plt.ylabel('Year')
plt.tight_layout()
plt.show()

print("\n" + "="*80 + "\n")

In [None]:
# Frequency overview of the Vegetable_Name column: distinct count, the ten
# most frequent names as text, and the fifteen most frequent as a bar chart.
print("Vegetable Name Analysis:")
print(f"Total unique vegetables: {df['Vegetable_Name'].nunique()}")
vegetable_counts = df['Vegetable_Name'].value_counts()
print("\nTop 10 most frequent vegetables:")
print(vegetable_counts.head(10))

print("\nVegetable distribution (top 15):")
plt.figure(figsize=(12, 6))
vegetable_counts.head(15).plot(kind='bar')
plt.title('Top 15 Vegetable Frequencies')
plt.xlabel('Vegetable Name')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Value counts for the two remaining categorical columns, Season first and
# Supply_Status second.
for heading, column_name in (
    ("\nSeason Analysis:", 'Season'),
    ("\nSupply Status Analysis:", 'Supply_Status'),
):
    print(heading)
    print(df[column_name].value_counts())

print("\n" + "="*80 + "\n")

In [None]:
# The three measure columns examined in this cell.
numerical_cols = ['Daily_Arrival_MT', 'Estimated_Sales_MT', 'Wholesale_Price_Rs_kg']

# One histogram per column (nulls dropped), each with a dashed red line at
# the column mean.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, col in zip(axes, numerical_cols):
    column_mean = df[col].mean()
    ax.hist(df[col].dropna(), bins=50, alpha=0.7, edgecolor='black')
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.axvline(column_mean, color='red', linestyle='--', label=f'Mean: {column_mean:.2f}')
    ax.legend()

plt.tight_layout()
plt.show()

# One boxplot per column (nulls dropped) for spotting outliers.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, col in zip(axes, numerical_cols):
    ax.boxplot(df[col].dropna())
    ax.set_title(f'Boxplot of {col}')
    ax.set_ylabel(col)

plt.tight_layout()
plt.show()

print("\n" + "="*80 + "\n")

In [None]:
# Pairwise correlation (DataFrame.corr defaults) between the columns listed
# in numerical_cols.
print("Correlation Matrix:")
correlation_matrix = df[numerical_cols].corr()
print(correlation_matrix)

# Annotated heatmap of the matrix, colour scale centred on zero.
plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.title('Correlation Matrix of Numerical Variables')
plt.tight_layout()
plt.show()

# Two scatter plots sharing Daily_Arrival_MT on the x axis:
# (y column, y-axis label, panel title) for each panel, left to right.
scatter_panels = (
    ('Estimated_Sales_MT', 'Estimated Sales (MT)', 'Daily Arrival vs Estimated Sales'),
    ('Wholesale_Price_Rs_kg', 'Wholesale Price (Rs/kg)', 'Daily Arrival vs Price'),
)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (y_column, y_label, panel_title) in zip(axes, scatter_panels):
    ax.scatter(df['Daily_Arrival_MT'], df[y_column], alpha=0.5)
    ax.set_xlabel('Daily Arrival (MT)')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

print("\n" + "="*80 + "\n")

In [None]:
# Seasonality / time-series overview of the three measure columns:
#   1. monthly aggregates per (Year, Month) plotted as mean +/- 1 std,
#   2. average per calendar month across all years,
#   3. average per year and year-over-year percentage change.
# Everything is skipped with a message when the frame has no Date column.
print("SEASONALITY AND TIME SERIES ANALYSIS")
print("="*80)

# Check if Date column exists and is datetime
if 'Date' in df.columns:
    print(f"Date column dtype: {df['Date'].dtype}")

    # Ensure Date is datetime. is_datetime64_any_dtype accepts every
    # datetime64 resolution, so an already-converted column is not re-parsed
    # just because its dtype is not spelled exactly 'datetime64[ns]'.
    if not pd.api.types.is_datetime64_any_dtype(df['Date']):
        df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')
        print(f"Converted Date to {df['Date'].dtype}")

    # Check if we have Year and Month columns from previous chunk
    if 'Year' not in df.columns or 'Month' not in df.columns:
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month
        print(f"Created Year and Month columns")

    # Aggregate data by month for overview
    print("\nAggregating data by month...")
    monthly_data = df.groupby(['Year', 'Month']).agg({
        'Daily_Arrival_MT': ['mean', 'sum', 'std', 'min', 'max'],
        'Estimated_Sales_MT': ['mean', 'sum', 'std', 'min', 'max'],
        'Wholesale_Price_Rs_kg': ['mean', 'std', 'min', 'max']
    }).reset_index()

    # Flatten column names (order must match the agg specification above)
    monthly_data.columns = ['Year', 'Month',
                            'Arrival_Mean', 'Arrival_Sum', 'Arrival_Std', 'Arrival_Min', 'Arrival_Max',
                            'Sales_Mean', 'Sales_Sum', 'Sales_Std', 'Sales_Min', 'Sales_Max',
                            'Price_Mean', 'Price_Std', 'Price_Min', 'Price_Max']

    print(f"\nMonthly aggregated data shape: {monthly_data.shape}")
    print("\nFirst few rows of monthly aggregated data:")
    print(monthly_data.head())

    # Create datetime index for plotting. Year/Month are floats whenever the
    # Date column contained NaT, and '2010.0-1.0' would not match '%Y-%m',
    # so both are cast to int first (groupby has already dropped NaN keys).
    monthly_data['YearMonth'] = pd.to_datetime(
        monthly_data['Year'].astype(int).astype(str)
        + '-'
        + monthly_data['Month'].astype(int).astype(str).str.zfill(2),
        format='%Y-%m'
    )

    # Sort by date
    monthly_data = monthly_data.sort_values('YearMonth')

    # Plot time series: one stacked panel per measure.
    # (column prefix in monthly_data, axis label, marker, colour)
    monthly_panels = [
        ('Arrival', 'Daily Arrival (MT)', 'o', 'blue'),
        ('Sales', 'Estimated Sales (MT)', 's', 'green'),
        ('Price', 'Wholesale Price (Rs/kg)', '^', 'red'),
    ]
    fig, axes = plt.subplots(3, 1, figsize=(16, 12))
    for ax, (prefix, axis_label, marker, color) in zip(axes, monthly_panels):
        panel_mean = monthly_data[f'{prefix}_Mean']
        panel_std = monthly_data[f'{prefix}_Std']
        ax.plot(monthly_data['YearMonth'], panel_mean,
                marker=marker, markersize=4, linewidth=2, color=color, label='Mean')
        ax.fill_between(monthly_data['YearMonth'],
                        panel_mean - panel_std,
                        panel_mean + panel_std,
                        alpha=0.2, color=color, label='±1 Std Dev')
        ax.set_title(f'Monthly Average {axis_label} with Standard Deviation',
                     fontsize=14, fontweight='bold')
        ax.set_xlabel('Date', fontsize=12)
        ax.set_ylabel(axis_label, fontsize=12)
        ax.grid(True, alpha=0.3)
        ax.legend()
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    # Shared by the per-month and per-year summaries below.
    # Source column -> name of its averaged counterpart.
    average_names = {
        'Daily_Arrival_MT': 'Avg_Arrival_MT',
        'Estimated_Sales_MT': 'Avg_Sales_MT',
        'Wholesale_Price_Rs_kg': 'Avg_Price_Rs_kg'
    }
    # (averaged column, noun used in titles, y-axis label, marker, colour)
    average_panels = [
        ('Avg_Arrival_MT', 'Daily Arrival', 'Average Arrival (MT)', 'o', 'blue'),
        ('Avg_Sales_MT', 'Estimated Sales', 'Average Sales (MT)', 's', 'green'),
        ('Avg_Price_Rs_kg', 'Wholesale Price', 'Average Price (Rs/kg)', '^', 'red'),
    ]

    # Additional analysis: Seasonal patterns
    print("\n" + "="*80)
    print("SEASONAL ANALYSIS BY MONTH")
    print("="*80)

    # Calculate average by month across all years
    seasonal_pattern = (
        df.groupby('Month')[list(average_names)]
        .mean()
        .rename(columns=average_names)
        .reset_index()
    )

    # Add month names
    month_names = {
        1: 'January', 2: 'February', 3: 'March', 4: 'April',
        5: 'May', 6: 'June', 7: 'July', 8: 'August',
        9: 'September', 10: 'October', 11: 'November', 12: 'December'
    }
    seasonal_pattern['Month_Name'] = seasonal_pattern['Month'].map(month_names)

    print("\nAverage values by month (across all years):")
    print(seasonal_pattern[['Month_Name', 'Avg_Arrival_MT', 'Avg_Sales_MT', 'Avg_Price_Rs_kg']].round(2))

    # Plot seasonal patterns
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for ax, (column, noun, y_label, _marker, color) in zip(axes, average_panels):
        ax.bar(seasonal_pattern['Month_Name'], seasonal_pattern[column],
               color=color, alpha=0.7, edgecolor='black')
        ax.set_title(f'Average {noun} by Month', fontsize=12, fontweight='bold')
        ax.set_xlabel('Month')
        ax.set_ylabel(y_label)
        ax.tick_params(axis='x', rotation=45)
        ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Year-over-Year comparison
    print("\n" + "="*80)
    print("YEAR-OVER-YEAR COMPARISON")
    print("="*80)

    # Calculate average by year
    yearly_data = (
        df.groupby('Year')[list(average_names)]
        .mean()
        .rename(columns=average_names)
        .reset_index()
    )

    print("\nAverage values by year:")
    print(yearly_data.round(2))

    # Plot year-over-year comparison
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for ax, (column, noun, y_label, marker, color) in zip(axes, average_panels):
        ax.plot(yearly_data['Year'], yearly_data[column],
                marker=marker, linewidth=2, color=color)
        ax.set_title(f'Yearly Average {noun}', fontsize=12, fontweight='bold')
        ax.set_xlabel('Year')
        ax.set_ylabel(y_label)
        ax.grid(True, alpha=0.3)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    # Summary statistics
    print("\n" + "="*80)
    print("TIME SERIES SUMMARY STATISTICS")
    print("="*80)

    # The period shown in the header is read from the aggregated data rather
    # than hard-coded, so it stays correct when the input file changes.
    period_start = monthly_data['YearMonth'].min().year
    period_end = monthly_data['YearMonth'].max().year
    print(f"\nOverall trends ({period_start}-{period_end}):")
    for name, prefix, unit in (('Arrival', 'Arrival', 'MT'),
                               ('Sales', 'Sales', 'MT'),
                               ('Price', 'Price', 'Rs/kg')):
        monthly_means = monthly_data[f'{prefix}_Mean']
        # name is left-aligned to 7 characters so the three lines line up.
        print(f"{name:<7} - Mean: {monthly_means.mean():.2f} {unit}, "
              f"Range: {monthly_means.min():.2f} to {monthly_means.max():.2f} {unit}")

    # Calculate year-over-year changes
    yearly_data['Arrival_Change'] = yearly_data['Avg_Arrival_MT'].pct_change() * 100
    yearly_data['Sales_Change'] = yearly_data['Avg_Sales_MT'].pct_change() * 100
    yearly_data['Price_Change'] = yearly_data['Avg_Price_Rs_kg'].pct_change() * 100

    print("\nYear-over-Year Percentage Changes (where available):")
    # The first row has no previous year to compare against; it is skipped by
    # position so the loop does not depend on the index labels.
    for _, row in yearly_data.iloc[1:].iterrows():
        print(f"{int(row['Year'])}: Arrival: {row['Arrival_Change']:+.1f}%, Sales: {row['Sales_Change']:+.1f}%, Price: {row['Price_Change']:+.1f}%")

else:
    print("Date column not found in the dataset")

print("\n" + "="*80 + "\n")

In [None]:
# Summary statistics and boxplots of each column in numerical_cols, grouped
# by Supply_Status. Only the header is printed when that column is absent.
print("Statistics by Supply Status:")
if 'Supply_Status' in df.columns:
    summary_functions = ['mean', 'median', 'std', 'min', 'max']
    for col in numerical_cols:
        print(f"\n{col} by Supply Status:")
        stats_by_status = df.groupby('Supply_Status')[col].agg(summary_functions)
        print(stats_by_status)

    # One boxplot panel per measure, split by supply status.
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, col in zip(axes, numerical_cols):
        df.boxplot(column=col, by='Supply_Status', ax=ax)
        ax.set_title(f'{col} by Supply Status')
        ax.set_ylabel(col)
        ax.set_xlabel('Supply Status')

    # Blank out the figure-level title so only the per-panel titles remain.
    plt.suptitle('')
    plt.tight_layout()
    plt.show()

print("\n" + "="*80 + "\n")

In [None]:
# Fully identical rows: absolute count and share of all rows.
print("Duplicate Analysis:")
total_duplicates = df.duplicated().sum()
print(f"Total duplicate rows: {total_duplicates}")
print(f"Percentage of duplicates: {(total_duplicates / len(df)) * 100:.2f}%")

# Rows repeating an earlier (Date, Vegetable_Name) pair, checked only when
# both columns are present in the frame.
if {'Date', 'Vegetable_Name'}.issubset(df.columns):
    pair_columns = ['Date', 'Vegetable_Name']
    date_veg_duplicates = df.duplicated(subset=pair_columns).sum()
    print(f"\nDuplicate Date-Vegetable combinations: {date_veg_duplicates}")
    if date_veg_duplicates > 0:
        print("\nSample duplicate entries:")
        # keep=False marks every member of a duplicated group, so the sample
        # shows the first occurrence alongside its repeats.
        in_duplicate_group = df.duplicated(subset=pair_columns, keep=False)
        duplicates = df[in_duplicate_group]
        print(duplicates.head(10))

print("\n" + "="*80 + "\n")