# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow import keras
from tensorflow.keras import layers


# 1. Data Ingestion & Merging
#
# Reads the raw CSV files from the working directory, restricts the training
# rows to store 1, and left-joins the per-date oil price and the per-store
# metadata onto them. The result is the master dataframe `df`.

print("Loading datasets...")
# Load CSV files (Ensure these are in your working directory)
df_train = pd.read_csv('train.csv')
df_stores = pd.read_csv('stores.csv')
df_oil = pd.read_csv('oil.csv')
df_holidays = pd.read_csv('holidays_events.csv')


# Convert date columns to datetime objects so both sides of the 'date' join
# share the same dtype.
df_train['date'] = pd.to_datetime(df_train['date'])
df_oil['date'] = pd.to_datetime(df_oil['date'])


# We filter for Store Number 1 to make this run efficiently on CPU.
# The filter is applied BEFORE the joins so that only this store's rows are
# merged; joining the full training table first and discarding the other
# stores afterwards yields the same rows but does far more work.
df = df_train[df_train['store_nbr'] == 1]

# Merge data to create one master dataframe.
# Left joins keep every training row, in its original order, even when the
# right-hand table has no matching key.
# Merge with Oil prices
df = pd.merge(df, df_oil, on='date', how='left')
# Merge with Stores data
df = pd.merge(df, df_stores, on='store_nbr', how='left')
# `merge` returns a new frame with a fresh 0..n-1 index, so `df` can be
# modified below without touching `df_train`.

# Imputation: Fill missing oil values (Forward Fill then Backward Fill).
# Forward fill carries the value from the preceding row; backward fill then
# covers any gap left at the very start of the frame.
# NOTE(review): this relies on the rows being in chronological order, which is
# presumably how train.csv is sorted -- confirm.
df['dcoilwtico'] = df['dcoilwtico'].ffill().bfill()

print(f"Data Loaded. Shape: {df.shape}")



# 2. Exploratory Data Analysis (EDA)

# 2.1 Sales over time
# Add up the sales of all rows that share a date, then draw those daily
# totals as a single line on a dedicated figure.
daily_sales = df.groupby('date')['sales'].sum()

fig, ax = plt.subplots(figsize=(12, 6))
daily_sales.plot(ax=ax, color='purple')
ax.set_title('Total Sales Over Time (Store 1)')
ax.set_ylabel('Sales')
ax.set_xlabel('Date')
# Light dashed grid to make the values easier to read off.
ax.grid(True, linestyle='--', alpha=0.6)
plt.show()

# 2.2 Distribution of Sales
# Histogram of the raw per-row `sales` values, split into 50 bins.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['sales'], bins=50, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Sales Target')
ax.set_xlabel('Sales')
ax.set_ylabel('Frequency')
plt.show()

# 2.3 Sales by Product Family
# Total sales per family, keeping the ten largest totals in descending order.
family_totals = df.groupby('family')['sales'].sum()
top_families = family_totals.sort_values(ascending=False).head(10)

# Horizontal bars: totals on the x-axis, family names on the y-axis.
# NOTE(review): newer seaborn releases (0.13+) appear to warn when `palette`
# is given without `hue`; revisit once the minimum seaborn version is known.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=top_families.values,
    y=top_families.index,
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 10 Product Families by Sales')
plt.show()