In [None]:
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
import os
from dotenv import dotenv_values

from sqlalchemy import create_engine, types
from sqlalchemy import text # to be able to pass string

In [None]:

# Read the key/value pairs that dotenv_values() returns (no path is passed here).
config = dotenv_values()

# Connection settings; the key labels must match the ones used in your .env file.
# Lookup order is kept so a missing key raises the same KeyError as before.
pg_user, pg_host, pg_port, pg_db, pg_schema, pg_pass = (
    config[env_key]
    for env_key in (
        'POSTGRES_USER',
        'POSTGRES_HOST',
        'POSTGRES_PORT',
        'POSTGRES_DB',
        'POSTGRES_SCHEMA',
        'POSTGRES_PASS',
    )
)

# NOTE(review): the values are interpolated verbatim, so a user name or password
# containing URL-reserved characters (e.g. '@', ':', '/') has to be percent-encoded
# in the .env file already.
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'


In [None]:
from sqlalchemy import event

# Schema that unqualified table names in later queries should resolve against.
my_schema = 'capstone_barstov_industries'

engine = create_engine(url, echo=False)


@event.listens_for(engine, 'connect', insert=True)
def _set_search_path(dbapi_connection, connection_record):
    """Set ``search_path`` to ``my_schema`` on every new DBAPI connection.

    A one-off ``SET search_path`` only changes the session of the single
    connection it ran on. The engine hands out pooled connections, so a later
    ``engine.begin()`` is not guaranteed to get that same session. Doing it in
    a ``connect`` listener applies the setting to every connection the pool
    opens.
    """
    # Run the SET outside a transaction so it is not undone by a rollback,
    # then restore whatever autocommit mode the driver had.
    previous_autocommit = dbapi_connection.autocommit
    dbapi_connection.autocommit = True
    cursor = dbapi_connection.cursor()
    try:
        cursor.execute(f'SET SESSION search_path TO {my_schema}')
    finally:
        cursor.close()
        dbapi_connection.autocommit = previous_autocommit


# Open one connection right away so connection problems surface in this cell
# rather than in the first query cell.
with engine.connect() as conn:
    conn.execute(text('SELECT 1'))


In [None]:
# Pull the whole product_analysis relation into memory.
with engine.begin() as conn:
    result = conn.execute(text('SELECT * FROM product_analysis;'))
    # Capture the column names from the result itself so the DataFrame does not
    # depend on how pandas introspects SQLAlchemy Row objects. This also keeps
    # the headers when the query returns no rows.
    column_names = list(result.keys())
    data = result.all()

# Build a DataFrame with explicit column labels; later cells select columns by name.
product_analysis = pd.DataFrame(data, columns=column_names)

# Preview only the first rows instead of rendering the full table.
product_analysis.head()

In [None]:
# Total units sold for every (product type, colour group, garment group) combination.
demand_keys = ['product_type_no', 'colour_group_code', 'garment_group_no']
product_data = (
    product_analysis
    .groupby(demand_keys)[['units_sold']]
    .sum()            # aggregated demand per combination
    .reset_index()    # turn the grouping keys back into ordinary columns
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns used as clustering input: the aggregated demand plus the three code columns.
feature_columns = ['units_sold', 'product_type_no', 'colour_group_code', 'garment_group_no']
features = product_data[feature_columns]

# Fit the scaler on the selected columns, then transform those same columns.
scaler = StandardScaler()
scaler.fit(features)
scaled_data = scaler.transform(features)

In [None]:
import hdbscan

# Cluster the scaled feature matrix with HDBSCAN using the chosen parameters.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=5)
hdbscan_clusterer.fit(scaled_data)
product_clusters = hdbscan_clusterer.labels_

# Store the label of each row next to the aggregated product data
# (rows of scaled_data and product_data are in the same order).
product_data['cluster'] = product_clusters

In [None]:
import numpy as np

# Count how many rows received each cluster label and show the mapping label -> count.
unique, counts = np.unique(product_clusters, return_counts=True)
cluster_distribution = {label: size for label, size in zip(unique, counts)}
print("Cluster distribution:", cluster_distribution)


In [None]:
# Per-cluster mean of every column in product_data (demand and the code columns).
per_cluster = product_data.groupby('cluster')
cluster_summary = per_cluster.mean()
print(cluster_summary)

In [None]:
# Aggregate data by cluster and month for ARIMA.
#
# The cluster labels were assigned to the aggregated `product_data` frame, not to
# the row-level `product_analysis` frame, so they are first joined back onto the
# sales rows through the three columns that `product_data` was grouped by.
cluster_keys = ['product_type_no', 'colour_group_code', 'garment_group_no']
sales_with_cluster = (
    product_analysis
    # Drop a pre-existing label column (if any) so the merge cannot produce
    # cluster_x / cluster_y duplicates.
    .drop(columns='cluster', errors='ignore')
    .merge(
        product_data[cluster_keys + ['cluster']],
        on=cluster_keys,
        how='inner',               # rows without a label cannot be assigned to a series
        validate='many_to_one',    # each key combination must map to exactly one cluster
    )
)

# `.dt` needs a datetime dtype; coerce explicitly because the column may hold
# plain date objects or strings. Named 'month' so the output column is labelled
# directly.
sale_month = (
    pd.to_datetime(sales_with_cluster['t_date'])
    .dt.to_period('M')
    .rename('month')
)

# Monthly units sold per cluster; columns: cluster, month, units_sold.
cluster_time_data = (
    sales_with_cluster
    .groupby(['cluster', sale_month])['units_sold']
    .sum()
    .reset_index()
)

# Convert period to datetime format for ARIMA compatibility
# (each period becomes the timestamp at the start of that month).
cluster_time_data['month'] = cluster_time_data['month'].dt.to_timestamp()


In [None]:
# Map each cluster label to its own monthly series: a one-column frame
# ('units_sold') indexed by 'month'.
cluster_series_dict = dict(
    (cluster_label, cluster_rows.set_index('month')[['units_sold']])
    for cluster_label, cluster_rows in cluster_time_data.groupby('cluster')
)


In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Example (p, d, q) order applied to every cluster (adjust as needed).
arima_order = (1, 1, 1)

# Maps cluster label -> summary of the fitted ARIMA model.
arima_results = {}

# Loop through each cluster and fit ARIMA.
for cluster, series in cluster_series_dict.items():
    # Put the series on a regular monthly grid, filling missing months with 0.
    # The index holds month-START timestamps (produced by Period.to_timestamp()),
    # so the grid must use 'MS'. A month-END frequency ('M') would build a grid
    # on which none of the existing rows fall, replacing every observation with
    # the fill value.
    series = series.asfreq('MS', fill_value=0)

    # Best-effort fit: a failure for one cluster is reported and the loop moves on.
    try:
        model = ARIMA(series['units_sold'], order=arima_order)
        arima_fit = model.fit()

        # Save the model summary (or store arima_fit.forecast(steps=12) for predictions).
        arima_results[cluster] = arima_fit.summary()
        print(f"ARIMA model fit for Cluster {cluster}")

    except Exception as e:
        print(f"Could not fit ARIMA for Cluster {cluster}: {e}")
