In [None]:
import pandas as pd
barstov = pd.read_csv('./Capstone Project/Data/barstov_merged.csv', sep=",")

In [20]:
barstov.customer_id.nunique()

1346520

In [21]:
barstov.article_id.nunique()

104513

In [27]:
barstov.article_id.count()

np.int64(28682331)

In [34]:
model_data_week.total_units_sold.sum()

np.int64(28682331)

In [6]:
import pandas as pd

# Step 1: Create `barstov_merged_weekly`
barstov['week'] = pd.to_datetime(barstov['t_dat']).dt.to_period('W').dt.start_time  # Truncate to start of week (Monday)
barstov_merged_weekly = barstov[[
    'product_type_no',
    'product_type_name',
    'colour_group_code',
    'colour_group_name',
    'week',
    'price',
    'article_id',
    'customer_id'
]].copy()

# Step 2: Create indexes (optional in pandas)
# pandas does not have database-style indexing, but you can sort by these columns for performance
barstov_merged_weekly.sort_values(by=['product_type_no', 'colour_group_code', 'week'], inplace=True)

# Step 3: Aggregate data into `model_data_week`
model_data_week = barstov_merged_weekly.groupby(
    ['product_type_no', 'product_type_name', 'colour_group_code', 'colour_group_name', 'week']
).agg(
    average_price=('price', 'mean'),
    total_units_sold=('article_id', 'count'),
    unique_customers=('customer_id', 'nunique')
).reset_index()

# Optional: Display the result
print(model_data_week)


        product_type_no product_type_name  colour_group_code  \
0                    -1           Unknown                  6   
1                    -1           Unknown                  6   
2                    -1           Unknown                  6   
3                    -1           Unknown                  6   
4                    -1           Unknown                  6   
...                 ...               ...                ...   
172972              762       Zipper head                  9   
172973              762       Zipper head                  9   
172974              762       Zipper head                  9   
172975              762       Zipper head                  9   
172976              762       Zipper head                  9   

       colour_group_name       week  average_price  total_units_sold  \
0             Light Grey 2019-03-25       0.033529               174   
1             Light Grey 2019-04-01       0.031347               236   
2             L

In [9]:
model_data_week.to_csv('model_data_week_correct.csv')

In [8]:
model_data_week['total_units_sold'].sum()

np.int64(28682331)

In [5]:
import pandas as pd
import numpy as np

# Step 1: Basic aggregation to create the product_analysis DataFrame
product_analysis = (
    barstov.groupby([
        'article_id', 'product_type_no', 'product_type_name', 'product_group_name',
        'colour_group_code', 'colour_group_name', 'index_code', 'index_name',
        'section_name', 'garment_group_no'
    ])
    .agg(
        unique_customers=('customer_id', 'nunique'),  # Unique customers per product
        total_sales=('price', 'sum'),                # Total sales amount for each product
        average_price=('price', 'mean')              # Average price per product
    )
    .reset_index()
)

# Step 2: Add month, week, and units_sold columns
# Extract month and week from the original barstov DataFrame
barstov['month'] = pd.to_datetime(barstov['t_dat']).dt.to_period('M').dt.start_time
barstov['week'] = pd.to_datetime(barstov['t_dat']).dt.to_period('W').dt.start_time

# Aggregating units_sold for each article_id
aggregated_data = (
    barstov.groupby(['article_id', 'month', 'week'])
    .size()  # Count occurrences (units sold)
    .reset_index(name='units_sold')
)

# Merge aggregated data into product_analysis
product_analysis = product_analysis.merge(
    aggregated_data[['article_id', 'month', 'week', 'units_sold']],
    on='article_id',
    how='left'
)

# Step 3: Calculate and update average customer age
avg_customer_age = (
    barstov.groupby('article_id')
    .agg(avg_customer_age=('age', 'mean'))  # Average customer age per article_id
    .reset_index()
)

# Merge average customer age into product_analysis
product_analysis = product_analysis.merge(
    avg_customer_age,
    on='article_id',
    how='left'
)

# Step 4: Update postal code for each article, ensuring uniqueness
postal_code_data = (
    barstov.groupby('article_id')
    .agg(postal_code=('postal_code', 'first'))  # Assuming one postal code per article
    .reset_index()
)

# Merge postal code data into product_analysis
product_analysis = product_analysis.merge(
    postal_code_data,
    on='article_id',
    how='left'
)

# Optional: Fill NaN values if needed
product_analysis.fillna({'units_sold': 0, 'avg_customer_age': np.nan, 'postal_code': 'Unknown'}, inplace=True)
product_analysis.to_csv('product_analysis_correct.csv')

# Output the resulting product_analysis DataFrame
print(product_analysis)


         article_id  product_type_no product_type_name  product_group_name  \
0         108775015              253          Vest top  Garment Upper body   
1         108775015              253          Vest top  Garment Upper body   
2         108775015              253          Vest top  Garment Upper body   
3         108775015              253          Vest top  Garment Upper body   
4         108775015              253          Vest top  Garment Upper body   
...             ...              ...               ...                 ...   
2423206   953763001              253          Vest top  Garment Upper body   
2423207   953763001              253          Vest top  Garment Upper body   
2423208   956217002              265             Dress   Garment Full body   
2423209   956217002              265             Dress   Garment Full body   
2423210   956217002              265             Dress   Garment Full body   

         colour_group_code colour_group_name index_code  index_

In [36]:
product_analysis.article_id.nunique()

104513

In [38]:
product_analysis.units_sold.sum()

np.int64(28682331)

In [37]:
product_analysis.head(1)

Unnamed: 0,article_id,product_type_no,product_type_name,product_group_name,colour_group_code,colour_group_name,index_code,index_name,section_name,garment_group_no,unique_customers,total_sales,average_price,month,week,units_sold,avg_customer_age,postal_code
0,108775015,253,Vest top,Garment Upper body,9,Black,A,Ladieswear,Womens Everyday Basics,1002,6828,60.603627,0.008109,2018-09-01,2018-09-17,80,34.316296,e15819a7b5a739bde5002a217d27e049695afb433924ce...


In [39]:
import pandas as pd

# Step 1: Create the seasonality_analysis DataFrame with basic aggregation
barstov['sale_date'] = pd.to_datetime(barstov['t_dat']).dt.date  # Convert to date format for daily analysis
barstov['day'] = pd.to_datetime(barstov['t_dat']).dt.day         # Extract day
barstov['week'] = pd.to_datetime(barstov['t_dat']).dt.isocalendar().week  # Extract week
barstov['month'] = pd.to_datetime(barstov['t_dat']).dt.to_period('M').dt.start_time  # Truncate to month

seasonality_analysis = (
    barstov.groupby([
        'sale_date', 'day', 'week', 'month', 'article_id', 'product_type_name', 'product_group_name'
    ])
    .agg(
        total_sales=('price', 'sum'),           # Sum of sales
        total_orders=('customer_id', 'count'), # Count of orders
        average_price=('price', 'mean')        # Average price
    )
    .reset_index()
)

# Step 2: Add and populate the `season` column
seasonality_analysis['season'] = seasonality_analysis['month'].dt.month.map(
    lambda x: 'Winter' if x in [12, 1, 2] else
              'Spring' if x in [3, 4, 5] else
              'Summer' if x in [6, 7, 8] else
              'Fall'
)

# Step 3: Add a customer_id column (default value, as not part of initial aggregation)
seasonality_analysis['customer_id'] = None  # Placeholder column for now

# Step 4: Add postal_code for each customer by merging with aggregated postal code data
postal_code_data = (
    barstov.groupby('customer_id')
    .agg(postal_code=('postal_code', 'first'))  # Assuming one postal code per customer
    .reset_index()
)

# Merge postal code information into seasonality_analysis
seasonality_analysis = seasonality_analysis.merge(
    barstov[['customer_id', 'postal_code']],
    on='customer_id',
    how='left'
)

# Fill missing values for postal_code (if any)
seasonality_analysis['postal_code'].fillna('Unknown', inplace=True)

# Optional: Sort the DataFrame by sale_date
seasonality_analysis.sort_values(by='sale_date', inplace=True)

# Display the resulting DataFrame
print(seasonality_analysis)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  seasonality_analysis['postal_code'].fillna('Unknown', inplace=True)


          sale_date  day  week      month  article_id product_type_name  \
0        2018-09-20   20    38 2018-09-01   108775015          Vest top   
7668     2018-09-20   20    38 2018-09-01   636392002               Bra   
7669     2018-09-20   20    38 2018-09-01   636407001          Vest top   
7670     2018-09-20   20    38 2018-09-01   636407002          Vest top   
7671     2018-09-20   20    38 2018-09-01   636418001             Skirt   
...             ...  ...   ...        ...         ...               ...   
7420457  2020-09-22   22    39 2020-09-01   796556003           T-shirt   
7420456  2020-09-22   22    39 2020-09-01   796454005               Bra   
7420455  2020-09-22   22    39 2020-09-01   796241001             Dress   
7420468  2020-09-22   22    39 2020-09-01   797529001  Underwear bottom   
7425766  2020-09-22   22    39 2020-09-01   953763001          Vest top   

         product_group_name  total_sales  total_orders  average_price season  \
0        Garment Up

In [40]:
seasonality_analysis.article_id.nunique()


104513

In [41]:
seasonality_analysis.total_orders.sum()

np.int64(28682331)

In [10]:
import pandas as pd

# Step 1: Filter the DataFrame for black trousers
black_trousers = barstov[
    (barstov['product_type_name'] == 'Trousers') & 
    (barstov['colour_group_name'] == 'Black')
].copy()

# Step 2: Add necessary columns for weekly aggregation and month extraction
black_trousers['week'] = pd.to_datetime(black_trousers['t_dat']).dt.to_period('W').dt.start_time  # Truncate to weekly level
black_trousers['month'] = pd.to_datetime(black_trousers['t_dat']).dt.month  # Extract the month

# Step 3: Perform the aggregation
black_trousers_forecast = (
    black_trousers.groupby(['article_id', 'week', 'price', 'product_type_no', 'colour_group_code', 'month'])
    .agg(
        units_sold=('article_id', 'count'),  # Count rows as units sold
        average_price=('price', 'mean')     # Average price
    )
    .reset_index()
)

# Step 4: Scale the average price by a factor of 10
black_trousers_forecast['average_price'] *= 10

# Display the resulting DataFrame
print(black_trousers_forecast)


        article_id       week     price  product_type_no  colour_group_code  \
0        118458028 2018-09-17  0.033881              272                  9   
1        118458028 2018-09-24  0.010153              272                  9   
2        118458028 2018-09-24  0.027102              272                  9   
3        118458028 2018-09-24  0.028797              272                  9   
4        118458028 2018-09-24  0.033881              272                  9   
...            ...        ...       ...              ...                ...   
312967   949198001 2020-09-14  0.024814              272                  9   
312968   949198001 2020-09-14  0.025407              272                  9   
312969   949198001 2020-09-21  0.021593              272                  9   
312970   949198001 2020-09-21  0.025220              272                  9   
312971   949198001 2020-09-21  0.025407              272                  9   

        month  units_sold  average_price  
0       

In [11]:
black_trousers_forecast.units_sold.sum()

np.int64(1307832)

In [13]:
black_trousers_forecast.to_csv('black_throusers_forecast_correct.csv')