In [None]:
import os
from itertools import product

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
# Load the raw CSV, parse FISCAL_DATE into datetimes, and build a date-indexed
# working frame. `base` keeps the flat (un-indexed) table; `basic` is the
# FISCAL_DATE-indexed view used by the rest of the notebook.
file_path = r'/Users/behnam/Library/CloudStorage/OneDrive-McMasterUniversity/Works/data_for_macc_without_index_names_2024- revised.csv'
base = pd.read_csv(file_path).assign(
    FISCAL_DATE=lambda frame: pd.to_datetime(frame['FISCAL_DATE'])
)
basic = base.set_index('FISCAL_DATE')
basic

In [None]:
# (rows, columns) of the date-indexed frame at this point in the notebook.
basic.shape

In [None]:
# Collect every row that has at least one NaN in any column and print them.
row_has_nan = basic.isna().any(axis=1)
missing_values = basic.loc[row_has_nan]
print("Rows with NaN values:\n", missing_values)

In [None]:
# Drop every row containing at least one NaN. `basic` is rebound here, so the
# unfiltered rows remain available only through `base`.
basic = basic.dropna()

In [None]:
# (rows, columns) of `basic` at this point, for comparison with any earlier shape output.
basic.shape

In [None]:
# Normalise the headers: remove leading/trailing whitespace from every column name.
basic.columns = basic.columns.str.strip()

# Floor negative amounts at zero.
# The lookups must use the *stripped* names: after the strip above a header
# with a leading space (the original code used ' Revenue') can no longer exist
# and indexing it raises KeyError.
# NOTE(review): assumes the stripped header is exactly 'Revenue' — TODO confirm
# against the CSV (adjust here if it is e.g. '$ Revenue').
for amount_col in ('$ Gross booking Value', 'Revenue'):
    basic.loc[basic[amount_col] < 0, amount_col] = 0
basic

In [None]:
# Line plot of the '$ Profit' column against the FISCAL_DATE index.
profit_ax = basic['$ Profit'].plot(figsize=(15, 5))
profit_ax.set_title('Time Series Plot of Profit')
profit_ax.set_xlabel('Date')
profit_ax.set_ylabel('Values')
plt.show()

In [None]:
# Year x Month heatmap of mean '$ Profit'.
# `assign` returns a new frame, so `basic` itself is left without the
# Year/Month helper columns.
df_seasonal = basic.assign(Year=basic.index.year, Month=basic.index.month)

pivot_table = df_seasonal.pivot_table(
    index='Year',
    columns='Month',
    values='$ Profit',
    aggfunc='mean',
)

heatmap_ax = sns.heatmap(pivot_table, annot=True, fmt=".1f", linewidths=0.5)
heatmap_ax.set_title('Seasonal Subseries Plot')
heatmap_ax.set_xlabel('Month')
heatmap_ax.set_ylabel('Year')
plt.show()

In [None]:
# Split the rows into four groups by the calendar month of the index:
# Mar-May, Jun-Aug, Sep-Nov, and Dec-Feb.
row_month = basic.index.month
spring_data = basic[row_month.isin([3, 4, 5])]
summer_data = basic[row_month.isin([6, 7, 8])]
autumn_data = basic[row_month.isin([9, 10, 11])]
winter_data = basic[row_month.isin([12, 1, 2])]

In [None]:
# Keep only the rows dated June 2022 and write them to CSV with the date index
# included as the first column.
in_june_2022 = (basic.index.year == 2022) & (basic.index.month == 6)
june_2022_data = basic[in_june_2022]
june_2022_data.to_csv(
    'C:\\Users\\Alavis1\\Documents\\Post doc\\Works\\Visuals\\june_2022_data.csv',
    index=True,
)
june_2022_data

In [None]:
# Path separators and characters that Windows rejects in file names; each is
# replaced by '_' when a combination value is embedded in a file name.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def _filename_part(value):
    """Render one combination value as text that is safe inside a file name."""
    return str(value).translate(_UNSAFE_FILENAME_CHARS)


def save_data_subsets(data, combinations, output_directory):
    """Write one CSV per (brand, customer location, region, channel) combination.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'brand_index',
        'Point of Sale Country (Customer Location)',
        'Point of Sale Country (Region Location)' and 'Marketing Sub Channel'.
    combinations : iterable of 4-tuples
        (brand, customer location, region, channel) values to filter on.
    output_directory : str
        Target directory; created if it does not exist.

    Notes
    -----
    Combinations that match no rows are skipped. Files are written with
    ``index=False``, so the frame's index is not saved. Values are sanitised
    before being embedded in the file name: without this, a value containing
    '/' or '\\' would be interpreted as a sub-directory and the write would
    fail. Two values that differ only in an unsafe character map to the same
    file name, and the later write overwrites the earlier one.
    """
    os.makedirs(output_directory, exist_ok=True)
    for comb in combinations:
        subset = data[(data['brand_index'] == comb[0]) &
                      (data['Point of Sale Country (Customer Location)'] == comb[1]) &
                      (data['Point of Sale Country (Region Location)'] == comb[2]) &
                      (data['Marketing Sub Channel'] == comb[3])]
        if subset.empty:
            continue

        brand, loc, reg, chan = (_filename_part(value) for value in comb[:4])
        filename = f'subset_brand{brand}_loc{loc}_reg{reg}_chan{chan}.csv'
        filepath = os.path.join(output_directory, filename)
        subset.to_csv(filepath, index=False)
        print(f"Saved: {filepath}")

# Distinct values of each grouping column within the June 2022 slice.
brands, locations, regions, channels = (
    june_2022_data[column].unique()
    for column in (
        'brand_index',
        'Point of Sale Country (Customer Location)',
        'Point of Sale Country (Region Location)',
        'Marketing Sub Channel',
    )
)

# Full cartesian product of the four value sets; combinations with no matching
# rows are skipped inside save_data_subsets.
combinations = list(product(brands, locations, regions, channels))

output_directory = r'C:\Users\Alavis1\Documents\Post doc\Works\DataSubsets'

save_data_subsets(
    data=june_2022_data,
    combinations=combinations,
    output_directory=output_directory,
)

In [None]:
def generate_scatter_plots(data, combinations, output_pdf_path):
    """Write one Profit-vs-Advertising-Cost page per non-empty combination to a PDF.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'brand_index', 'Point of Sale Country (Customer Location)',
        'Point of Sale Country (Region Location)', 'Marketing Sub Channel',
        '$ Advertising Cost' and '$ Profit'.
    combinations : iterable of 4-tuples
        (brand, customer location, region, channel) values to filter on.
    output_pdf_path : str
        Path of the multi-page PDF to create.

    Notes
    -----
    Each page shows the scatter of the subset. When the subset has at least
    three rows, an OLS fit of profit on cost is overlaid together with two
    95% bands taken from the prediction summary frame: the band for the fitted
    mean (blue) and the wider band for individual observations (green).
    Smaller subsets get the scatter only: fitting intercept + slope to one or
    two points leaves no residual degrees of freedom, so the interval columns
    would be NaN.
    """
    min_obs_for_fit = 3

    with PdfPages(output_pdf_path) as pdf:
        for comb in combinations:
            subset = data[(data['brand_index'] == comb[0]) &
                          (data['Point of Sale Country (Customer Location)'] == comb[1]) &
                          (data['Point of Sale Country (Region Location)'] == comb[2]) &
                          (data['Marketing Sub Channel'] == comb[3])]
            if subset.empty:
                # Nothing to draw; no figure has been created yet.
                continue

            # Sort by x so the fitted line and the filled bands are drawn left to right.
            subset = subset.sort_values(by='$ Advertising Cost')
            cost = subset['$ Advertising Cost']
            profit = subset['$ Profit']

            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                ax.scatter(cost, profit, s=100)

                if len(subset) >= min_obs_for_fit:
                    X = sm.add_constant(cost)
                    model = sm.OLS(profit, X).fit()
                    pred_df = model.get_prediction(X).summary_frame(alpha=0.05)  # 95% CI

                    ax.plot(cost, pred_df['mean'], color='blue')
                    ax.fill_between(cost, pred_df['mean_ci_lower'], pred_df['mean_ci_upper'],
                                    color='blue', alpha=0.2)
                    ax.fill_between(cost, pred_df['obs_ci_lower'], pred_df['obs_ci_upper'],
                                    color='green', alpha=0.2)

                ax.set_title(f'Profit vs. Cost - Brand: {comb[0]}, Location: {comb[1]}, Region: {comb[2]}, Channel: {comb[3]}')
                ax.set_xlabel('Advertising Cost')
                ax.set_ylabel(' Profit')
                ax.grid(True)
                pdf.savefig(fig)
            finally:
                # Always release the figure, even if the fit or the drawing
                # raised, so a long run does not accumulate open figures.
                plt.close(fig)

# Destination of the multi-page PDF (one page per non-empty combination).
output_pdf_path = r'C:\Users\Alavis1\Documents\Post doc\Works\Visuals\all_combinations_plots_CI2.pdf'
# Plot the June 2022 slice for every combination built from its unique values.
generate_scatter_plots(june_2022_data, combinations, output_pdf_path)
