In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Load the raw export, parse the fiscal date column, and index the frame by it.
file_path = r'/Users/behnam/Library/CloudStorage/OneDrive-McMasterUniversity/Works/expedia_data_for_macc_without_index_names_2024- revised.csv'
base = pd.read_csv(file_path).assign(
    FISCAL_DATE=lambda frame: pd.to_datetime(frame['FISCAL_DATE'])
)
basic = base.set_index('FISCAL_DATE')
# Last expression of the cell: rendered by the notebook as the cell output.
basic

In [None]:
def linear_regression(X, y):
    """Fit an ordinary least-squares line to a single predictor.

    ``X`` is reshaped into a one-column feature matrix before fitting, so it
    must provide ``reshape`` (e.g. a NumPy array).

    Returns the fitted ``LinearRegression`` estimator.
    """
    features = X.reshape(-1, 1)
    # ``fit`` returns the estimator itself, so the fitted model can be returned directly.
    return LinearRegression().fit(features, y)

def ransac_regression(X, y, random_state=0):
    """Fit a RANSAC-wrapped linear regression to a single predictor.

    RANSAC draws random sample subsets, so an unseeded estimator can return a
    different line every time it is fitted on the same data. A fixed default
    seed keeps repeated fits (and re-runs of the notebook) reproducible.

    Parameters
    ----------
    X : array with ``reshape``
        Predictor values; reshaped into a one-column feature matrix.
    y : array-like
        Target values.
    random_state : int, RandomState instance or None, default 0
        Forwarded to ``RANSACRegressor``. Pass ``None`` to restore the
        previous unseeded behaviour.

    Returns
    -------
    The fitted ``RANSACRegressor``.
    """
    model = RANSACRegressor(LinearRegression(), random_state=random_state)
    model.fit(X.reshape(-1, 1), y)
    return model

def generalized_logistic(X, a, b, c, d, v):
    """Generalized logistic (Richards) curve.

    Computes ``a + (c - a) / (1 + exp(b * (d - X))) ** (1 / v)``.

    Parameters
    ----------
    X : scalar or array
        Points at which to evaluate the curve.
    a : float
        Value approached as ``X -> -inf`` (for ``b > 0``).
    b : float
        Growth rate.
    c : float
        Value approached as ``X -> +inf`` (for ``b > 0``).
    d : float
        Horizontal location of the transition.
    v : float
        Asymmetry parameter; ``v == 1`` gives the ordinary logistic curve.
    """
    # The 1/v exponent must cover the whole denominator. Raising only the
    # exponential term to 1/v just rescales b, which makes v redundant and
    # reduces the model to a plain four-parameter logistic.
    return a + (c - a) / (1 + np.exp(b * (d - X))) ** (1 / v)

def gompertz_curve(X, a, b, c, d):
    """Gompertz growth curve with a vertical offset.

    Computes ``a + b * exp(-c * exp(-d * X))``.

    Parameters
    ----------
    X : scalar or array
        Points at which to evaluate the curve.
    a : float
        Vertical offset (baseline).
    b : float
        Amplitude of the sigmoid on top of the baseline.
    c : float
        Displacement along the X axis.
    d : float
        Growth rate.
    """
    # b scales the sigmoid. Adding it instead would fix the amplitude at 1 and
    # make a and b indistinguishable to the fitter.
    return a + b * np.exp(-c * np.exp(-d * X))

def exponential_rise(X, a, b, c):
    """Exponential rise towards a plateau.

    Computes ``a - b * exp(-c * X)``: the curve starts at ``a - b`` for
    ``X == 0`` and approaches ``a`` as ``X`` grows (for ``c > 0``).

    Parameters
    ----------
    X : scalar or array
        Points at which to evaluate the curve.
    a : float
        Plateau value.
    b : float
        Gap between the plateau and the value at ``X == 0``.
    c : float
        Rate at which the plateau is approached.
    """
    # b scales the decaying term. Adding it instead would fix the rise at 1
    # and make a and b indistinguishable to the fitter.
    return a - b * np.exp(-c * X)

def michaelis_menten(X, a, b):
    """Michaelis-Menten saturation curve: ``a * X / (b + X)``.

    ``a`` is the value approached for large ``X``; ``b`` is the ``X`` at which
    the curve reaches ``a / 2``.
    """
    numerator = a * X
    denominator = b + X
    return numerator / denominator

def logarithmic_function(X, a, b, c):
    """Shifted logarithmic curve: ``a + b * ln(X - c)``.

    The result is only finite where ``X > c``; ``np.log`` yields ``-inf`` or
    ``nan`` elsewhere.
    """
    shifted = X - c
    return a + b * np.log(shifted)

def _fit_curve(curve, X_sorted, y_sorted, p0, lower, upper):
    """Fit ``curve`` by bounded least squares and return its predictions at ``X_sorted``."""
    lower = np.asarray(lower, dtype=float)
    upper = np.asarray(upper, dtype=float)
    # curve_fit refuses a starting point outside the bounds (e.g. a negative
    # min(y) against a lower bound of 0), so pull the guess into the box first.
    start = np.clip(np.asarray(p0, dtype=float), lower, upper)
    popt, _ = curve_fit(curve, X_sorted, y_sorted, p0=start, bounds=(lower, upper), maxfev=10000)
    return curve(X_sorted, *popt)


def fit_and_predict(X_sorted, y_sorted, method):
    """Fit the model named by ``method`` and return its predictions at ``X_sorted``.

    Parameters
    ----------
    X_sorted : numpy.ndarray
        1-D predictor values (must support ``reshape``).
    y_sorted : numpy.ndarray
        Target values aligned with ``X_sorted``.
    method : str
        One of ``'linear_regression'``, ``'ransac_regression'``,
        ``'generalized_logistic'``, ``'gompertz_curve'``, ``'exponential_rise'``,
        ``'michaelis_menten'`` or ``'logarithmic_function'``.

    Returns
    -------
    numpy.ndarray
        Fitted values, one per element of ``X_sorted``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        silently returned ``None``).
    """
    if method == 'linear_regression':
        model = linear_regression(X_sorted, y_sorted)
        return model.predict(X_sorted.reshape(-1, 1))
    if method == 'ransac_regression':
        model = ransac_regression(X_sorted, y_sorted)
        return model.predict(X_sorted.reshape(-1, 1))

    # Parametric curves: choose the function, a starting point, and the box bounds.
    if method == 'generalized_logistic':
        curve = generalized_logistic
        p0 = [np.min(y_sorted), 1, np.max(y_sorted), np.median(X_sorted), 1]
        lower, upper = np.zeros(5), np.full(5, np.inf)
    elif method == 'gompertz_curve':
        curve = gompertz_curve
        p0 = [np.min(y_sorted), 0.1, 0.1, 0.1]
        lower, upper = np.zeros(4), np.full(4, np.inf)
    elif method == 'exponential_rise':
        curve = exponential_rise
        p0 = [np.max(y_sorted), 1, 1]
        lower, upper = np.zeros(3), np.full(3, np.inf)
    elif method == 'michaelis_menten':
        curve = michaelis_menten
        p0 = [np.max(y_sorted), np.median(X_sorted)]
        lower, upper = np.zeros(2), np.full(2, np.inf)
    elif method == 'logarithmic_function':
        curve = logarithmic_function
        # log(X - c) is only finite while c < min(X). Starting at the median
        # (as before) made the residuals non-finite at the initial point, so
        # the fit always failed. Start just below the smallest X and keep c
        # strictly below it throughout the optimisation.
        x_min = np.min(X_sorted)
        margin = 1e-9 * max(1.0, abs(x_min))
        p0 = [np.min(y_sorted), 1, x_min - 1.0]
        lower = [0, 0, -np.inf]
        upper = [np.inf, np.inf, x_min - margin]
    else:
        raise ValueError(f'Unknown regression method: {method!r}')

    return _fit_curve(curve, X_sorted, y_sorted, p0, lower, upper)

# Input locations: the per-base-file ranking of regression methods and the
# directory that holds the monthly subset CSVs.
aggregated_mse_path = r'C:\Users\Alavis1\Documents\Works\Budgetsets2\regression_results\final_aggregated_mse_results.csv'
subset_directory = r'C:\Users\Alavis1\Documents\Works\Budgetsets2'
best_methods_df = pd.read_csv(aggregated_mse_path)

# Largest advertising cost / profit over every monthly file that exists; used
# later as common axis limits for all subplots.
max_x = max_y = -np.inf

for index, row in best_methods_df.iterrows():
    file_base = row['base_filename']
    for month in range(1, 13):
        month_name = pd.Timestamp(f"2024-{month:02d}-01").strftime("%B")
        monthly_file = f'{file_base}_monthly_{month_name}.csv'
        file_path = os.path.join(subset_directory, monthly_file)
        # Missing or empty monthly files contribute nothing to the limits.
        if os.path.exists(file_path):
            monthly_data = pd.read_csv(file_path)
            if not monthly_data.empty:
                X = monthly_data['$ Advertising Cost'].values
                y = monthly_data['$ Profit'].values

                max_x = max(max_x, X.max())
                max_y = max(max_y, y.max())

# Generate the PDF with subplots: one PDF per base file, one row of axes per
# month (left column = best method, right column = second-best method).

def _overlay_best_fit(ax_left, ax_right, X_sorted, y_sorted, best_method, month_label, text_y, **line_style):
    """Fit ``best_method`` on the given data, draw it on both axes, and print its MSE on the left axis."""
    y_pred = np.maximum(fit_and_predict(X_sorted, y_sorted, best_method), 0)
    mse = mean_squared_error(y_sorted, y_pred)
    label = f'{best_method.replace("_", " ").title()} ({month_label})'
    ax_left.plot(X_sorted, y_pred, label=label, **line_style)
    ax_right.plot(X_sorted, y_pred, label=label, **line_style)
    ax_left.text(0.05, text_y, f'MSE {month_label}: {mse:,.2f}', transform=ax_left.transAxes, fontsize=12, verticalalignment='top')


def _overlay_comparison_month(ax_left, ax_right, file_base, month_label, best_method):
    """Overlay the best-method fit of another month's file, if that file exists and is non-empty."""
    comparison_path = os.path.join(subset_directory, f'{file_base}_monthly_{month_label}.csv')
    if not os.path.exists(comparison_path):
        return
    comparison_data = pd.read_csv(comparison_path)
    if comparison_data.empty:
        return
    X_other = comparison_data['$ Advertising Cost'].values
    # Every monthly file is read with '$ Profit' elsewhere in this notebook;
    # the November overlay used '$ Expedia Profit', which does not match.
    y_other = comparison_data['$ Profit'].values
    order = np.argsort(X_other)
    _overlay_best_fit(ax_left, ax_right, X_other[order], y_other[order], best_method,
                      month_label, 0.85, color='blue', linestyle='--')


# Panels that additionally show a neighbouring month's fit:
# panel month -> (label of the panel's own fit, month whose fit is overlaid).
_COMPARISON_MONTHS = {4: ('April', 'May'), 12: ('December', 'November')}

for index, row in best_methods_df.iterrows():
    file_base = row['base_filename']
    best_method = row['best_method']
    second_best_method = row['second_best_method']

    pdf_path = os.path.join(subset_directory, f'{file_base}_regression_plots.pdf')
    with PdfPages(pdf_path) as pdf:
        fig, axes = plt.subplots(12, 2, figsize=(20, 60))
        try:
            for month in range(1, 13):
                month_name = pd.Timestamp(f"2024-{month:02d}-01").strftime("%B")
                monthly_file = f'{file_base}_monthly_{month_name}.csv'
                file_path = os.path.join(subset_directory, monthly_file)
                if not os.path.exists(file_path):
                    continue

                monthly_data = pd.read_csv(file_path)
                if monthly_data.empty:
                    continue

                X = monthly_data['$ Advertising Cost'].values
                y = monthly_data['$ Profit'].values
                # Plain NumPy array: the reordering below is positional, whereas
                # indexing a Series with an integer array is label-based.
                years = pd.to_datetime(monthly_data['FISCAL_DATE']).dt.year.to_numpy()

                sorted_indices = np.argsort(X)
                X_sorted = X[sorted_indices]
                y_sorted = y[sorted_indices]
                years_sorted = years[sorted_indices]

                # One scatter colour per calendar year (cycled if more than four years).
                unique_years = np.unique(years_sorted)
                colors_list = ['blue', 'green', 'yellow', 'brown']
                colors = {year: colors_list[i % len(colors_list)] for i, year in enumerate(unique_years)}

                ax_left, ax_right = axes[month - 1]

                for year in unique_years:
                    mask = (years_sorted == year)
                    # Only the first month's panels label the scatter points in the legend.
                    scatter_label = f'Data {year}' if month == 1 else ""
                    ax_left.scatter(X_sorted[mask], y_sorted[mask], color=colors[year], label=scatter_label)
                    ax_right.scatter(X_sorted[mask], y_sorted[mask], color=colors[year], label=scatter_label)

                if month in _COMPARISON_MONTHS:
                    own_label, neighbour_label = _COMPARISON_MONTHS[month]
                    _overlay_best_fit(ax_left, ax_right, X_sorted, y_sorted, best_method,
                                      own_label, 0.95, color='red')
                    _overlay_comparison_month(ax_left, ax_right, file_base, neighbour_label, best_method)

                # Predictions are clipped at zero before plotting and before the MSE is computed.
                y_pred_best = np.maximum(fit_and_predict(X_sorted, y_sorted, best_method), 0)
                ax_left.plot(X_sorted, y_pred_best, color='red', label=best_method.replace('_', ' ').title())
                ax_left.set_xlim(0, max_x)
                ax_left.set_ylim(0, max_y)
                ax_left.set_title(f'{file_base} - {month_name} (Best)')
                ax_left.set_xlabel('$ Advertising Cost')
                ax_left.set_ylabel('$ Profit')
                ax_left.legend()
                ax_left.grid(True)

                y_pred_second_best = np.maximum(fit_and_predict(X_sorted, y_sorted, second_best_method), 0)
                ax_right.plot(X_sorted, y_pred_second_best, color='green', label=second_best_method.replace('_', ' ').title())
                ax_right.set_xlim(0, max_x)
                ax_right.set_ylim(0, max_y)
                ax_right.set_title(f'{file_base} - {month_name} (Second Best)')
                ax_right.set_xlabel('$ Advertising Cost')
                ax_right.set_ylabel('$ Profit')
                ax_right.legend()
                ax_right.grid(True)

                mse_best = mean_squared_error(y_sorted, y_pred_best)
                mse_second_best = mean_squared_error(y_sorted, y_pred_second_best)
                ax_left.text(0.05, 0.05, f'MSE Best: {mse_best:,.2f}', transform=ax_left.transAxes, fontsize=12, verticalalignment='bottom')
                ax_right.text(0.05, 0.05, f'MSE Second Best: {mse_second_best:,.2f}', transform=ax_right.transAxes, fontsize=12, verticalalignment='bottom')

            fig.tight_layout()
            pdf.savefig(fig)
        finally:
            # Release the large 12x2 figure even when a fit raises part-way through.
            plt.close(fig)
print("PDF with scatter plots and regression lines has been generated.")
