In [None]:
from IPython.display import display, HTML
# Inject a CSS rule that makes the notebook's `.container` element span the full page width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

import pandas as pd

In [None]:
# Read the product-wise ARR export and drop the 'Subscription Product ARR (converted)' column.
raw_report_path = "Product Wise ARR Report - Sheet5 (1).csv"
product_df = pd.read_csv(raw_report_path).drop(columns=['Subscription Product ARR (converted)'])
product_df

In [None]:
# Aggregate the line items to one row per (Account Name, Product Name) pair:
#   Total Quantity         = sum(Quantity)
#   Weighted Average Price = sum(Quantity * Offered Price (converted)) / sum(Quantity)
# The sums are computed with vectorised group aggregations rather than a Python
# lambda per group; this is faster and avoids the pandas deprecation warning about
# `groupby.apply` operating on the grouping columns.
# NOTE: this cell overwrites `product_df` (the 'Quantity' column is gone afterwards),
# so re-run the loading cell before running this cell a second time.
line_value = product_df['Quantity'] * product_df['Offered Price (converted)']
group_sums = (
    product_df.assign(**{'Line Value': line_value})
    .groupby(['Account Name', 'Product Name'])[['Quantity', 'Line Value']]
    .sum()
)

product_df = (
    pd.DataFrame({
        # Float, so the column dtype does not depend on how Quantity was stored.
        'Total Quantity': group_sums['Quantity'].astype(float),
        # A group whose quantities sum to zero yields NaN/inf here.
        'Weighted Average Price': group_sums['Line Value'] / group_sums['Quantity'],
    })
    .reset_index()
    # The account is only needed to define the aggregation level, not afterwards.
    .drop(columns=['Account Name'])
)
product_df

In [None]:
# Replace every occurrence of 'Training Essentials ' (with a trailing space) by
# 'Training Essentials' so both spellings end up under one product name.
normalised_names = product_df['Product Name'].str.replace(
    'Training Essentials ', 'Training Essentials', regex=False
)
product_df['Product Name'] = normalised_names

In [None]:
# Number of rows available per product, shown as a two-column table.
rows_per_product = product_df.groupby(['Product Name']).size()
rows_per_product.reset_index(name="count")

# Polynomial Regularization

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, make_scorer

import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from scipy.stats import uniform, randint
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import MinMaxScaler
import os

# L2 - Ridge Regularization

### Hyperparameter tuning considering the number of variables as well

# Generalized Code

In [None]:
import os
# Make sure the output folders for plots and SVR models exist.
for output_dir in ('images', 'SVR_models'):
    os.makedirs(output_dir, exist_ok=True)

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from scipy.stats import uniform, expon

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import uniform
import matplotlib.pyplot as plt
import joblib
import os

# Tune, evaluate, save and plot one SVR price model per product.
#
# For each product: hold out 30% of the rows, run a randomized hyperparameter
# search with 5-fold cross-validation on the remaining rows, score the best model
# on the held-out rows, dump it to SVR_models/ and save a scatter plot to images/.
# One summary row per product ends up in `polynomial_degree_df`.

# The search space is the same for every product, so it is built once.
# 'degree' limits the polynomial kernel.
param_dist = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 0.5, 1],
    'kernel': ['linear', 'poly', 'rbf'],
    'degree': [2, 3],
}
cv_folds = 5

model_folder_path = 'SVR_models'
os.makedirs(model_folder_path, exist_ok=True)
os.makedirs('images', exist_ok=True)

# Result rows are collected in a list and turned into a DataFrame once at the end,
# instead of re-concatenating a growing DataFrame on every iteration.
svr_result_rows = []

# Group by the bare column name: with a one-element list key recent pandas versions
# yield 1-tuples such as ('Name',), which would leak into file names, plot titles
# and the 'Product Name' column of the results table.
# NOTE(review): the product name is used verbatim in file names below; a name
# containing a path separator would break the save calls.
for grouped_value, grouped_df in product_df.groupby('Product Name'):
    X = grouped_df['Total Quantity'].values.reshape(-1, 1)
    y = grouped_df['Weighted Average Price']

    # A single row cannot be split into a train and a test set.
    n_rows = len(grouped_df)
    if n_rows < 2:
        print(f'Skipping {grouped_value}: {n_rows} row(s) is too few to split into train and test sets')
        continue

    # Splitting the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    # k-fold cross-validation needs at least k training rows; skip the product
    # rather than abort the whole loop.
    if len(X_train) < cv_folds:
        print(f'Skipping {grouped_value}: {len(X_train)} training row(s), '
              f'{cv_folds}-fold cross-validation needs at least {cv_folds}')
        continue

    # Randomized search over the SVR hyperparameters
    random_search = RandomizedSearchCV(SVR(), param_distributions=param_dist,
                                       n_iter=30, cv=cv_folds, random_state=0, n_jobs=-1)
    random_search.fit(X_train, y_train)

    # With the default refit=True the best estimator has already been refit on the
    # whole training split, so no additional fit is needed.
    best_model = random_search.best_estimator_

    # Evaluating on the held-out rows
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'Mean Squared Error: {mse}')
    print(f'R² Score: {r2}')

    # Saving the model
    model_filename = os.path.join(model_folder_path, f'SVR_model_{grouped_value}.joblib')
    joblib.dump(best_model, model_filename)

    # Plotting training data, test data and the predictions for the test data
    fig, ax = plt.subplots()
    ax.scatter(X_train, y_train, color='green', label='Training Data')
    ax.scatter(X_test, y_test, color='red', label='Testing Data')
    ax.scatter(X_test, y_pred, color='blue', label='Predicted Values')
    ax.set_xlabel('Total Quantity')
    ax.set_ylabel('Weighted Average Price')
    ax.set_title(f'SVR Model for {grouped_value}')
    ax.legend()
    plot_file_path = f'images/{grouped_value}_SVR_plot.png'
    fig.savefig(plot_file_path)
    # Close the figure so figures do not pile up in memory across products.
    plt.close(fig)

    svr_result_rows.append({
        "Product Name": grouped_value,
        "Mean Squared Error": mse,
        "R² Score": r2,
        "Best Parameters": random_search.best_params_,
        "Plot Image Path": plot_file_path,
    })

# Explicit columns keep the table well-formed even if every product was skipped.
polynomial_degree_df = pd.DataFrame(
    svr_result_rows,
    columns=["Product Name", "Mean Squared Error", "R² Score", "Best Parameters", "Plot Image Path"],
)

In [None]:
# Per-product evaluation table: MSE, R², best hyperparameters and plot path.
polynomial_degree_df

In [None]:
# For every product keep the single row with the lowest 'Mean Squared Error'
# and renumber the resulting rows from 0.
lowest_mse_labels = polynomial_degree_df.groupby('Product Name')['Mean Squared Error'].idxmin()
best_models_df = polynomial_degree_df.loc[lowest_mse_labels].reset_index(drop=True)
best_models_df

# Save the trained models

In [None]:
import os
import joblib
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline

# Folder that receives the saved per-product models.
models_output_dir = 'Models'
os.makedirs(models_output_dir, exist_ok=True)

In [None]:
# Refit the selected model for every product on its training split and save it
# under Models/, where predict_price() looks for it.
#
# Two layouts of `best_models_df` are supported:
#   * columns 'Degree' and 'Best Alpha' -> StandardScaler + PolynomialFeatures + Ridge
#     (presumably produced by a Ridge tuning run that is not part of this notebook)
#   * column 'Best Parameters'          -> SVR built from those keyword arguments
#     (this is the table the SVR tuning loop in this notebook produces)
# Reading 'Degree' / 'Best Alpha' unconditionally raises KeyError on the SVR table.
has_ridge_columns = {'Degree', 'Best Alpha'}.issubset(best_models_df.columns)
has_svr_columns = 'Best Parameters' in best_models_df.columns
if not (has_ridge_columns or has_svr_columns):
    raise KeyError(
        "best_models_df needs either the columns 'Degree' and 'Best Alpha' "
        "or the column 'Best Parameters'; found: " + ", ".join(map(str, best_models_df.columns))
    )

for index, row in best_models_df.iterrows():
    product_name = row['Product Name']
    # Grouping by a one-element list makes recent pandas store 1-tuples as group
    # keys; unwrap them so the comparison and the file name use the plain name.
    if isinstance(product_name, tuple) and len(product_name) == 1:
        product_name = product_name[0]

    grouped_df = product_df[product_df['Product Name'] == product_name]

    X = grouped_df['Total Quantity'].values.reshape(-1, 1)
    y = grouped_df['Weighted Average Price']

    # Same split parameters as during tuning, so the model sees the same training rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    if has_ridge_columns:
        # PolynomialFeatures requires an integer degree.
        degree = int(row['Degree'])
        alpha = row['Best Alpha']
        model = make_pipeline(StandardScaler(), PolynomialFeatures(degree=degree), Ridge(alpha=alpha, random_state=0))
        filename = f'Models/model_{product_name}_degree_{degree}_alpha_{alpha:.4f}.joblib'
    else:
        model = SVR(**row['Best Parameters'])
        filename = f'Models/model_{product_name}_SVR.joblib'

    model.fit(X_train, y_train)

    # Save the model
    joblib.dump(model, filename)

# Prediction using the saved model

In [None]:
import os
import joblib

def predict_price(product_name, total_quantity, models_dir='Models'):
    """Predict the price of a product at a given quantity with its saved model.

    Parameters
    ----------
    product_name : str
        Product whose model should be used. Only files named
        ``model_<product_name>_*.joblib`` are considered, so a product whose name
        merely contains ``product_name`` is never picked by accident.
    total_quantity : number
        Quantity to predict the price for; passed to the model as ``[[total_quantity]]``.
    models_dir : str, optional
        Directory that holds the saved models. Defaults to ``'Models'``.

    Returns
    -------
    The first element of ``model.predict([[total_quantity]])``, or the string
    ``"No model found for product: <product_name>"`` when the directory does not
    exist or contains no model file for the product.
    """
    not_found_message = f"No model found for product: {product_name}"

    if not os.path.isdir(models_dir):
        return not_found_message

    expected_prefix = f'model_{product_name}_'
    candidates = [
        file for file in os.listdir(models_dir)
        if file.startswith(expected_prefix) and file.endswith('.joblib')
    ]
    if not candidates:
        return not_found_message

    # Several files can exist for one product (e.g. left over from earlier runs).
    # Pick the most recently written one so the result does not depend on the
    # arbitrary order of os.listdir(); the file name breaks ties.
    model_file = max(
        candidates,
        key=lambda name: (os.path.getmtime(os.path.join(models_dir, name)), name),
    )

    # Load the model
    # NOTE: joblib.load unpickles the file and can execute arbitrary code; only
    # point models_dir at files this notebook (or another trusted source) wrote.
    model_path = os.path.join(models_dir, model_file)
    model = joblib.load(model_path)

    # Make a prediction
    predicted_price = model.predict([[total_quantity]])

    return predicted_price[0]

In [None]:
# Example: price estimate for one product at a quantity of 1700.
product_name, total_quantity = 'Virtual Role-Play (Missions + Quick Update)', 1700
predicted_price = predict_price(product_name=product_name, total_quantity=total_quantity)
print(f"Predicted Price: {predicted_price}")

In [None]:
# Rows of the selected product, ordered by ascending weighted average price.
is_selected_product = product_df['Product Name'] == product_name
product_df.loc[is_selected_product].sort_values(by='Weighted Average Price')