In [313]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from datetime import datetime, timezone

import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from models import line_of_best_fit, linreg_predict
from fetcher import fetch_scores

In [441]:
def crossval_model(X, y, size=0.3):
  """ Scores a potential model on a single held-out split and reports its Mean-Squared Error and R2 values.

  The data is split once with train_test_split (fixed random_state=3, so the
  split is reproducible), the model is fitted on the training part with
  line_of_best_fit, and linreg_predict is evaluated on the test part.

  Params:
    X (matrix): The X-features for the projected model
    y (array):  The true y-values
    size (float): Forwarded to train_test_split as test_size (default 0.3)

  Returns:
    str (string): A string showing the MSE and R2 values of the model on the held-out split.
  """

  Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=size, random_state=3)

  model = line_of_best_fit(Xtrain, ytrain)
  # linreg_predict's result is indexed by "mse" and "r2" below.
  preds = linreg_predict(Xtest, ytest, model)

  # Single-quoted keys: reusing the f-string's own quote character inside a
  # replacement field is a SyntaxError on Python versions before 3.12.
  return f"MSE: {preds['mse']:.16f}, R2: {preds['r2']:.16f}"

In [442]:
# Identifier passed to fetch_scores; presumably a player/profile id -- confirm against fetcher.
# NOTE(review): the name `id` shadows Python's builtin id() for the rest of the notebook.
id = "bytesy"
scores_df = fetch_scores(id)

In [443]:
# Convert the "dateset" column to pandas datetimes in place (the column is overwritten).
scores_df["dateset"] = pd.to_datetime(scores_df["dateset"])

In [444]:
# Age of every score, measured back from the newest "dateset" value in the frame.
# .dt.days keeps whole days only, so the result is (whole days) / 7.
max_date = scores_df["dateset"].max()
scores_df["weeks_since"] = (max_date - scores_df["dateset"]).dt.days.div(7)
scores_df["weeks_since"].head()

0    30.714286
1     9.000000
2    53.571429
3    12.285714
4    74.000000
Name: weeks_since, dtype: float64

In [445]:
# Exponential time-decay weight per score: exp(-lambda_value * weeks_since / 2).
# The trailing "/ 2" halves the exponent, so the effective decay rate is
# lambda_value / 2 per week (0.05 with the value set here). The newest score
# (weeks_since == 0) gets weight 1.
lambda_value = 0.1
scores_df["decay_weight"] = np.exp(-lambda_value * scores_df["weeks_since"] / 2)

In [446]:
# Feature matrix: the three rating columns, one row per score.
X = (
    scores_df[["mod_passRating", "mod_accRating", "mod_techRating"]]
    .values
    .reshape(-1, 3)
)
# Degree-2 polynomial expansion of the three features.
X_poly = PolynomialFeatures(degree=2).fit_transform(X)

# Targets as column vectors: raw accuracy, its complement, and the log of the complement.
y = scores_df["accuracy"].to_numpy().reshape(-1, 1)
y_inv = 1 - y
# NOTE(review): np.log gives -inf where accuracy == 1 and NaN where accuracy > 1;
# this assumes accuracy is a fraction strictly below 1 -- confirm against the data.
y_inv_log = np.log(y_inv)

# Per-row sample weights for the regressions.
weights = scores_df["decay_weight"].values

In [447]:
# Weighted linear regression on the polynomial features, targeting raw accuracy.
# fit() returns the estimator itself, so construction and fitting are chained.
# NOTE(review): the following cell rebinds `model` / `pred_accuracies` and
# overwrites the "pred_accuracy" column written here.
model = LinearRegression().fit(X_poly, y, sample_weight=weights)

# In-sample predictions stored next to the observed values.
pred_accuracies = model.predict(X_poly)
scores_df["pred_accuracy"] = pred_accuracies

In [448]:
# Same weighted regression, but fitted against log(1 - accuracy).
model = LinearRegression().fit(X_poly, y_inv_log, sample_weight=weights)

# Map the log-space predictions back to accuracy: acc = 1 - exp(pred).
pred_accuracies = 1 - np.exp(model.predict(X_poly))
scores_df["pred_accuracy"] = pred_accuracies

In [449]:
# current_date = datetime.now()
# scores_df["days_since"] = (current_date - scores_df["dateset"]).dt.days
# scores_df["days_since"]

In [450]:
# now = pd.Timestamp.now()
# decay = 1000 * 60 * 60 * 24 * 7    # 1 week, expressed in milliseconds
# alpha = 3                          # penalty scaling
# max_days = 912 
# target_stars = 6

In [451]:
# # scores_df["time_weight"] = 1 + (now - scores_df["dateset"]).dt.total_seconds() / decay

# # scores_df["time_weight"] = 1 + (np.maximum((current_date - scores_df["dateset"]).dt.days, 0) / decay)
# # scores_df["distance"] = 2 * np.abs(target_stars - scores_df["stars"])
# # scores_df["front"] = np.where(target_stars > scores_df["stars"], scores_df["distance"] ** 3, 1)
# # scores_df["weight"] = 1 / (1 + scores_df["distance"] * scores_df["time_weight"] * scores_df["front"])

# scores_df["time_weight"] = 1 + (now - scores_df["dateset"]).dt.total_seconds() / decay
# scores_df["distance"] = 2 * np.abs(scores_df["stars"] - scores_df["stars"].mean())  # Compare to mean stars value
# scores_df["front"] = np.where(scores_df["stars"] > scores_df["stars"].mean(), scores_df["distance"] ** 3, 1)
# scores_df["weight"] = 1 / (1 + scores_df["distance"] * scores_df["time_weight"] * scores_df["front"])

In [452]:
# from sklearn.linear_model import LinearRegression

# model = LinearRegression()
# model.fit(X, y, sample_weight=weights)

In [453]:
# from sklearn.preprocessing import PolynomialFeatures

# poly = PolynomialFeatures(degree=2)
# X_poly = poly.fit_transform(X)

# model = LinearRegression()
# model.fit(X_poly, y, sample_weight=weights)

In [454]:
# pred_accuracies = model.predict(X_poly)

# # Add predicted scores to the dataframe
# scores_df["pred_accuracy"] = pred_accuracies

In [455]:
# Observed accuracy against the "stars" column, coloured by each score's decay
# weight; "weeks_since" is added to the hover tooltip.
px.scatter(
    scores_df,
    x="stars",
    y="accuracy",
    color="decay_weight",
    hover_data=["weeks_since"],
)

In [456]:
# Tag the observed rows so they can be distinguished from the model output.
scores_df['accuracy_type'] = 'Actual'

# Build a parallel frame of predictions with the same column names as the
# observed data: stars, accuracy, accuracy_type.
predicted_df = (
    scores_df[['stars', 'pred_accuracy']]
    .rename(columns={'pred_accuracy': 'accuracy'})
    .assign(accuracy_type='Predicted')
)

# Stack observed and predicted rows into one long frame for plotting.
plot_columns = ['stars', 'accuracy', 'accuracy_type']
combined_df = pd.concat(
    [scores_df[plot_columns], predicted_df[plot_columns]],
    ignore_index=True,
)

# Create the plot
fig = px.scatter(
    combined_df,
    x="stars",
    y="accuracy",  # The 'accuracy' column holds both actual and predicted values
    color="accuracy_type",  # Color points by 'accuracy_type'
    # Keys must match column names in combined_df; the previous "score" key
    # matched nothing, so the y-axis label was never applied.
    labels={"accuracy": "Accuracy", "stars": "Stars"},  # Axis labels
    title="Actual vs Predicted Accuracy for Maps"
)

# Show the plot
fig.show()