# Threshold Adjustment

## Imports

In [1]:
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import precision_recall_curve

In [2]:
data = pd.read_csv('data/ML_Player_performance.csv')
data.head()

Unnamed: 0,games played,minutes played,points per game,field goals made,field goal attempts,field goal percent,3 point made,3 point attempt,3 point %,free throw made,free throw attempts,free throw %,offensive rebounds,defensive rebounds,rebounds,assists,steals,blocks,turnovers,target_5y
0,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0
1,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,2.6,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0
2,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,0.9,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0
3,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,0.9,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1
4,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1


# Preprocessing

In [3]:
X = data.drop(columns=["target_5y"])
y = data["target_5y"]
rob_scaler = RobustScaler()
X_scaled = rob_scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns = X.columns)
X_scaled.head()

Unnamed: 0,games played,minutes played,points per game,field goals made,field goal attempts,field goal percent,3 point made,3 point attempt,3 point %,free throw made,free throw attempts,free throw %,offensive rebounds,defensive rebounds,rebounds,assists,steals,blocks,turnovers
0,-0.9,0.933884,0.352941,0.25,0.666667,-1.206557,1.0,1.5,0.083077,0.585366,0.571429,-0.109375,-0.1,1.0625,0.659794,0.571429,-0.2,0.5,0.375
1,-0.933333,0.892562,0.313725,-0.05,0.452381,-1.87541,1.5,2.083333,0.036923,1.560976,1.357143,0.40625,-0.3,0.1875,-0.041237,1.857143,1.2,0.75,0.75
2,0.366667,-0.066116,-0.078431,-0.05,-0.02381,-0.222951,0.75,1.166667,0.064615,-0.097561,-0.142857,-0.335937,-0.3,0.0,-0.123711,-0.071429,0.0,0.25,0.0
3,-0.166667,-0.371901,0.019608,0.1,0.166667,-0.170492,0.0,0.166667,0.009231,-0.097561,-0.142857,-0.1875,0.2,-0.5,-0.247423,-0.214286,0.2,-0.25,0.0
4,-0.5,-0.380165,-0.215686,-0.25,-0.428571,1.114754,-0.25,-0.166667,-0.686154,0.292683,0.285714,-0.304687,0.2,-0.125,0.0,-0.571429,-0.4,0.5,-0.25


# Base modeling

In [4]:
log_model = LogisticRegression()
base_score = cross_val_score(log_model, X_scaled, y, cv = 10).mean()
base_score

0.7063112326270221

# Threshold adjustment

In [5]:
y_pred_1, y_pred_2 = cross_val_predict(log_model, X_scaled, y, cv = 10, method="predict_proba").T
y_pred_2

array([0.2193943 , 0.32034075, 0.53031123, ..., 0.61981648, 0.46341244,
       0.35829299])

In [6]:
precision, recall, thresholds = precision_recall_curve(y, y_pred_2)

In [7]:
my_dict = {
    "precision": precision[:-1],
    "recall": recall[:-1],
    "thresholds": thresholds
}
my_dict

{'precision': array([0.62198795, 0.62245667, 0.62292609, ..., 1.        , 1.        ,
        1.        ]),
 'recall': array([1.        , 1.        , 1.        , ..., 0.00363196, 0.00242131,
        0.00121065]),
 'thresholds': array([0.03757317, 0.07455231, 0.08870155, ..., 0.9903215 , 0.99134218,
        0.99493358])}

In [8]:
df = pd.DataFrame(my_dict)
df = df.loc[df["precision"] > 0.9].sort_values("thresholds", ascending=True)
new_threshold = df["thresholds"].min()
new_threshold

0.8577690187248399

# Using the new threshold

In [9]:
new_player = pd.read_csv("data/ML_New_player.csv")
new_player

Unnamed: 0,games played,minutes played,points per game,field goals made,field goal attempts,field goal percent,3 point made,3 point attempt,3 point %,free throw made,free throw attempts,free throw %,offensive rebounds,defensive rebounds,rebounds,assists,steals,blocks,turnovers
0,80,31.4,14.3,5.9,11.1,52.5,0.0,0.1,11.1,2.6,3.9,65.4,3.0,5.0,8.0,2.4,1.1,0.8,2.2


In [10]:
model = log_model.fit(X_scaled, y)
model.predict(new_player)

array([1], dtype=int64)