In [24]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score                    # Imported libraries
# Load the combine sheet; the file uses "-" for a measurement that was not recorded,
# so those cells are turned into missing values before any numeric work.
combine_raw = pd.read_csv("Regression Model Data +Tech - 2024 Draft Combine Stats (4).csv")
combine_marked = combine_raw.replace("-", pd.NA)

# These two drill columns are converted to numeric so they can be used in the regression.
drill_columns = ["Shuttle Run", "Three Quarter Sprint"]
for drill in drill_columns:
    combine_marked[drill] = pd.to_numeric(combine_marked[drill])

# Keep only the players that have both drill times available.
data = combine_marked.dropna(subset=drill_columns)
data.head()

Unnamed: 0,PLAYER,POS,Height,Weight,Wingspan,Standing Reach,Hand Length,Hand Width,Lane Agility Time,Shuttle Run,Three Quarter Sprint,Standing Vertical Leap,Max Vertical Leap,PPG,Free Throw %,Three Point %,Two Point %,EFG
0,Michael Ajayi,SF,77.75,227.8,84.75,106.5,9.5,9.75,11.77,3.17,3.21,27.0,34.5,17.2,70.9,47.0,46.6,51.1
1,Melvin Ajinca,SF,79.25,213.8,79.5,106.5,8.5,9.75,11.78,3.0,3.38,26.5,31.0,9.3,79.7,30.9,50.7,47.8
2,Trey Alexander,SG,75.25,187.0,82.5,101.5,8.75,10.0,11.37,2.92,3.21,26.5,32.0,17.6,82.4,33.9,50.4,50.6
3,Izan Almansa,C,81.25,219.6,85.0,110.0,9.25,9.25,10.94,2.96,3.11,28.0,35.0,8.2,37.9,22.2,55.2,53.3
4,Reece Beekman,PG,73.25,196.4,78.5,100.0,8.5,9.0,10.86,3.08,2.94,30.0,36.5,14.3,75.4,31.0,48.7,48.2


In [25]:
# Each broad position group and the roster codes that belong to it.
position_groups = {
    "Guard": ("PG", "SG"),
    "Forward": ("SF", "PF"),
    "Center": ("C",),
}

# Invert the table into a code -> group lookup (PG/SG -> Guard, SF/PF -> Forward, C -> Center).
position_map = {
    code: group
    for group, codes in position_groups.items()
    for code in codes
}

# Relabel POS so players are grouped as Guards, Forwards, or Centers;
# values that are not keys of the lookup are left as they are.
data["POS"] = data["POS"].replace(position_map)
data.head()

Unnamed: 0,PLAYER,POS,Height,Weight,Wingspan,Standing Reach,Hand Length,Hand Width,Lane Agility Time,Shuttle Run,Three Quarter Sprint,Standing Vertical Leap,Max Vertical Leap,PPG,Free Throw %,Three Point %,Two Point %,EFG
0,Michael Ajayi,Forward,77.75,227.8,84.75,106.5,9.5,9.75,11.77,3.17,3.21,27.0,34.5,17.2,70.9,47.0,46.6,51.1
1,Melvin Ajinca,Forward,79.25,213.8,79.5,106.5,8.5,9.75,11.78,3.0,3.38,26.5,31.0,9.3,79.7,30.9,50.7,47.8
2,Trey Alexander,Guard,75.25,187.0,82.5,101.5,8.75,10.0,11.37,2.92,3.21,26.5,32.0,17.6,82.4,33.9,50.4,50.6
3,Izan Almansa,Center,81.25,219.6,85.0,110.0,9.25,9.25,10.94,2.96,3.11,28.0,35.0,8.2,37.9,22.2,55.2,53.3
4,Reece Beekman,Guard,73.25,196.4,78.5,100.0,8.5,9.0,10.86,3.08,2.94,30.0,36.5,14.3,75.4,31.0,48.7,48.2


In [26]:
# Subset of players whose (already regrouped) position is "Guard".
guards = data.loc[data["POS"].eq("Guard")]
guards.head()

Unnamed: 0,PLAYER,POS,Height,Weight,Wingspan,Standing Reach,Hand Length,Hand Width,Lane Agility Time,Shuttle Run,Three Quarter Sprint,Standing Vertical Leap,Max Vertical Leap,PPG,Free Throw %,Three Point %,Two Point %,EFG
2,Trey Alexander,Guard,75.25,187.0,82.5,101.5,8.75,10.0,11.37,2.92,3.21,26.5,32.0,17.6,82.4,33.9,50.4,50.6
4,Reece Beekman,Guard,73.25,196.4,78.5,100.0,8.5,9.0,10.86,3.08,2.94,30.0,36.5,14.3,75.4,31.0,48.7,48.2
9,Carlton Carrington,Guard,75.75,194.8,80.0,99.0,8.5,9.25,11.28,2.99,3.16,28.5,36.5,13.8,78.5,32.2,51.1,49.6
10,Devin Carter,Guard,74.25,193.0,80.75,98.0,8.75,9.0,10.63,2.9,2.87,35.0,42.0,19.7,74.9,37.7,56.3,56.4
11,Stephon Castle,Guard,77.5,210.0,81.0,102.0,8.75,8.75,10.93,2.91,3.19,28.5,37.0,11.1,75.5,26.7,54.4,50.7


In [27]:
# Regress guards' free-throw percentage on their combine athleticism measurements.
#
# `guards` is rebuilt from `data` as an explicit copy instead of being modified in
# place. Re-running this cell therefore always starts from the raw drill times and
# cannot flip them back, and no assignment is made into a filtered slice of `data`.
# Requires `data["POS"]` to already hold the "Guard" label set by the earlier cell.

# Timed drills: a lower time is the better result, so each time is subtracted from
# a fixed constant to make "higher = better", in line with the vertical-leap columns.
# NOTE(review): the constants are the ones used by the original analysis; their
# derivation is not shown in this notebook.
TIMED_DRILL_BASELINES = {
    "Lane Agility Time": 13,
    "Shuttle Run": 3.75,
    "Three Quarter Sprint": 3.75,
}

guards = data.loc[data["POS"] == "Guard"].copy()
for drill, baseline in TIMED_DRILL_BASELINES.items():
    guards[drill] = baseline - guards[drill]

# Predictors (flipped drill times plus the two vertical leaps) and the target.
X = guards[["Lane Agility Time", "Shuttle Run", "Three Quarter Sprint", "Standing Vertical Leap", "Max Vertical Leap"]]
y = guards["Free Throw %"]

# Fixed random_state keeps the train/test split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# R^2 is computed on the held-out test rows only.
r2 = r2_score(y_test, y_pred)

# One fitted coefficient per predictor, indexed by column name.
coefficients = pd.DataFrame(model.coef_, X.columns, columns=["Coefficient"])

print(r2)
print(coefficients)


0.0754864613730728
                        Coefficient
Lane Agility Time          5.179953
Shuttle Run               -3.702730
Three Quarter Sprint      -2.600965
Standing Vertical Leap     0.480515
Max Vertical Leap         -0.895149


### The cell above takes the timed variables (lane agility, shuttle run, and three-quarter sprint) and flips them by subtracting each one from a fixed constant (13 for lane agility, 3.75 for the other two).  This is because for timed drills a lower time is the preferred result.  If you run a regression on the raw times, better performance shows up as a negative relationship, since a lower time corresponds to greater performance.  Thus, I adjusted the data so that a higher value always means a better result, similar to the Standing Vert and Max Vert.

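### After that I ran a linear regression and found the coefficient for each variable.  A larger coefficient (in absolute value) implies a stronger association between that variable and the Y variable (Free Throw %), whereas a smaller coefficient implies the opposite.  Note that the predictors are measured on different scales, so coefficient sizes are only directly comparable once the variables are standardized, and the test-set R² of about 0.075 means the model explains little of the variation.  The above analysis only concerns Guards.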