In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')


In [34]:
from sklearn.ensemble import IsolationForest

# Load player data with career stats
player_reg_season_df = pd.read_csv('player_regular_season.txt')
player_career_df = pd.read_csv('player_regular_season_career.txt')

# Select player features for performance analysis
player_reg_season_df = player_reg_season_df[['ilkid', 'firstname', 'lastname', 'pts', 'reb', 'asts', 'stl', 'blk']]
player_career_df = player_career_df[['ilkid', 'pts', 'reb', 'asts', 'stl', 'blk']]
player_data = pd.merge(player_reg_season_df, player_career_df, on='ilkid', suffixes=('_season', '_career'))

# Relevant performance features for outlier detection
player_features = player_data[['pts_season', 'reb_season', 'asts_season', 'stl_season', 'blk_season',
                               'pts_career', 'reb_career', 'asts_career', 'stl_career', 'blk_career']]

# Handle missing values and standardize features
player_features.fillna(0, inplace=True)
scaler = StandardScaler()
player_features_scaled = scaler.fit_transform(player_features)

# Fit Isolation Forest for outlier detection
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(player_features_scaled)
player_data['outstanding'] = outliers

# Filter and display outstanding players
outstanding_players = player_data[player_data['outstanding'] == -1]
print("Outstanding Players:\n", outstanding_players[['firstname', 'lastname', 'pts_season', 'reb_season', 'asts_season',
                                                      'pts_career', 'reb_career', 'asts_career']])


Outstanding Players:
        firstname lastname  pts_season  reb_season  asts_season  pts_career  \
1009       Dolph  Schayes        1121        1080          251       18438   
1252       Dolph  Schayes        1262         920          227       18438   
1453         Bob   Pettit        1466         994          229       20880   
1498         Bob    Cousy        1356         492          642       16960   
1548         Bob   Pettit        1849        1164          189       20880   
...          ...      ...         ...         ...          ...         ...   
20338    Dikembe  Mutombo         322         426           10       11196   
20362  Shaquille   O'Neal        1669         760          200       23583   
20376       Gary   Payton         873         236          469       20829   
20503     Dwyane     Wade        1854         397          520        2845   
20509        Ben  Wallace         721         902          123        4029   

       reb_career  asts_career  
1009    

In [24]:
# Load the team season stats data
team_season_df = pd.read_csv('team_season.txt')

# Selecting features that might be predictive of game outcomes
features = ['o_fgm', 'o_fga', 'o_ftm', 'o_fta', 'o_oreb', 'o_dreb', 'o_reb', 'o_asts',
            'o_pf', 'o_stl', 'o_to', 'o_blk', 'o_3pm', 'o_3pa', 'o_pts', 'd_fgm', 'd_fga',
            'd_ftm', 'd_fta', 'd_oreb', 'd_dreb', 'd_reb', 'd_asts', 'd_pf', 'd_stl', 'd_to',
            'd_blk', 'd_3pm', 'd_3pa', 'd_pts']

# Add the outcome label: 1 if the team won more games than lost, 0 otherwise
team_season_df['win_loss'] = np.where(team_season_df['won'] > team_season_df['lost'], 1, 0)

# Define feature matrix X and target variable y
X = team_season_df[features]
y = team_season_df['win_loss']

# Standardizing features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [25]:
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)


In [26]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)


In [27]:

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.7142857142857143
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.69      0.71       120
           1       0.70      0.74      0.72       118

    accuracy                           0.71       238
   macro avg       0.71      0.71      0.71       238
weighted avg       0.71      0.71      0.71       238



In [28]:

svm_classifier = SVC(kernel='rbf', random_state=42)
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.6722689075630253
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.66      0.67       120
           1       0.66      0.69      0.68       118

    accuracy                           0.67       238
   macro avg       0.67      0.67      0.67       238
weighted avg       0.67      0.67      0.67       238



In [29]:
nn_classifier = MLPClassifier(hidden_layer_sizes=(50,), max_iter=300, random_state=42)
nn_classifier.fit(X_train, y_train)
y_pred_nn = nn_classifier.predict(X_test)
print("Neural Network Accuracy:", accuracy_score(y_test, y_pred_nn))
print("Neural Network Classification Report:\n", classification_report(y_test, y_pred_nn))


Neural Network Accuracy: 0.7184873949579832
Neural Network Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.70      0.71       120
           1       0.71      0.74      0.72       118

    accuracy                           0.72       238
   macro avg       0.72      0.72      0.72       238
weighted avg       0.72      0.72      0.72       238



In [30]:
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
y_pred_nb = nb_classifier.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.6428571428571429
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.60      0.63       120
           1       0.63      0.69      0.66       118

    accuracy                           0.64       238
   macro avg       0.64      0.64      0.64       238
weighted avg       0.64      0.64      0.64       238



In [31]:
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Decision Tree Classification Report:\n", classification_report(y_test, y_pred_dt))


Decision Tree Accuracy: 0.6176470588235294
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.62      0.62       120
           1       0.62      0.61      0.61       118

    accuracy                           0.62       238
   macro avg       0.62      0.62      0.62       238
weighted avg       0.62      0.62      0.62       238



In [32]:
# Voting Classifier (Ensemble of different models)
voting_classifier = VotingClassifier(
    estimators=[
        ('rf', rf_classifier),
        ('svm', svm_classifier),
        ('nn', nn_classifier),
        ('nb', nb_classifier),
        ('dt', dt_classifier)
    ],
    voting='hard'
)
voting_classifier.fit(X_train, y_train)
y_pred_voting = voting_classifier.predict(X_test)
print("Voting Classifier Accuracy:", accuracy_score(y_test, y_pred_voting))
print("Voting Classifier Classification Report:\n", classification_report(y_test, y_pred_voting))


Voting Classifier Accuracy: 0.6974789915966386
Voting Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.69      0.70       120
           1       0.69      0.70      0.70       118

    accuracy                           0.70       238
   macro avg       0.70      0.70      0.70       238
weighted avg       0.70      0.70      0.70       238



In [33]:
# Print a summary table of accuracy scores
accuracy_scores = {
    "Random Forest": accuracy_score(y_test, y_pred_rf),
    "SVM": accuracy_score(y_test, y_pred_svm),
    "Neural Network": accuracy_score(y_test, y_pred_nn),
    "Naive Bayes": accuracy_score(y_test, y_pred_nb),
    "Decision Tree": accuracy_score(y_test, y_pred_dt),
    "Voting Classifier": accuracy_score(y_test, y_pred_voting),
}

print("\nModel Comparison:")
for model, accuracy in accuracy_scores.items():
    print(f"{model}: {accuracy:.2f}")



Model Comparison:
Random Forest: 0.71
SVM: 0.67
Neural Network: 0.72
Naive Bayes: 0.64
Decision Tree: 0.62
Voting Classifier: 0.70
