In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  # Import train_test_split function
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier  # Import Decision Tree Classifier
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv("data/integrated.csv")

label = "playoff"
df = data
df['confID'].value_counts()

bins = 11  # Number of bins
# Use the cut function to perform binning
df['height'] = pd.cut(df['height'], bins=bins)
df['weight'] = pd.cut(df['weight'], bins=bins)

In [3]:
features = df.columns
toDrop = ["rank", "semis", "finals", "firstRound"]
for col in toDrop:
    features = features.drop(col)

arr = []

for col in df.columns:

    if "post" in col.lower():
        features = features.drop(col)
        arr.append(col)

assert len(features) < len(df.columns)

X = df[features]
df = X

In [4]:
le = LabelEncoder()
y = le.fit_transform(df['playoff'])
for x in df.columns:
    df[x] = le.fit_transform(df[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[x] = le.fit_transform(df[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[x] = le.fit_transform(df[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[x] = le.fit_transform(df[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [5]:
available_columns = list(features)

key_predictors = []
features_values = pd.DataFrame(df, columns=available_columns)
target_values = df["playoff"].values
chi2_results = chi2(features_values, target_values)
best_chi2_cols = SelectKBest(chi2, k=30)

best_chi2_cols.fit(features_values, target_values)

best_chi2_features = features_values.columns[best_chi2_cols.get_support()]

for k in (best_chi2_features):
    key_predictors.append(k)

key_predictors.append("playoff")
key_predictors.append("year")
key_predictors.append("confID")
df = df[key_predictors]
corr_matrix = df.corr()
#fig = plt.figure(figsize=(36,36), dpi = 480)
#sn.heatmap(df.corr(), annot = True, fmt = '.2f')
#plt.show()
print(len(df.columns))

33


In [6]:
high_correlation_pairs = []
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > 0.91:
            feature1 = corr_matrix.columns[i]
            feature2 = corr_matrix.columns[j]
            correlation_value = corr_matrix.iloc[i, j]
            high_correlation_pairs.append((feature1, feature2, correlation_value))

# Print the high-correlation pairs
for feature1, feature2, correlation_value in high_correlation_pairs:
    print(f"Features: {feature1} and {feature2} have a correlation of {correlation_value:.2f}")
    if feature2 in key_predictors:
        key_predictors.remove(feature2)

df = df[key_predictors]


Features: o_reb and o_dreb have a correlation of 0.93
Features: o_pts and o_fgm have a correlation of 0.96
Features: d_fta and d_ftm have a correlation of 0.97
Features: lost_team and won_team have a correlation of -0.99
Features: homeL and homeW have a correlation of -0.99
Features: awayL and awayW have a correlation of -0.99
Features: confL and confW have a correlation of -0.95
Features: points and minutes have a correlation of 0.93
Features: fgAttempted and minutes have a correlation of 0.94
Features: fgAttempted and points have a correlation of 0.99


In [7]:
clf = DecisionTreeClassifier()

In [8]:
def split_data(df):
    return df[df['confID'] == 0], df[df['confID'] == 1]

In [9]:
def train_model_validation(year, model, data, label, param_grid):
    data['sampleWeight'] = data['year'].apply(lambda year_x: 2 ** (year - year_x - 1) if year > year_x else 1)
    train_data = data[data["year"] < year]
    test_data = data[data["year"] == year]

    X_train, X_val, y_train, y_val = train_test_split(train_data.drop([label], axis=1),
                                                      train_data[label], test_size=0.3, random_state=42)

    X_test, y_test = test_data.drop([label, 'sampleWeight'], axis=1), test_data[label]
    sample_weight = X_train['sampleWeight']
    model.fit(X_train.drop(['sampleWeight'], axis = 1), y_train, sample_weight=sample_weight)
    grid_search = GridSearchCV(model, param_grid, cv=None)
    grid_search.fit(X_val, y_val)

    #how to change the hyper parameters of the model
    sample_weight = train_data["sampleWeight"]
    model.set_params(**grid_search.best_params_)
    model.fit(train_data.drop([label, "sampleWeight"], axis=1), y_train, sample_weight=sample_weight)

    y_pred = model.predict(X_test)

    return y_test, y_pred


def train_model(year, model, data, label):
    data['sampleWeight'] = data['year'].apply(lambda year_x: 2 ** (year - year_x - 1) if year > year_x else 1)
    train_data = data[data["year"] < year]
    test_data = data[data["year"] == year]

    X_train, X_test = train_data.drop([label, 'sampleWeight'], axis=1), test_data.drop([label, 'sampleWeight'], axis=1)
    y_train, y_test = train_data[label], test_data[label]
    sample_weight = train_data['sampleWeight']

    model.fit(X_train, y_train, sample_weight=sample_weight)
    y_pred = model.predict(X_test)

    return y_test, y_pred


def train_evaluate_decision_tree_graph(model, data):
    accuracy_scores = []
    years_tested = []

    data = data.sort_values(by="year")
    data1, data2 = df[df['confID'] == 0], df[df['confID'] == 1]

    data1.drop(['confID'], axis=1, inplace=True)
    data2.drop(['confID'], axis=1, inplace=True)
    years = sorted(data["year"].unique())

    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    for year in years[2:]:
        y_test1, y_pred1 = train_model_validation(year, model, data1, label, param_grid)
        y_test2, y_pred2 = train_model_validation(year, model, data2, label, param_grid)

        y_test = np.concatenate([y_test1, y_test2])
        y_pred = np.concatenate([y_pred1, y_pred2])

        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores.append(accuracy)
        years_tested.append(year)

    return years_tested, accuracy_scores


train_evaluate_decision_tree_graph(clf, df)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1.drop(['confID'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2.drop(['confID'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['sampleWeight'] = data['year'].apply(lambda year_x: 2 ** (year - year_x - 1) if year > year_x else 1)


ValueError: Number of labels=149 does not match number of samples=214