In [63]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from itertools import combinations
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [64]:
# import data
df = pd.read_csv('sports_management_dataset.csv')
df.head(10)

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Sustainability Score,Social Impact Level,Resource Efficiency,Event Type Classification
0,Moderate,High,High,Moderate Engagement,Moderate,Moderate Impact,Moderate,Moderate,High Efficiency,Local,Community Development,Low,High Engagement,Moderate Efficiency,Health-Oriented
1,High,Moderate,High,Moderate Engagement,Low,Low Impact,Moderate,Low,Moderate Efficiency,National,Community Development,Moderate,Low Engagement,Moderate Efficiency,Recreational
2,High,High,High,Low Engagement,High,Moderate Impact,Moderate,Moderate,Moderate Efficiency,National,Community Development,High,Low Engagement,High Efficiency,Recreational
3,High,High,High,Moderate Engagement,Moderate,Moderate Impact,Moderate,Moderate,Moderate Efficiency,Regional,Community Development,High,Low Engagement,High Efficiency,Recreational
4,Moderate,High,Low,Low Engagement,Low,Moderate Impact,High,High,High Efficiency,Regional,Community Development,Low,Moderate Engagement,Moderate Efficiency,Community Development
5,Moderate,Moderate,High,Moderate Engagement,High,Low Impact,Moderate,Low,Moderate Efficiency,Regional,Community Development,High,Low Engagement,Moderate Efficiency,Community Development
6,Low,High,Moderate,Moderate Engagement,Low,Moderate Impact,Moderate,Moderate,High Efficiency,Regional,Youth-Focused,High,Moderate Engagement,Moderate Efficiency,Youth-Focused
7,High,High,Moderate,High Engagement,Low,High Impact,Low,Low,High Efficiency,National,Community Development,Moderate,High Engagement,Low Efficiency,Recreational
8,High,High,Moderate,High Engagement,Moderate,High Impact,Low,Low,Moderate Efficiency,Local,Community Development,Moderate,Low Engagement,Low Efficiency,Youth-Focused
9,High,High,High,Moderate Engagement,High,Moderate Impact,Low,Low,High Efficiency,Local,Recreational,Low,Moderate Engagement,Moderate Efficiency,Community Development


In [65]:
# get an overview of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102000 entries, 0 to 101999
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   Energy Consumption           102000 non-null  object
 1   Carbon Emissions             102000 non-null  object
 2   Waste Generation             102000 non-null  object
 3   Community Engagement         102000 non-null  object
 4   Volunteer Participation      102000 non-null  object
 5   Health Impact                102000 non-null  object
 6   Water Usage                  102000 non-null  object
 7   Material Recycling Rate      102000 non-null  object
 8   Operational Cost Efficiency  102000 non-null  object
 9   Event Scale                  102000 non-null  object
 10  Event Focus                  102000 non-null  object
 11  Sustainability Score         102000 non-null  object
 12  Social Impact Level          102000 non-null  object
 13  Resource Effic

In [66]:
# check for number of unique values for each column to get an idea of how to encode the data
df.nunique()

Energy Consumption             3
Carbon Emissions               3
Waste Generation               3
Community Engagement           3
Volunteer Participation        3
Health Impact                  3
Water Usage                    3
Material Recycling Rate        3
Operational Cost Efficiency    3
Event Scale                    3
Event Focus                    4
Sustainability Score           3
Social Impact Level            3
Resource Efficiency            3
Event Type Classification      4
dtype: int64

In [67]:
# create a list of columns to change
cols = ['Energy Consumption', 'Community Engagement', 'Health Impact', 'Operational Cost Efficiency', 'Social Impact Level', 'Resource Efficiency']

# loop through the columns and remove the second word
for col in cols:
    df[col] = df[col].str.split().str[0]

df.head()

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Sustainability Score,Social Impact Level,Resource Efficiency,Event Type Classification
0,Moderate,High,High,Moderate,Moderate,Moderate,Moderate,Moderate,High,Local,Community Development,Low,High,Moderate,Health-Oriented
1,High,Moderate,High,Moderate,Low,Low,Moderate,Low,Moderate,National,Community Development,Moderate,Low,Moderate,Recreational
2,High,High,High,Low,High,Moderate,Moderate,Moderate,Moderate,National,Community Development,High,Low,High,Recreational
3,High,High,High,Moderate,Moderate,Moderate,Moderate,Moderate,Moderate,Regional,Community Development,High,Low,High,Recreational
4,Moderate,High,Low,Low,Low,Moderate,High,High,High,Regional,Community Development,Low,Moderate,Moderate,Community Development


In [68]:
# create a list of columns to encode
encode_cols = ['Energy Consumption', 'Carbon Emissions', 'Waste Generation', 'Community Engagement', 
               'Volunteer Participation', 'Health Impact', 'Water Usage', 'Material Recycling Rate', 
               'Operational Cost Efficiency', 'Sustainability Score' ,'Social Impact Level', 'Resource Efficiency']

# create a loop to encode the data
ordinal_encoder = OrdinalEncoder(categories=[['Low', 'Moderate', 'High']])

for col in encode_cols:
    df[col] = ordinal_encoder.fit_transform(df[[col]])

df.head(10)


Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Sustainability Score,Social Impact Level,Resource Efficiency,Event Type Classification
0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,Local,Community Development,0.0,2.0,1.0,Health-Oriented
1,2.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,National,Community Development,1.0,0.0,1.0,Recreational
2,2.0,2.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,National,Community Development,2.0,0.0,2.0,Recreational
3,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,Regional,Community Development,2.0,0.0,2.0,Recreational
4,1.0,2.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,Regional,Community Development,0.0,1.0,1.0,Community Development
5,1.0,1.0,2.0,1.0,2.0,0.0,1.0,0.0,1.0,Regional,Community Development,2.0,0.0,1.0,Community Development
6,0.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,Regional,Youth-Focused,2.0,1.0,1.0,Youth-Focused
7,2.0,2.0,1.0,2.0,0.0,2.0,0.0,0.0,2.0,National,Community Development,1.0,2.0,0.0,Recreational
8,2.0,2.0,1.0,2.0,1.0,2.0,0.0,0.0,1.0,Local,Community Development,1.0,0.0,0.0,Youth-Focused
9,2.0,2.0,2.0,1.0,2.0,1.0,0.0,0.0,2.0,Local,Recreational,0.0,1.0,1.0,Community Development


In [69]:
# create a list of columns to encode using one-hot encoding
one_hot_cols = ['Event Scale', 'Event Focus', 'Event Type Classification']

# create a loop to encode the data
one_hot_encoder = OneHotEncoder(drop = 'first', handle_unknown='ignore', sparse_output=False)

# Fit the encoder to the data
one_hot_encoder.fit(df[one_hot_cols])

# Transform the data
encoded_data = one_hot_encoder.transform(df[one_hot_cols])

# Default output is sparse matrix
encoded_data

# Get new feature names
one_hot_encoder.get_feature_names_out()

# Set up the OneHotEncoder so it will transform to Pandas
one_hot_encoder.set_output(transform="pandas")

# Fit and transform the OneHotEncoder to the columns to encode
encoded_data = one_hot_encoder.fit_transform(df[one_hot_cols])
encoded_data.head()

Unnamed: 0,Event Scale_National,Event Scale_Regional,Event Focus_Health-Oriented,Event Focus_Recreational,Event Focus_Youth-Focused,Event Type Classification_Health-Oriented,Event Type Classification_Recreational,Event Type Classification_Youth-Focused
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
# create a copy of the original data
df_copy = df.copy()

# remove the columns that were encoded
df = df.drop(columns=one_hot_cols)

df.columns

Index(['Energy Consumption', 'Carbon Emissions', 'Waste Generation',
       'Community Engagement', 'Volunteer Participation', 'Health Impact',
       'Water Usage', 'Material Recycling Rate', 'Operational Cost Efficiency',
       'Sustainability Score', 'Social Impact Level', 'Resource Efficiency'],
      dtype='object')

In [71]:
# combine the encoded data with the original data
df = pd.concat([df, encoded_data], axis=1)
df.head()

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Sustainability Score,Social Impact Level,Resource Efficiency,Event Scale_National,Event Scale_Regional,Event Focus_Health-Oriented,Event Focus_Recreational,Event Focus_Youth-Focused,Event Type Classification_Health-Oriented,Event Type Classification_Recreational,Event Type Classification_Youth-Focused
0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2.0,2.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,2.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Get the features (everything except the "price" column)
X = df.copy().drop(columns="Sustainability Score")
X.head()

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Social Impact Level,Resource Efficiency,Event Scale_National,Event Scale_Regional,Event Focus_Health-Oriented,Event Focus_Recreational,Event Focus_Youth-Focused,Event Type Classification_Health-Oriented,Event Type Classification_Recreational,Event Type Classification_Youth-Focused
0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2.0,2.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,2.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
# Get the target column
y = df["Sustainability Score"].values.reshape(-1,1)
y[0:5]

array([[0.],
       [1.],
       [2.],
       [2.],
       [0.]])

In [74]:
# Use the Sklearn `train_test_split()` function to split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [75]:
# Create a function to calculate VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

In [76]:
# Calculate vif for the dataframe

calc_vif(X).sort_values("VIF")

Unnamed: 0,variables,VIF
16,Event Type Classification_Health-Oriented,1.18609
13,Event Focus_Health-Oriented,1.189076
17,Event Type Classification_Recreational,1.369351
14,Event Focus_Recreational,1.37038
15,Event Focus_Youth-Focused,1.372922
18,Event Type Classification_Youth-Focused,1.375072
11,Event Scale_National,1.574517
4,Volunteer Participation,1.761607
12,Event Scale_Regional,1.803156
7,Material Recycling Rate,2.063006


In [77]:
# Create another X variable by dropping engine-location 
# and the 4 columns with the highest VIF scores

X_vif = X.drop(columns=['Carbon Emissions', 'Energy Consumption', 'Waste Generation', 'Resource Efficiency'])

# Recalculate the VIF scores
calc_vif(X_vif).sort_values('VIF')

Unnamed: 0,variables,VIF
12,Event Type Classification_Health-Oriented,1.173579
9,Event Focus_Health-Oriented,1.175472
13,Event Type Classification_Recreational,1.343955
10,Event Focus_Recreational,1.346005
11,Event Focus_Youth-Focused,1.346248
14,Event Type Classification_Youth-Focused,1.351606
7,Event Scale_National,1.522567
1,Volunteer Participation,1.7137
8,Event Scale_Regional,1.731188
4,Material Recycling Rate,1.985393


In [78]:
# Split the data into training and testing sets
X_full_train, X_full_test, X_vif_train, X_vif_test, y_train, y_test = train_test_split(X, X_vif, y, random_state=14)

In [79]:
# Train two models using the different X variables

# Create the models
gb1 = GradientBoostingClassifier()
gb2 = GradientBoostingClassifier()

# Fit the models
gb1.fit(X_full_train, y_train)
gb2.fit(X_vif_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [80]:
# Provided code to create the adjusted r-squared function
def r2_adj(x, y, model):
    r2 = model.score(x,y)
    n_cols = x.shape[1]
    return 1 - (1 - r2) * (len(y) - 1) / (len(y) - n_cols - 1)

In [81]:
# Compare the adjusted r-squared of the two models
adj_score1 = r2_adj(X_full_test, y_test, gb1)
adj_score2 = r2_adj(X_vif_test, y_test, gb2)
print(f"1 Feature Adjusted R2: {adj_score1}")
print(f"2 Feature Adjusted R2: {adj_score2}")

1 Feature Adjusted R2: 0.5003728122018039
2 Feature Adjusted R2: 0.5004512343000299
