In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split


In [2]:
# import data
df = pd.read_csv('sports_management_dataset.csv')
df.head(10)

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Sustainability Score,Social Impact Level,Resource Efficiency,Event Type Classification
0,Moderate,High,High,Moderate Engagement,Moderate,Moderate Impact,Moderate,Moderate,High Efficiency,Local,Community Development,Low,High Engagement,Moderate Efficiency,Health-Oriented
1,High,Moderate,High,Moderate Engagement,Low,Low Impact,Moderate,Low,Moderate Efficiency,National,Community Development,Moderate,Low Engagement,Moderate Efficiency,Recreational
2,High,High,High,Low Engagement,High,Moderate Impact,Moderate,Moderate,Moderate Efficiency,National,Community Development,High,Low Engagement,High Efficiency,Recreational
3,High,High,High,Moderate Engagement,Moderate,Moderate Impact,Moderate,Moderate,Moderate Efficiency,Regional,Community Development,High,Low Engagement,High Efficiency,Recreational
4,Moderate,High,Low,Low Engagement,Low,Moderate Impact,High,High,High Efficiency,Regional,Community Development,Low,Moderate Engagement,Moderate Efficiency,Community Development
5,Moderate,Moderate,High,Moderate Engagement,High,Low Impact,Moderate,Low,Moderate Efficiency,Regional,Community Development,High,Low Engagement,Moderate Efficiency,Community Development
6,Low,High,Moderate,Moderate Engagement,Low,Moderate Impact,Moderate,Moderate,High Efficiency,Regional,Youth-Focused,High,Moderate Engagement,Moderate Efficiency,Youth-Focused
7,High,High,Moderate,High Engagement,Low,High Impact,Low,Low,High Efficiency,National,Community Development,Moderate,High Engagement,Low Efficiency,Recreational
8,High,High,Moderate,High Engagement,Moderate,High Impact,Low,Low,Moderate Efficiency,Local,Community Development,Moderate,Low Engagement,Low Efficiency,Youth-Focused
9,High,High,High,Moderate Engagement,High,Moderate Impact,Low,Low,High Efficiency,Local,Recreational,Low,Moderate Engagement,Moderate Efficiency,Community Development


In [3]:
df['Energy Consumption'].str.split().str[0]

0         Moderate
1             High
2             High
3             High
4         Moderate
            ...   
101995        High
101996    Moderate
101997    Moderate
101998         Low
101999        High
Name: Energy Consumption, Length: 102000, dtype: object

In [4]:
# create a list of columns to change
cols = ['Community Engagement', 'Health Impact', 'Operational Cost Efficiency', 'Social Impact Level', 'Resource Efficiency']

# loop through the columns and remove the second word
for col in cols:
    df[col] = df[col].str.split().str[0]

df.head()

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Sustainability Score,Social Impact Level,Resource Efficiency,Event Type Classification
0,Moderate,High,High,Moderate,Moderate,Moderate,Moderate,Moderate,High,Local,Community Development,Low,High,Moderate,Health-Oriented
1,High,Moderate,High,Moderate,Low,Low,Moderate,Low,Moderate,National,Community Development,Moderate,Low,Moderate,Recreational
2,High,High,High,Low,High,Moderate,Moderate,Moderate,Moderate,National,Community Development,High,Low,High,Recreational
3,High,High,High,Moderate,Moderate,Moderate,Moderate,Moderate,Moderate,Regional,Community Development,High,Low,High,Recreational
4,Moderate,High,Low,Low,Low,Moderate,High,High,High,Regional,Community Development,Low,Moderate,Moderate,Community Development


In [5]:
# define X and y variables
X= df.drop(columns = 'Sustainability Score')
y = df['Sustainability Score'].values.reshape(-1,1)

# split dataset in to train and test data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 1)
X_train.head()


Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Social Impact Level,Resource Efficiency,Event Type Classification
23375,High,Moderate,Moderate,High,Low,High,Low,Moderate,High,Regional,Youth-Focused,High,Moderate,Community Development
86945,Low,High,Moderate,Low,Low,Moderate,High,Moderate,Low,Regional,Community Development,Moderate,High,Youth-Focused
7149,High,High,Low,Moderate,Low,Low,Moderate,Low,Low,Regional,Community Development,Low,Low,Community Development
85760,Moderate,High,High,Low,Low,Moderate,High,Moderate,Moderate,Local,Community Development,Low,Moderate,Community Development
37809,High,High,High,Low,Low,Moderate,Moderate,Low,Moderate,Local,Youth-Focused,Moderate,Moderate,Community Development


In [6]:
# create a list of columns to encode
ordinal_cols = ['Energy Consumption', 'Carbon Emissions', 'Waste Generation', 'Community Engagement', 
               'Volunteer Participation', 'Health Impact', 'Water Usage', 'Material Recycling Rate', 
               'Operational Cost Efficiency','Social Impact Level', 'Resource Efficiency']


ordinal_encoder =OrdinalEncoder(categories=[['Low', 'Moderate', 'High']], handle_unknown='use_encoded_value', unknown_value=-1)


X_train_ordinal = X_train[ordinal_cols]
X_test_ordinal = X_test[ordinal_cols]


for col in ordinal_cols:
    X_train_ordinal[[col]] = ordinal_encoder.fit_transform(X_train[[col]])
    X_test_ordinal[[col]] = ordinal_encoder.transform(X_test[[col]])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_ordinal[[col]] = ordinal_encoder.fit_transform(X_train[[col]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_ordinal[[col]] = ordinal_encoder.transform(X_test[[col]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_ordinal[[col]] = ordinal_encoder.fit_transform(X_train[[col]]

In [7]:
# create a list of columns to encode using one-hot encoding
nominal_cols = ['Event Scale', 'Event Focus', 'Event Type Classification']

# create a loop to encode the data
one_hot_encoder = OneHotEncoder(drop= 'first', handle_unknown='ignore', sparse_output=False)

# Fit the encoder to the data
one_hot_encoder.fit(X_train[nominal_cols])

# Transform the data
X_train_nominal_col = one_hot_encoder.transform(X_train[nominal_cols])

# Default output is sparse matrix
# encoded_data

# # Get new feature names
one_hot_encoder.get_feature_names_out()

# # Set up the OneHotEncoder so it will transform to Pandas
one_hot_encoder.set_output(transform="pandas")

# # Fit and transform the OneHotEncoder to the columns to encode
X_train_nominal_col = one_hot_encoder.fit_transform(X_train[nominal_cols])
X_test_nominal_col = one_hot_encoder.transform(X_test[nominal_cols])
X_train_nominal_col.head()

Unnamed: 0,Event Scale_National,Event Scale_Regional,Event Focus_Health-Oriented,Event Focus_Recreational,Event Focus_Youth-Focused,Event Type Classification_Health-Oriented,Event Type Classification_Recreational,Event Type Classification_Youth-Focused
23375,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
86945,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
7149,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
85760,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37809,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
# combine the encoded data with the original data
X_train= pd.concat([X_train_ordinal, X_train_nominal_col], axis=1)
X_train.head()

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Social Impact Level,Resource Efficiency,Event Scale_National,Event Scale_Regional,Event Focus_Health-Oriented,Event Focus_Recreational,Event Focus_Youth-Focused,Event Type Classification_Health-Oriented,Event Type Classification_Recreational,Event Type Classification_Youth-Focused
23375,2.0,1.0,1.0,2.0,0.0,2.0,0.0,1.0,2.0,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
86945,0.0,2.0,1.0,0.0,0.0,1.0,2.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
7149,2.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
85760,1.0,2.0,2.0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37809,2.0,2.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [9]:
X_test= pd.concat([X_test_ordinal, X_test_nominal_col], axis=1)
X_test

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Social Impact Level,Resource Efficiency,Event Scale_National,Event Scale_Regional,Event Focus_Health-Oriented,Event Focus_Recreational,Event Focus_Youth-Focused,Event Type Classification_Health-Oriented,Event Type Classification_Recreational,Event Type Classification_Youth-Focused
65226,2.0,1.0,2.0,1.0,2.0,2.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
65591,2.0,2.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
70106,2.0,2.0,2.0,1.0,0.0,0.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84400,2.0,2.0,2.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40117,2.0,2.0,2.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33584,2.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8370,1.0,2.0,1.0,1.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
42484,2.0,2.0,0.0,2.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
45090,1.0,2.0,2.0,0.0,0.0,2.0,1.0,1.0,0.0,2.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [10]:
# create an instance of the OrdinalEncoder and fit transform the y_train
# transform the y_test
ord_enc_y = OrdinalEncoder(categories=[['Low', 'Moderate', 'High']], handle_unknown='use_encoded_value', unknown_value=-1)

y_train = np.ravel(ord_enc_y.fit_transform(y_train))
y_test= np.ravel(ord_enc_y.transform(y_test))

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
# Train the Gradient Boosting classifier
clf1 = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)

# Evaluate the model
print(f'Training Score: {clf1.score(X_train, y_train)}')
print(f'Testing Score: {clf1.score(X_test, y_test)}')


Training Score: 0.49658823529411766
Testing Score: 0.4983529411764706
