In [273]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [274]:
# import data
df = pd.read_csv('sports_management_dataset.csv')
df.head(10)

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Sustainability Score,Social Impact Level,Resource Efficiency,Event Type Classification
0,Moderate,High,High,Moderate Engagement,Moderate,Moderate Impact,Moderate,Moderate,High Efficiency,Local,Community Development,Low,High Engagement,Moderate Efficiency,Health-Oriented
1,High,Moderate,High,Moderate Engagement,Low,Low Impact,Moderate,Low,Moderate Efficiency,National,Community Development,Moderate,Low Engagement,Moderate Efficiency,Recreational
2,High,High,High,Low Engagement,High,Moderate Impact,Moderate,Moderate,Moderate Efficiency,National,Community Development,High,Low Engagement,High Efficiency,Recreational
3,High,High,High,Moderate Engagement,Moderate,Moderate Impact,Moderate,Moderate,Moderate Efficiency,Regional,Community Development,High,Low Engagement,High Efficiency,Recreational
4,Moderate,High,Low,Low Engagement,Low,Moderate Impact,High,High,High Efficiency,Regional,Community Development,Low,Moderate Engagement,Moderate Efficiency,Community Development
5,Moderate,Moderate,High,Moderate Engagement,High,Low Impact,Moderate,Low,Moderate Efficiency,Regional,Community Development,High,Low Engagement,Moderate Efficiency,Community Development
6,Low,High,Moderate,Moderate Engagement,Low,Moderate Impact,Moderate,Moderate,High Efficiency,Regional,Youth-Focused,High,Moderate Engagement,Moderate Efficiency,Youth-Focused
7,High,High,Moderate,High Engagement,Low,High Impact,Low,Low,High Efficiency,National,Community Development,Moderate,High Engagement,Low Efficiency,Recreational
8,High,High,Moderate,High Engagement,Moderate,High Impact,Low,Low,Moderate Efficiency,Local,Community Development,Moderate,Low Engagement,Low Efficiency,Youth-Focused
9,High,High,High,Moderate Engagement,High,Moderate Impact,Low,Low,High Efficiency,Local,Recreational,Low,Moderate Engagement,Moderate Efficiency,Community Development


In [275]:
# get an overview of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102000 entries, 0 to 101999
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   Energy Consumption           102000 non-null  object
 1   Carbon Emissions             102000 non-null  object
 2   Waste Generation             102000 non-null  object
 3   Community Engagement         102000 non-null  object
 4   Volunteer Participation      102000 non-null  object
 5   Health Impact                102000 non-null  object
 6   Water Usage                  102000 non-null  object
 7   Material Recycling Rate      102000 non-null  object
 8   Operational Cost Efficiency  102000 non-null  object
 9   Event Scale                  102000 non-null  object
 10  Event Focus                  102000 non-null  object
 11  Sustainability Score         102000 non-null  object
 12  Social Impact Level          102000 non-null  object
 13  Resource Effic

In [276]:
# count the number of values and percentage of total in Sustainability Score column
df['Sustainability Score'].value_counts(normalize=True)


Sustainability Score
High        0.497029
Moderate    0.302853
Low         0.200118
Name: proportion, dtype: float64

In [277]:
# check for number of unique values for each column to get an idea of how to encode the data
df.nunique()

Energy Consumption             3
Carbon Emissions               3
Waste Generation               3
Community Engagement           3
Volunteer Participation        3
Health Impact                  3
Water Usage                    3
Material Recycling Rate        3
Operational Cost Efficiency    3
Event Scale                    3
Event Focus                    4
Sustainability Score           3
Social Impact Level            3
Resource Efficiency            3
Event Type Classification      4
dtype: int64

In [278]:
df['Energy Consumption'].str.split().str[0]

0         Moderate
1             High
2             High
3             High
4         Moderate
            ...   
101995        High
101996    Moderate
101997    Moderate
101998         Low
101999        High
Name: Energy Consumption, Length: 102000, dtype: object

In [279]:
# create a list of columns to change
cols = ['Community Engagement', 'Health Impact', 'Operational Cost Efficiency', 'Social Impact Level', 'Resource Efficiency']

# loop through the columns and remove the second word
for col in cols:
    df[col] = df[col].str.split().str[0]

df.head()

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Sustainability Score,Social Impact Level,Resource Efficiency,Event Type Classification
0,Moderate,High,High,Moderate,Moderate,Moderate,Moderate,Moderate,High,Local,Community Development,Low,High,Moderate,Health-Oriented
1,High,Moderate,High,Moderate,Low,Low,Moderate,Low,Moderate,National,Community Development,Moderate,Low,Moderate,Recreational
2,High,High,High,Low,High,Moderate,Moderate,Moderate,Moderate,National,Community Development,High,Low,High,Recreational
3,High,High,High,Moderate,Moderate,Moderate,Moderate,Moderate,Moderate,Regional,Community Development,High,Low,High,Recreational
4,Moderate,High,Low,Low,Low,Moderate,High,High,High,Regional,Community Development,Low,Moderate,Moderate,Community Development


In [280]:
# create a list of columns to encode
encode_cols = ['Energy Consumption', 'Carbon Emissions', 'Waste Generation', 'Community Engagement', 
               'Volunteer Participation', 'Health Impact', 'Water Usage', 'Material Recycling Rate', 
               'Operational Cost Efficiency', 'Sustainability Score' ,'Social Impact Level', 'Resource Efficiency']

# create a loop to encode the data
ordinal_encoder = OrdinalEncoder(categories=[['Low', 'Moderate', 'High']])

for col in encode_cols:
    df[col] = ordinal_encoder.fit_transform(df[[col]])

df.head(10)


Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Event Scale,Event Focus,Sustainability Score,Social Impact Level,Resource Efficiency,Event Type Classification
0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,Local,Community Development,0.0,2.0,1.0,Health-Oriented
1,2.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,National,Community Development,1.0,0.0,1.0,Recreational
2,2.0,2.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,National,Community Development,2.0,0.0,2.0,Recreational
3,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,Regional,Community Development,2.0,0.0,2.0,Recreational
4,1.0,2.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,Regional,Community Development,0.0,1.0,1.0,Community Development
5,1.0,1.0,2.0,1.0,2.0,0.0,1.0,0.0,1.0,Regional,Community Development,2.0,0.0,1.0,Community Development
6,0.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,Regional,Youth-Focused,2.0,1.0,1.0,Youth-Focused
7,2.0,2.0,1.0,2.0,0.0,2.0,0.0,0.0,2.0,National,Community Development,1.0,2.0,0.0,Recreational
8,2.0,2.0,1.0,2.0,1.0,2.0,0.0,0.0,1.0,Local,Community Development,1.0,0.0,0.0,Youth-Focused
9,2.0,2.0,2.0,1.0,2.0,1.0,0.0,0.0,2.0,Local,Recreational,0.0,1.0,1.0,Community Development


In [281]:
# create a list of columns to encode using one-hot encoding
one_hot_cols = ['Event Scale', 'Event Focus', 'Event Type Classification']

# create a loop to encode the data
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit the encoder to the data
one_hot_encoder.fit(df[one_hot_cols])

# Transform the data
encoded_data = one_hot_encoder.transform(df[one_hot_cols])

# Default output is sparse matrix
encoded_data

# Get new feature names
one_hot_encoder.get_feature_names_out()

# Set up the OneHotEncoder so it will transform to Pandas
one_hot_encoder.set_output(transform="pandas")

# Fit and transform the OneHotEncoder to the columns to encode
encoded_data = one_hot_encoder.fit_transform(df[one_hot_cols])
encoded_data.head()

Unnamed: 0,Event Scale_Local,Event Scale_National,Event Scale_Regional,Event Focus_Community Development,Event Focus_Health-Oriented,Event Focus_Recreational,Event Focus_Youth-Focused,Event Type Classification_Community Development,Event Type Classification_Health-Oriented,Event Type Classification_Recreational,Event Type Classification_Youth-Focused
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [282]:
encoded_data.columns

Index(['Event Scale_Local', 'Event Scale_National', 'Event Scale_Regional',
       'Event Focus_Community Development', 'Event Focus_Health-Oriented',
       'Event Focus_Recreational', 'Event Focus_Youth-Focused',
       'Event Type Classification_Community Development',
       'Event Type Classification_Health-Oriented',
       'Event Type Classification_Recreational',
       'Event Type Classification_Youth-Focused'],
      dtype='object')

In [283]:
# remove the words prior to the _ in the column names
encoded_data.columns = encoded_data.columns.str.split('_').str[1]
encoded_data.head()

Unnamed: 0,Local,National,Regional,Community Development,Health-Oriented,Recreational,Youth-Focused,Community Development.1,Health-Oriented.1,Recreational.1,Youth-Focused.1
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [284]:
# add Event on the end of the column names
encoded_data.columns = encoded_data.columns + ' Event'
encoded_data.head()

Unnamed: 0,Local Event,National Event,Regional Event,Community Development Event,Health-Oriented Event,Recreational Event,Youth-Focused Event,Community Development Event.1,Health-Oriented Event.1,Recreational Event.1,Youth-Focused Event.1
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [285]:
# create a copy of the original data
df_copy = df.copy()

# remove the columns that were encoded
df = df.drop(columns=one_hot_cols)

df.columns

Index(['Energy Consumption', 'Carbon Emissions', 'Waste Generation',
       'Community Engagement', 'Volunteer Participation', 'Health Impact',
       'Water Usage', 'Material Recycling Rate', 'Operational Cost Efficiency',
       'Sustainability Score', 'Social Impact Level', 'Resource Efficiency'],
      dtype='object')

In [286]:
# combine the encoded data with the original data
df = pd.concat([df, encoded_data], axis=1)
df.head()

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Sustainability Score,...,National Event,Regional Event,Community Development Event,Health-Oriented Event,Recreational Event,Youth-Focused Event,Community Development Event.1,Health-Oriented Event.1,Recreational Event.1,Youth-Focused Event.1
0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2.0,2.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,2.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
