In [9]:
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

In [10]:
# Load the encoded sports-management dataset from a CSV in the working directory
df = pd.read_csv('sports_management_encoded.csv')
# Preview the first rows (displayed as the cell output)
df.head()

Unnamed: 0,Energy Consumption,Carbon Emissions,Waste Generation,Community Engagement,Volunteer Participation,Health Impact,Water Usage,Material Recycling Rate,Operational Cost Efficiency,Sustainability Score,Social Impact Level,Resource Efficiency,Event Scale_National,Event Scale_Regional,Event Focus_Health-Oriented,Event Focus_Recreational,Event Focus_Youth-Focused,Event Type Classification_Health-Oriented,Event Type Classification_Recreational,Event Type Classification_Youth-Focused
0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2.0,2.0,2.0,0.0,2.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,2.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Summarise the frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102000 entries, 0 to 101999
Data columns (total 20 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   Energy Consumption                         102000 non-null  float64
 1   Carbon Emissions                           102000 non-null  float64
 2   Waste Generation                           102000 non-null  float64
 3   Community Engagement                       102000 non-null  float64
 4   Volunteer Participation                    102000 non-null  float64
 5   Health Impact                              102000 non-null  float64
 6   Water Usage                                102000 non-null  float64
 7   Material Recycling Rate                    102000 non-null  float64
 8   Operational Cost Efficiency                102000 non-null  float64
 9   Sustainability Score                       102000 non-null  float64
 10  Social I

In [12]:
# Target: the 'Sustainability Score' column
y = df['Sustainability Score']
# Features: every remaining column (the target is dropped by its own name)
X = df.drop(columns=[y.name])

In [13]:
# Hold out a test set with a fixed seed so the split is repeatable.
# test_size=0.25 is scikit-learn's default, written out here for clarity.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [14]:
# Count the training rows in each 'Sustainability Score' class
# (displayed as the cell output; shows how balanced the classes are)
y_train.value_counts()

Sustainability Score
2.0    37989
1.0    23219
0.0    15292
Name: count, dtype: int64

In [15]:
#RandomForestClassifier

In [16]:
# Instantiate the baseline RandomForestClassifier.
# random_state is fixed (the same seed the split and the samplers use) so the
# fitted forest and its classification report are repeatable across runs.
model = RandomForestClassifier(random_state=1)

# Fit the model on the original (unresampled) training data
model.fit(X_train, y_train)

In [17]:
# Predict test-set labels with the baseline model trained on the original data.
# X_test is passed as-is: no scaler is applied in the cells shown in this notebook.
y_pred = model.predict(X_test)

In [18]:
#Random Undersampler

In [19]:
# Create a RandomUnderSampler with a fixed seed (random_state=1)
rus = RandomUnderSampler(random_state=1)

In [20]:
# Resample the training split with the random undersampler.
# Only X_train / y_train are passed in; the test split is left untouched.
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)

In [21]:
# Count the rows in each class of the undersampled target (displayed as the cell output)
y_undersampled.value_counts()

Sustainability Score
0.0    15292
1.0    15292
2.0    15292
Name: count, dtype: int64

In [22]:
# Instantiate a new RandomForestClassifier for the undersampled data.
# random_state is fixed so that differences between reports come from the
# resampling method rather than from run-to-run randomness in the forest.
model_undersampled = RandomForestClassifier(random_state=1)

# Fit the new model on the undersampled training data
model_undersampled.fit(X_undersampled, y_undersampled)

In [23]:
# Predict labels for the (unresampled) test features with the model
# trained on the undersampled data
y_pred_undersampled = model_undersampled.predict(X_test)

In [24]:
#Random Oversampler

In [25]:
# Create a RandomOverSampler with a fixed seed (random_state=1)
ros = RandomOverSampler(random_state=1)

In [26]:
# Resample the training split with the random oversampler.
# Only X_train / y_train are passed in; the test split is left untouched.
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

In [27]:
# Count the rows in each class of the oversampled target (displayed as the cell output)
y_oversampled.value_counts()

Sustainability Score
2.0    37989
1.0    37989
0.0    37989
Name: count, dtype: int64

In [28]:
# Instantiate a new RandomForestClassifier for the oversampled data.
# random_state is fixed so that differences between reports come from the
# resampling method rather than from run-to-run randomness in the forest.
model_oversampled = RandomForestClassifier(random_state=1)

# Fit the new model on the oversampled training data
model_oversampled.fit(X_oversampled, y_oversampled)

In [29]:
# Predict labels for the (unresampled) test features with the model
# trained on the oversampled data
y_pred_oversampled = model_oversampled.predict(X_test)

In [30]:
#Cluster Centroids

In [31]:
# Create a ClusterCentroids sampler (imported from imblearn.under_sampling)
# with a fixed seed (random_state=1)
cc_sampler = ClusterCentroids(random_state=1)

In [32]:
# Resample the training split with the ClusterCentroids sampler.
# NOTE: the result is stored in the generic names X_resampled / y_resampled,
# which later cells in this notebook reassign for other samplers.
X_resampled, y_resampled = cc_sampler.fit_resample(X_train, y_train)

In [33]:
# Count the rows in each class of the ClusterCentroids-resampled target
# (displayed as the cell output)
y_resampled.value_counts()

Sustainability Score
0.0    15292
1.0    15292
2.0    15292
Name: count, dtype: int64

In [34]:
# Instantiate a new RandomForestClassifier for the ClusterCentroids-resampled data.
# random_state is fixed so that differences between reports come from the
# resampling method rather than from run-to-run randomness in the forest.
cc_model = RandomForestClassifier(random_state=1)

# Fit the new model on the resampled training data
cc_model.fit(X_resampled, y_resampled)

In [35]:
# Predict labels for the (unresampled) test features with the model
# trained on the ClusterCentroids-resampled data
cc_y_pred = cc_model.predict(X_test)

In [36]:
#SMOTE

In [37]:
# Create a SMOTE sampler (imported from imblearn.over_sampling) with a fixed seed.
# sampling_strategy is passed explicitly as 'auto'.
smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')

In [38]:
# Resample the training split with the SMOTE sampler.
# NOTE: this reassigns X_resampled / y_resampled, replacing the ClusterCentroids
# output held under the same names; cc_model was fitted before this point,
# so these cells must be run in order.
X_resampled, y_resampled = smote_sampler.fit_resample(X_train, y_train)

In [39]:
# Count the rows in each class of the SMOTE-resampled target
# (displayed as the cell output)
y_resampled.value_counts()

Sustainability Score
2.0    37989
1.0    37989
0.0    37989
Name: count, dtype: int64

In [40]:
# Instantiate a new RandomForestClassifier for the SMOTE-resampled data.
# random_state is fixed so that differences between reports come from the
# resampling method rather than from run-to-run randomness in the forest.
smote_model = RandomForestClassifier(random_state=1)

# Fit the new model on the resampled training data
smote_model.fit(X_resampled, y_resampled)

In [41]:
# Predict labels for the (unresampled) test features with the model
# trained on the SMOTE-resampled data
smote_y_pred = smote_model.predict(X_test)

In [42]:
#SMOTEENN

In [43]:
# Create a SMOTEENN sampler (imported from imblearn.combine) with a fixed seed
smote_enn = SMOTEENN(random_state=1)

In [44]:
# Resample the training split with the SMOTEENN sampler.
# NOTE: this reassigns X_resampled / y_resampled again, replacing the SMOTE
# output held under the same names.
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

In [45]:
# Instantiate a new RandomForestClassifier for the SMOTEENN-resampled data.
# random_state is fixed so that differences between reports come from the
# resampling method rather than from run-to-run randomness in the forest.
smoteenn_model = RandomForestClassifier(random_state=1)

# Fit the new model on the resampled training data
smoteenn_model.fit(X_resampled, y_resampled)

In [46]:
# Predict labels for the (unresampled) test features with the model
# trained on the SMOTEENN-resampled data
smoteenn_y_pred = smoteenn_model.predict(X_test)

In [47]:
# Print one classification report per model, each scored against y_test.
# Every heading is paired with the predictions it describes; a dashed line is
# printed between consecutive reports (none before the first or after the last).
labelled_predictions = [
    ("Classification Report - Original Data", y_pred),
    ("Classification Report - Undersampled Data", y_pred_undersampled),
    ("Classification Report - Oversampled Data", y_pred_oversampled),
    ("Classification Report - Resampled Data - CentroidClusters", cc_y_pred),
    ("Classification Report - Resampled Data - SMOTE", smote_y_pred),
    ("Classification Report - Resampled Data - SMOTEENN", smoteenn_y_pred),
]

for position, (heading, predictions) in enumerate(labelled_predictions):
    if position > 0:
        print("---------")
    print(heading)
    print(classification_report(y_test, predictions))

Classification Report - Original Data
              precision    recall  f1-score   support

         0.0       0.18      0.05      0.08      5120
         1.0       0.31      0.18      0.23      7672
         2.0       0.50      0.77      0.61     12708

    accuracy                           0.45     25500
   macro avg       0.33      0.33      0.30     25500
weighted avg       0.38      0.45      0.39     25500

---------
Classification Report - Undersampled Data
              precision    recall  f1-score   support

         0.0       0.20      0.34      0.25      5120
         1.0       0.30      0.34      0.32      7672
         2.0       0.51      0.33      0.40     12708

    accuracy                           0.33     25500
   macro avg       0.34      0.34      0.32     25500
weighted avg       0.38      0.33      0.35     25500

---------
Classification Report - Oversampled Data
              precision    recall  f1-score   support

         0.0       0.19      0.12      0.1