In [1]:
#import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
# Load the merged dataset CSV straight from GitHub and key the rows by the
# "County" column so each row can be looked up by county name.
url = "https://raw.githubusercontent.com/Bropell/Asthma_Analysis_in_California_Counties/main/Databases/Merged_Datasets.csv"
ap_df = pd.read_csv(url).set_index("County")

In [2]:
# Show the column labels of the loaded frame: the predictor columns plus the
# 'Target' label column (the index, "County", is not listed here).
ap_df.columns

Index(['Pollutant: 1,3-butadiene', 'Pollutant: Acetaldehyde',
       'Pollutant: Benzene', 'Pollutant: Carbon tetrachloride',
       'Pollutant: Formaldehyde', 'Target', 'TotalPop', 'Men', 'Women',
       'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific', 'Income',
       'IncomePerCap', 'Poverty', 'ChildPoverty', 'Professional', 'Service',
       'Office', 'Construction', 'Production', 'Drive', 'Carpool', 'Transit',
       'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute', 'Employed',
       'PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork',
       'Unemployment', 'Max_Voltage', 'Total EV Chargers',
       'Private EV Chargers', 'Public EV Chargers', 'AcresBurned',
       'Registered EV Count'],
      dtype='object')

In [3]:
# One-hot encode the frame a single time and derive both the features and the
# target from that one result (the original encoded the same frame twice).
# The 'Target' column is expanded into the indicator columns 'Target_t' and
# 'Target_f' -- presumably its raw values are the strings 't' / 'f'.
encoded_df = pd.get_dummies(ap_df)

# Create our features: every column except the two target indicators
X = encoded_df.drop(columns=["Target_t", "Target_f"])

# Create our target: 1 where Target == 't', 0 otherwise
y = encoded_df["Target_t"]


In [4]:
# Partition the features and target into training and held-out test sets.
# No model is trained here; random_state pins the shuffle so the split is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [5]:
# Fit a balanced random forest on the training split and predict labels for
# the held-out test split. BalancedRandomForestClassifier is already imported
# in the notebook's import cell, so it is not re-imported here.
# fit() returns the fitted estimator itself, so construction and fitting chain.
brfc = BalancedRandomForestClassifier(random_state=1).fit(X_train, y_train)
predictions = brfc.predict(X_test)
predictions

array([1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1], dtype=uint8)

In [6]:
# Balanced accuracy of the test-set predictions.
# y_pred is an alias for `predictions`; the evaluation cells below use it.
print("Balanced Accuracy Score")
y_pred = predictions
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

Balanced Accuracy Score


0.5982142857142857

In [7]:
# Confusion matrix for the test split: rows are the true classes and columns
# are the predicted classes.
print("Confusion Matrix")
confusion_matrix(y_true=y_test, y_pred=y_pred)

Confusion Matrix


array([[5, 3],
       [3, 4]], dtype=int64)

In [8]:
# Build the imbalanced-learn classification report for the test predictions
# and print it; the report is a preformatted text table.
print("Classification Report")
report_text = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(report_text)

Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.62      0.62      0.57      0.62      0.60      0.36         8
          1       0.57      0.57      0.62      0.57      0.60      0.36         7

avg / total       0.60      0.60      0.60      0.60      0.60      0.36        15



In [9]:
# Read the fitted forest's feature-importance scores and display the raw array.
# The values line up positionally with the columns of X (they are paired with
# X.columns in the next cell).
importances = brfc.feature_importances_
importances

array([0.01541088, 0.02645088, 0.02491339, 0.        , 0.03060752,
       0.02415284, 0.02314055, 0.01564594, 0.01669123, 0.02441451,
       0.03769759, 0.0182453 , 0.02184982, 0.02220931, 0.01752965,
       0.01677897, 0.01393664, 0.01892557, 0.02105858, 0.01251622,
       0.01590761, 0.01988534, 0.01418836, 0.01535563, 0.01252431,
       0.03172497, 0.03308336, 0.0299724 , 0.02337432, 0.03049675,
       0.01866467, 0.01138172, 0.01314275, 0.02236577, 0.04040524,
       0.01856003, 0.05162988, 0.04418512, 0.02306118, 0.04799692,
       0.0331096 , 0.0468087 ])

In [10]:
# Pair each importance score with its feature name and list the pairs from
# most to least important. Tuples compare by score first, then by name.
importance_name_pairs = zip(brfc.feature_importances_, X.columns)
sorted(importance_name_pairs, reverse=True)

[(0.05162987579284694, 'Max_Voltage'),
 (0.0479969177039576, 'Public EV Chargers'),
 (0.046808700686528724, 'Registered EV Count'),
 (0.04418512226807164, 'Total EV Chargers'),
 (0.04040523628715015, 'FamilyWork'),
 (0.03769758958255173, 'Black'),
 (0.03310959517994914, 'AcresBurned'),
 (0.033083357690871645, 'Walk'),
 (0.031724968462319, 'Transit'),
 (0.030607520299150415, 'Pollutant: Formaldehyde'),
 (0.03049674658116731, 'MeanCommute'),
 (0.029972398774072748, 'OtherTransp'),
 (0.026450879537164816, 'Pollutant: Acetaldehyde'),
 (0.024913390618801476, 'Pollutant: Benzene'),
 (0.024414505305158013, 'White'),
 (0.024152842142497244, 'TotalPop'),
 (0.02337432450737943, 'WorkAtHome'),
 (0.023140549828000286, 'Men'),
 (0.023061182196831967, 'Private EV Chargers'),
 (0.022365767089229518, 'SelfEmployed'),
 (0.02220930608151818, 'Pacific'),
 (0.02184982053641406, 'Asian'),
 (0.02105858145631949, 'Professional'),
 (0.019885340413945015, 'Construction'),
 (0.01892557103807449, 'ChildPoverty')