In [17]:
import warnings
# Install a global 'ignore' filter so that no warnings are displayed.
# NOTE(review): this also hides warnings raised by pandas / scikit-learn that
# may point at real problems (e.g. shape or deprecation warnings).
warnings.filterwarnings('ignore')

In [18]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [19]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [29]:
# Columns kept from the Spotify CSV: the audio features, the year and the
# popularity score.
columns = [
    "acousticness", "danceability", "duration_ms", "energy", "explicit",
    "instrumentalness", "key", "liveness", "loudness", "mode",
    "popularity", "speechiness", "tempo", "valence", "year",
]

# Column that is split off as the prediction target.
target = ["popularity"]

In [36]:
# Load the data: read the CSV, keep only the columns of interest, then discard
# columns that are entirely null and any rows that still contain a null.
file_path = Path('clean_spotify.csv')
df = (
    pd.read_csv(file_path)
    .loc[:, columns]
    .copy()
    .dropna(axis='columns', how='all')
    .dropna()
)

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,year
0,0.0495,0.612,240400,0.807,0,0.0177,10,0.101,-2.81,1,89,0.0336,124.053,0.398,2013
1,0.695,0.445,244360,0.537,1,1.7e-05,4,0.0944,-8.532,0,88,0.04,122.769,0.131,2013
2,0.00286,0.908,290320,0.669,1,0.0,7,0.237,-2.827,1,87,0.0738,112.238,0.662,2002
3,0.0302,0.949,284200,0.661,1,0.0,5,0.0454,-4.244,0,86,0.0572,104.504,0.76,2000
4,0.0371,0.764,196520,0.705,0,1.9e-05,3,0.0943,-5.279,0,86,0.0278,101.003,0.672,2015


# Split the Data into Training and Testing

In [31]:
# Create our features: every column except the target.
# The drop is driven by `target` so the feature/target split has a single
# source of truth instead of a second hard-coded column name.
X = df.drop(columns=target)
# One-hot encode any non-numeric feature columns; numeric columns are passed
# through unchanged.
X = pd.get_dummies(X)

# Create our target as a single-column DataFrame (an independent copy of df's column).
y = df.loc[:, target].copy()

In [32]:
# Summary statistics (count, mean, std, quartiles, min/max) for each feature column.
X.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,year
count,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0
mean,0.128173,0.667814,228594.973725,0.721549,0.276662,0.015372,5.369397,0.181726,-5.514082,0.553323,0.103783,120.158442,0.552966,2009.52035
std,0.172584,0.140608,39249.796103,0.152872,0.447463,0.088371,3.61527,0.14091,1.93895,0.497277,0.096148,26.990475,0.220845,5.875532
min,1.9e-05,0.129,113000.0,0.0549,0.0,0.0,0.0,0.0215,-20.514,0.0,0.0232,60.019,0.0381,1998.0
25%,0.0135,0.581,203506.0,0.624,0.0,0.0,2.0,0.0884,-6.49,0.0,0.0397,98.986,0.39,2004.0
50%,0.0558,0.676,223186.0,0.739,0.0,0.0,6.0,0.124,-5.285,1.0,0.061,120.028,0.56,2010.0
75%,0.176,0.765,247946.0,0.84,1.0,6.9e-05,8.0,0.242,-4.168,1.0,0.129,134.199,0.731,2015.0
max,0.976,0.975,484146.0,0.999,1.0,0.985,11.0,0.853,-0.276,1.0,0.576,210.851,0.973,2020.0


In [33]:
# Check the balance of our target values: number of rows per distinct popularity value.
y['popularity'].value_counts()

0     126
69     74
68     73
73     67
74     65
     ... 
16      1
17      1
18      1
24      1
89      1
Name: popularity, Length: 76, dtype: int64

In [34]:
# Split the data set into Train & Test values.
# random_state=1 keeps the split consistent between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [35]:
# View the number of training rows per target value.
y_train.value_counts()

popularity
0             92
73            57
68            57
69            56
61            51
              ..
16             1
17             1
24             1
26             1
88             1
Length: 73, dtype: int64

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier. For each algorithm, be sure to complete the following steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classification report using the `classification_report_imbalanced` function from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score.

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [25]:
# Fit a BalancedRandomForestClassifier on the training split.
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
# y_train is a single-column DataFrame; flatten it to the 1-D shape that fit()
# expects rather than relying on an implicit column-vector conversion.
brf.fit(X_train, y_train.values.ravel())

BalancedRandomForestClassifier(random_state=1)

In [26]:
# Calculate the balanced accuracy score of the balanced random forest.
# Predictions are made on the held-out test features and compared with y_test.
y_pred = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7885466545953005

In [30]:
# Compute the raw confusion matrix of test labels versus predictions.
# (This cell shows the plain array; it is not wrapped in a DataFrame here.)
confusion_matrix(y_test, y_pred)

array([[   71,    30],
       [ 2153, 14951]], dtype=int64)

In [31]:
# Display the confusion matrix as a labelled DataFrame.
# The target holds many distinct popularity values, so the row/column labels
# are derived from the classes actually present in y_test / y_pred. Fixed
# two-class labels would not match the matrix shape and pd.DataFrame would
# raise a ValueError.
class_labels = np.unique(
    np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_pred).ravel()])
)
# Pass the labels explicitly so the matrix rows/columns are guaranteed to be
# in the same order as the labels used for the DataFrame axes.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,71,30
Actual low_risk,2153,14951


In [18]:
# Print the imbalanced classification report for the current predictions.
# print() is used so the report text is rendered as a formatted table.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.03      0.70      0.87      0.06      0.78      0.60       101
   low_risk       1.00      0.87      0.70      0.93      0.78      0.62     17104

avg / total       0.99      0.87      0.70      0.93      0.78      0.62     17205



In [19]:
# Pair each feature column with the importance score from the fitted forest.
importances = brf.feature_importances_
cols = X.columns

# Collect the pairs in a two-column DataFrame and preview the first rows
# (not yet sorted by importance).
feature_importances_df = pd.DataFrame(dict(feature=cols, importance=importances))
feature_importances_df.head()

Unnamed: 0,feature,importance
0,loan_amnt,0.01058
1,int_rate,0.02967
2,installment,0.019802
3,annual_inc,0.014833
4,dti,0.017471


In [20]:
# Rank the features from most to least important (not in starter code).
# sort_values returns a new frame for display; feature_importances_df itself is left unsorted.
feature_importances_df.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
15,total_rec_prncp,0.078768
13,total_pymnt,0.058838
14,total_pymnt_inv,0.056256
16,total_rec_int,0.053555
20,last_pymnt_amnt,0.050033
...,...,...
19,collection_recovery_fee,0.000000
22,policy_code,0.000000
23,acc_now_delinq,0.000000
46,delinq_amnt,0.000000


### Easy Ensemble AdaBoost Classifier

In [33]:
# Train the EasyEnsembleClassifier on the training split.
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# y_train is a single-column DataFrame; flatten it to the 1-D shape that fit()
# expects rather than relying on an implicit column-vector conversion.
eec.fit(X_train, y_train.values.ravel())

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [34]:
# Calculate the balanced accuracy score of the Easy Ensemble model.
# NOTE: this reassigns y_pred, replacing the predictions stored by the earlier model.
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9316600714093861

In [23]:
# Compute the raw confusion matrix of test labels versus predictions.
# (This cell shows the plain array; it is not wrapped in a DataFrame here.)
confusion_matrix(y_test, y_pred)

array([[   93,     8],
       [  983, 16121]], dtype=int64)

In [35]:
# Display the confusion matrix as a labelled DataFrame.
# The target holds many distinct popularity values, so the row/column labels
# are derived from the classes actually present in y_test / y_pred. Fixed
# two-class labels would not match the matrix shape and pd.DataFrame would
# raise a ValueError.
class_labels = np.unique(
    np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_pred).ravel()])
)
# Pass the labels explicitly so the matrix rows/columns are guaranteed to be
# in the same order as the labels used for the DataFrame axes.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,93,8
Actual low_risk,983,16121


In [24]:
# Print the imbalanced classification report for the current predictions.
# print() is used so the report text is rendered as a formatted table.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.09      0.92      0.94      0.16      0.93      0.87       101
   low_risk       1.00      0.94      0.92      0.97      0.93      0.87     17104

avg / total       0.99      0.94      0.92      0.97      0.93      0.87     17205

