In [1]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [2]:
#Importing data
data = pd.read_csv('final_data.csv')

In [3]:
#Seperating into X and Y
X = data.iloc[:, 1: -1]
Y = data.iloc[:, -1]

In [4]:
#finding correlated features
correlated_features = set()
correlation_matrix = X.corr()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

In [5]:
#Dropping features that have a high correlation
X = X.drop(correlated_features,axis=1)

In [6]:
#Spliting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

In [7]:
#Applying Oversampling
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [8]:
#Finding the top 5 features from the dataset
clf=RandomForestClassifier()
rfecv = RFE(clf, 5)
rfecv.fit(X_train_res, y_train_res)

RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=None,
                                     oob_score=False, random_state=None,
                                     verbose=0, warm_start=False),
    n_features_to_select=5, step=1, verbose=0)

In [9]:
print(sorted(zip(map(lambda x: round(x, 4), rfecv.ranking_), X)))

[(1, 'credit_card_balance'), (1, 'credit_card_usage'), (1, 'debt_to_income'), (1, 'interest_rate'), (1, 'loan_amount'), (2, 'annual_income'), (3, 'credit_score'), (4, 'term_ 36 months'), (5, 'employment_length'), (6, 'nr_accounts'), (7, 'home_ownership_MORTGAGE'), (8, 'inq_last_6mths_cat_no'), (9, 'open_accounts'), (10, 'purpose_credit_card'), (11, 'purpose_debt_consolidation'), (12, 'year'), (13, 'issue_date_Oct'), (14, 'issue_date_Jul'), (15, 'issue_date_Aug'), (16, 'issue_date_Sep'), (17, 'issue_date_Nov'), (18, 'issue_date_Jan'), (19, 'issue_date_Jun'), (20, 'issue_date_May'), (21, 'issue_date_Apr'), (22, 'issue_date_Mar'), (23, 'issue_date_Dec'), (24, 'issue_date_Feb'), (25, 'delinq_2yrs_cat_no'), (26, 'home_ownership_OWN'), (27, 'purpose_other'), (28, 'purpose_home_improvement'), (29, 'pub_rec_cat_no'), (30, 'purpose_major_purchase'), (31, 'purpose_small_business'), (32, 'purpose_car'), (33, 'purpose_medical'), (34, 'purpose_wedding'), (35, 'purpose_moving'), (36, 'purpose_house'