In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer

In [2]:
# Read the three input tables (categorical features, numerical features and
# target) from CSV files given by relative path.
categorical, numerical, target = (
    pd.read_csv(csv_path)
    for csv_path in ('categorical.csv', 'numerical.csv', 'target.csv')
)

In [3]:
# (rows, columns) of the three tables, in load order.
tuple(frame.shape for frame in (categorical, numerical, target))

((95412, 22), (95412, 315), (95412, 2))

Data Cleaning

In [4]:
# Report, per table, whether any column contains at least one missing value.
for label, frame in (
    ('numerical features', numerical),
    ('categorical features', categorical),
    ('target', target),
):
    print(f'NaN values in {label}: ', frame.isna().sum().any())

NaN values in numerical features:  False
NaN values in categorical features:  False
NaN values in target:  False


In [5]:
# Number of columns in the categorical table.
categorical.shape[1]

22

Inspect the categorical features and convert them to numerical where necessary

In [6]:
# Print the distinct values of every categorical column so that columns which
# actually hold numbers can be identified and converted.
for col, values in categorical.items():
    print(col, values.unique())

STATE ['IL' 'CA' 'NC' 'FL' 'other' 'IN' 'MI' 'MO' 'TX' 'WA' 'WI' 'GA']
CLUSTER [36 14 43 44 16 40 39 45 35 53 17 51  2 20 27 12 22 13  8 25 15 42 11 28
 18 24 34  5 31 32 46  3 50  7 37 10 38 21  9 29 30  4 41 49 23 33  1 47
 26 48 19  6 52]
HOMEOWNR ['H' 'U']
GENDER ['F' 'M' 'other']
DATASRCE [3 1 2]
RFA_2R ['L']
RFA_2A ['E' 'G' 'F' 'D']
GEOCODE2 ['C' 'A' 'D' 'B']
DOMAIN_A ['T' 'S' 'R' 'U' 'C']
DOMAIN_B [2 1 3 4]
ODATEW_YR [89 94 90 87 86 88 96 92 93 95 91 97 85 84 83]
ODATEW_MM [ 1  4  2  9 10 11 12  6  8  3  5  7]
DOB_YR [37 52  0 28 20 60 32 23 26 27 54 36 16 43 56 14 48 29 10 13 68 53 46 31
 90 30 58 50 64 18 62 57 44 25 51 65 35 34 38 21 24 11 47 80 39 12 72 22
 40 59 15 17 42 19 61 41 49 33 45 63 81 55 67 69 70 66 76 91 74 71 78  1
 79 73 75  2 84 82 77 87 95 96 88 97 86  8  6  4 92  5 93  7 89  9 85 83]
DOB_MM [12  2  1 11  3  9  7  4 10  8  6  5]
MINRDATE_YR [92 93 91 87 94 95 88 96 89 90 97 86 77 80 75 84 82 78 85 83]
MINRDATE_MM [ 8 10 11  7  5  9  2 12  6  1  4  3]
MAXRDATE

In [7]:
def convert_to_numerical(dataset_old, dataset_new, features):
    """Move columns from one frame to another, converting them to numbers.

    Both frames are modified in place and nothing is returned.

    Args:
        dataset_old: DataFrame that currently holds the columns; each listed
            column is removed from it after conversion.
        dataset_new: DataFrame that receives the converted columns under the
            same names.
        features: iterable of column labels to move.

    Note:
        Conversion uses ``errors='coerce'``, so any value that cannot be
        parsed as a number becomes NaN instead of raising.
    """
    for column in features:
        converted = pd.to_numeric(dataset_old[column], errors='coerce')
        dataset_new[column] = converted
        # Remove the source column only after the converted copy is stored.
        dataset_old.drop(columns=column, inplace=True)

In [8]:
# Every column except the ones listed below is moved from `categorical` to
# `numerical`; the listed columns are left in `categorical`.
features_to_num = categorical.columns[
    ~categorical.columns.isin(
        ['STATE', 'HOMEOWNR', 'GENDER', 'RFA_2R', 'RFA_2A', 'GEOCODE2', 'DOMAIN_A']
    )
]
convert_to_numerical(categorical, numerical, features_to_num)

Preprocessing & Model

In [9]:
# Feature matrix: the numerical columns followed by the remaining categorical
# columns, joined on the row index.
X = numerical.join(categorical)
# Label column the classifier is trained to predict.
y = target['TARGET_B']

In [10]:
# Hold out 30% of the rows for evaluation. The split is stratified on the
# label so that both partitions keep the same class proportions; SMOTE is
# applied to the training split later, which suggests the classes are
# imbalanced, and an unstratified split could leave the test partition with a
# distorted share of minority-class rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [11]:
# set up preprocessing step and fit on train data
def custom_preprocessor():
    """Build the (unfitted) preprocessing step for the feature matrix.

    The column lists are read from the notebook-level ``numerical`` and
    ``categorical`` frames at call time.

    Returns:
        ColumnTransformer: applies ``StandardScaler`` to every column of
        ``numerical`` and a dense ``OneHotEncoder`` to every column of
        ``categorical``.
    """
    return ColumnTransformer([
        ("numericals", StandardScaler(), numerical.columns),
        # handle_unknown='ignore': a category that was not present when the
        # encoder was fitted is encoded as all zeros instead of raising when
        # unseen data (e.g. the test split) is transformed.
        ("categoricals", OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical.columns)
    ])
    
# Build the preprocessor and fit it on the training split only, so nothing is
# learned from the held-out rows.
cp = custom_preprocessor().fit(X_train)



In [12]:
# Apply the fitted preprocessor to the training features.
X_train_scaled = cp.transform(X_train)

# Oversample the transformed training data with SMOTE; the test split is left
# untouched.
# NOTE(review): SMOTE synthesises rows by interpolation, so one-hot columns in
# synthetic rows may hold fractional values - confirm this is acceptable.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [None]:
# Random forest with default hyper-parameters and a fixed seed, trained on the
# resampled (class-balanced) training data.
rfc = RandomForestClassifier(random_state=9)
rfc.fit(X=X_train_resampled, y=y_train_resampled)

Check feature importance

In [None]:
# The forest was trained on the output of `cp`, not on X itself, so the
# importances must be labelled with the transformed column order: the scaled
# numerical columns first, then one indicator column per category of each
# one-hot encoded feature. Zipping with X.columns would attach the raw
# categorical names to the wrong indicator columns and silently drop the
# importances beyond len(X.columns).
# Assumes `numerical` and `categorical` still have the columns they had when
# `cp` was built.
encoder = cp.named_transformers_['categoricals']
feature_names = list(numerical.columns) + [
    f'{column}_{category}'
    for column, categories in zip(categorical.columns, encoder.categories_)
    for category in categories
]
assert len(feature_names) == len(rfc.feature_importances_), 'feature name / importance length mismatch'
pd.DataFrame(rfc.feature_importances_, index=feature_names).sort_values(by=0, ascending=False)

Random Forest Score

In [None]:
# Transform the held-out features with the preprocessor fitted on the training
# split, then report the classifier's mean accuracy on them.
# NOTE(review): accuracy can look high on an imbalanced label even when the
# minority class is rarely predicted - consider a per-class metric as well.
X_test_scaled = cp.transform(X_test)
rfc.score(X_test_scaled, y_test)