In [3]:
#import dependencies.
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, LabelEncoder , StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import matthews_corrcoef
from sklearn.cross_validation import train_test_split
from imblearn.over_sampling import SMOTE

In [4]:
train = pd.read_csv('./data/train.csv',sep=";")

## Data Preprocessing

In [5]:
#this removes a column for the train data, ot match the test shape. 
cols_to_test_data = train.columns[:-1]

In [6]:
test = pd.read_csv('./data/test.csv',sep=";",header=None)

In [7]:
test.columns = cols_to_test_data

In [8]:
train.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [9]:
train.shape, test.shape

((2999, 21), (1120, 20))

In [10]:
#Instaniating the label enconder to convert categorical data
le = LabelEncoder()

In [11]:
#creating a list of all columns that are objects, to create numerical values. 
object_columns = list(train.dtypes[train.dtypes == 'object'].index)

In [12]:
#changing all object columns to a numerical value to measure. 
for col in object_columns:
    exec(('train[{!r}] = le.fit_transform(train[{!r}])').format(col,col))

In [13]:
#changed to 0 since they were never contacted
train['pdays']= train['pdays'].replace(999,0)

# Preparing The Data for Modeling.

In [14]:
#creating variables for x and y. 
X = train.drop(columns='y')
y = train['y']

In [15]:
#gathering all categorical columns
categorical_cols = X[['job','marital','education','default','housing','loan','contact','month','day_of_week','previous',
      'poutcome']]

#### Creating dummies for the X and Y variables. (a.k.a OneHotEncoding)
We do this for the X variables that are categorical, because in the job column `'blue_collar'` is not 1 > `'services'`. <br />
This is done as well for Y because we want them to be a 1 if exists, and a 0 if doesn't. <br />
We can reduce this down to 1 column because 1 column can tell us if it  is present or not.

In [16]:
X_dums = pd.get_dummies(X,columns=list(categorical_cols.columns),drop_first=True)

y_dums = pd.get_dummies(y,drop_first=True)

A class imbalance exists as we can see below. In order to combat this, we are going to use `SMOTE` from `imblearn.over_sampling`. <br/>
SMOTE stands for Sampling Minority Oversampling Technique. Ed

In [22]:
#Percentages of each value
y_dums[1].value_counts()/ len(y_dums[1])

0    0.88963
1    0.11037
Name: 1, dtype: float64

In [24]:
#import SMOTE to oversample the loan offers since there are so few. 
sm = SMOTE(random_state=42)

In [28]:
#fitting the data to the new oversampled minority amounts.  
X_res, y_res = sm.fit_sample(X_dums, y_dums[1])

In [49]:
#Taking the total of 1's and checking to see the amount over the new sampled y's.
y_res.sum()/len(y_res)

0.5

In [29]:
#scaling the data to be able to compare them on the same scale values. 
ss = StandardScaler()
ss_fit = ss.fit_transform(X_res)

In [30]:
#creating a test train split to test the data.
X_train, X_test, y_train, y_test = train_test_split(ss_fit, y_res, test_size=0.33,
                                                    random_state=42,
                                                    stratify=y_res) #preserves class balance between source and split set)

#### Setting up the model and reducing the features

In [31]:
#Creating the extra tress classifier. Set a min_samle leaf of 3, this helps to prevent against overfitting on the train
clf = ExtraTreesClassifier(n_estimators=10,min_samples_leaf=3)

In [32]:
#using recurisve feature extraction to reduce the amount of features used. 
rfe = RFECV(clf,n_jobs=5)

In [34]:
#fitting the data to the model.
rfe.fit(X_train,y_train)

RFECV(cv=None,
   estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
   n_jobs=5, scoring=None, step=1, verbose=0)

In [35]:
#using mse to score the data for predictions. 
rfe.score(X_test,y_test)

0.9500283929585462

In [36]:
y_hat = rfe.predict(X_test)

We are using the **Matthews correlation coefficient (MCC)**  because it helps us to measure our classification coeffiencts more accurately. When data has an imbalance of 0's vs 1's it causes the interpretation to be too skewed. For example, If you were in a desert, predicting that it's not going to rain is easy because you are more likely to be right due to the imbalance. However if you're in Texas, predicting if it's going to rain or not because more difficult. Therefore, this coeffeienct will tell us, based on the data that we have, how likely are we able to predict how likley someone is to take out a loan, based on our marketing campaigns.

In [37]:
matthews_corrcoef(y_test,y_hat)

0.9002064197862143

### Speeding up the process
Since we hand done the work before, I took what we had done previously and created a function to apply data processing that we had done to the train data. <br/>
A validation set, known as test data in this case, will be used to predict and see how we scored against the acutal results.

In [38]:
def clean_data(data):
    
    #changing all object columns to a numerical value to measure. 
    object_columns = list(data.dtypes[data.dtypes == 'object'].index)
    
    for col in object_columns:
        exec(('data[{!r}] = le.fit_transform(data[{!r}])').format(col,col))

    #changed to 0 since they were never contacted
    data['pdays']= data['pdays'].replace(999,0)

    categorical_cols = data[['job','marital','education','default','housing','loan','contact','month','day_of_week','previous',
          'poutcome']]

    X_dums = pd.get_dummies(data,columns=list(categorical_cols.columns),drop_first=True)

    ss_fit = ss.fit_transform(X_dums)
    return ss_fit

In [39]:
#using our new function to clean the data and label our cleaned and standard scaled test data. 
test_data = clean_data(test)

### Preparing the Data for submission

In [40]:
#creating the prediction using our RFE-ExtraTreeClassifer.
predictions = rfe.predict(test_data)

#creates a dataframe to view results and export it to a CSV
preds = pd.DataFrame(predictions)

#creating a range specififed by Kaggle
preds['Id'] = range(1,1121)

#changing the column names to match the final submission
preds.columns = ['prediction','Id']

#Putting the data in the proper order. 
preds = preds[['Id','prediction']]

# Exporting the file to CSV for Submssion

In [41]:
#exporting the data
preds.to_csv('./data/submission.csv',index=False)