# Assignment 2 - Create a Classifier

* This assignment requires the creation of a classifier to predict the outcome of a bank's marketing campaign
* This will be completed in various steps:
    * Creating a Data Quality report to assess the data
    * Preparing and analysing the data
    * Building and training a classifier to make predictions (classifier to be decided)

## 1) Describe the Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib as mpl
# Widen pandas' text display so wide frames are not wrapped and up to 60
# columns are shown before pandas starts eliding them.
pd.options.display.width = 5000
pd.options.display.max_columns = 60

### Get the Column Names

* Loops through the data description file and parses the column names

In [3]:
# Build the list of column names from the data description file: every line
# whose first character is a digit contributes its third space-separated
# token, stripped of surrounding whitespace and with any ':' removed.
with open('data/dataDescription.txt', 'r') as description_file:
    col_headers = [
        entry.split(' ')[2].strip().replace(':', '')
        for entry in description_file
        if entry[0].isdigit()
    ]

In [4]:
# Load the training set. header=None tells pandas the file has no header row,
# so the columns are labelled 0..n-1 until real names are assigned.
df = pd.read_csv('data/trainingSet.txt', header=None)

In [5]:
# Replace the default integer column labels with the names parsed from the
# data description file (pandas raises if the number of names does not match
# the number of columns).
df.columns = col_headers

In [6]:
# Preview the first five rows to check that the headers line up with the values.
df.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,TR1,44,JobCat9,single,secondary,no,29,yes,no,unknown,5,may,0,1,-1,0,unknown,TypeA
1,TR2,31,JobCat4,married,secondary,no,2,yes,yes,unknown,5,may,0,1,-1,0,unknown,TypeA
2,TR3,42,JobCat4,divorced,tertiary,yes,2,yes,no,unknown,5,may,0,1,-1,0,unknown,TypeA
3,TR4,58,JobCat2,married,primary,no,121,yes,no,unknown,5,may,0,1,-1,0,unknown,TypeA
4,TR5,43,JobCat9,single,secondary,no,593,yes,no,unknown,5,may,0,1,-1,0,unknown,TypeA


In [7]:
# Summary statistics for the numeric columns. In the saved output `duration`
# is 0 in every row (std = 0, min = max = 0).
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,24318.0,24318.0,24318.0,24318.0,24318.0,24318.0,24318.0
mean,39.907723,1347.709968,15.765071,0.0,2.76906,41.085945,0.591126
std,11.438238,2944.383929,8.273208,0.0,3.068752,100.49057,1.976166
min,16.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,31.0,75.0,8.0,0.0,1.0,-1.0,0.0
50%,37.0,451.0,16.0,0.0,2.0,-1.0,0.0
75%,48.0,1420.25,21.0,0.0,3.0,-1.0,0.0
max,95.0,81204.0,31.0,0.0,63.0,842.0,58.0


In [8]:
# Summary of the object (string) columns: count, number of distinct values,
# most frequent value and its frequency. In the saved output `id` is unique
# per row and `y` is TypeA in 21495 of 24318 rows.
df.describe(include = ['O'])

Unnamed: 0,id,job,marital,education,default,housing,loan,contact,month,poutcome,y
count,24318,24318,24318,24318,24318,24318,24318,24318,24318,24318,24318
unique,24318,12,3,4,2,2,2,3,12,4,2
top,TR7258,JobCat3,married,secondary,no,yes,no,cellular,may,unknown,TypeA
freq,1,5197,14639,12516,23871,13528,20350,15691,7448,19762,21495


In [9]:
# Show the dtype pandas inferred for each column; the object columns hold
# string values, the int64 columns are numeric.
df.dtypes

id           object
age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

## 2a)  Convert categorical features to numerical format

In [10]:
# Object-typed columns to one-hot encode (`id` is deliberately not listed).
# NOTE(review): `y` looks like the prediction target; encoding it here yields
# two complementary columns (y_TypeA / y_TypeB) - confirm later cells use only
# one of them as the label and neither as a feature.
cat_features = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
    'y',
]

In [40]:
# One-hot encode the columns named in cat_features. Each listed column is
# replaced by one indicator column per distinct value (named <column>_<value>);
# columns that are not listed are carried over unchanged.
df_with_dummies = pd.get_dummies(data=df, columns=cat_features)

In [41]:
# Preview the encoded frame: the categorical columns now appear as 0/1
# indicator columns alongside the untouched id and numeric columns.
df_with_dummies.head()

Unnamed: 0,id,age,balance,day,duration,campaign,pdays,previous,job_JobCat1,job_JobCat10,job_JobCat11,job_JobCat2,job_JobCat3,job_JobCat4,job_JobCat5,job_JobCat6,job_JobCat7,job_JobCat8,job_JobCat9,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,contact_unknown,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y_TypeA,y_TypeB
0,TR1,44,29,5,0,1,-1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
1,TR2,31,2,5,0,1,-1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
2,TR3,42,2,5,0,1,-1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
3,TR4,58,121,5,0,1,-1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0
4,TR5,43,593,5,0,1,-1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0


In [12]:
# Write the training data, now carrying the parsed column names, to CSV
# without the row index. This saves `df` (original categorical values), not
# the one-hot encoded `df_with_dummies`.
# NOTE(review): confirm that saving the un-encoded frame is what is intended.
df.to_csv('./data/trainingSetLabelled.csv', index=False)