In [1]:
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split # to divide train and test set

# feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import chi2

In [2]:
cd Dropbox/Portfolio/DataScience-Portfolio/KDD-1998

/Users/Capgemini/Dropbox/Portfolio/DataScience-Portfolio/KDD-1998


In [3]:
# load data
kdd = pd.read_csv('data_reg2.csv')

# generate X and Y for preditions
Y = np.ravel(kdd.TARGET_D)  # to flatten array
X = kdd.drop('TARGET_D', axis = 1)

In [4]:
kdd.shape

(4829, 1914)

### Identify highly correlated features

In [5]:
# correlation computation
def vcorrcoef(X,y):
    Xm = np.mean(X)
    ym = np.mean(y)
    r_num = np.sum((X-Xm)*(y-ym))
    r_den = np.sqrt(np.sum((X-Xm)**2)*np.sum((y-ym)**2))
    r = r_num/r_den
    return r

In [6]:
# try the function
vcorrcoef(kdd['NUMPROM'], kdd['NGIFTALL'])

0.78212633281455313

In [7]:
# identify highly correlated features (>0.7)
correlated_feat = []

for i in range(len(kdd.columns)-1):
    feat = kdd.columns[i]
    for j in range(i+1,len(kdd.columns)):
        feat2 = kdd.columns[j]
        ccf = vcorrcoef(kdd[feat], kdd[feat2])
        if ccf >=0.7 or ccf <= -0.7:
            correlated_feat.append(feat2)

In [8]:
len(correlated_feat)

1193

### Feature engineering

In [9]:
# identify the binary columns
binary_cols = []
for col in X.columns:
    if len(X[col].unique()) == 2:
        if np.sum(X[col].unique() == np.array([0,1])) == 2:
            binary_cols.append(col)
binary_cols[0:5]

['MAILCODE', 'PEPSTRFL', 'maxadate', 'NOEXCH_0', 'NOEXCH_1']

In [10]:
# Identify non binary cols
nonBinary = []
for col in X.columns:
    if col in binary_cols:
        pass
    else:
        nonBinary.append(col)

In [11]:
len(nonBinary), len(binary_cols)

(571, 1342)

## For each non binary column:

1) Generate 4 binary variables segregated by quartiles.

2) Generate the log transformation.

3) Select the best of the 6 against the target variable, and drop the rest.

4) Repeat number 3 for log(target_variable)

In [12]:
# categorisation by quantiles + add log transformation
def feat_eng(col):
    a = False
    try:
        temp = pd.qcut(kdd[col], [0, .25, .5, .75, 1], labels = ['FirstQ','SecondQ','ThirdQ','FourthQ'])
        temp_dummies = pd.get_dummies(temp)
        a = True
    except:
        pass
    
    if a:
        temp_dummies['log'] = np.log(kdd[col]+1)
        temp_dummies['asis'] = kdd[col]
    else:
        temp_dummies = pd.DataFrame(index=kdd.index, columns= ['asis', 'log'])
        temp_dummies['log'] = np.log(kdd[col]+1)
        temp_dummies['asis'] = kdd[col]
        
    return temp_dummies

In [13]:
# try function
var = feat_eng('AGE')
var.head(3)

Unnamed: 0,FirstQ,SecondQ,ThirdQ,FourthQ,log,asis
0,0,1,0,0,4.143135,62
1,0,1,0,0,4.143135,62
2,0,0,1,0,4.204693,66


In [14]:
# try function when not possible to divide in quantiles
var = feat_eng('MDMAUD_I5CM')
var.head(3)

Unnamed: 0,asis,log
0,0,0
1,0,0
2,0,0


In [15]:
# feature selection function
def check_features(Y,features):
    X_train, X_test, Y_train, Y_test = train_test_split(features, Y, test_size=0.5, random_state=42)
    selector = SelectKBest(f_regression, k = 1)
    selector.fit(X_train, Y_train)
    
    pval = pd.Series(selector.pvalues_)
    ind = pval.sort_values().head(1).index
    best_feat = features.columns[ind]
    
    return best_feat

In [16]:
check_features(Y,var)

Index(['asis'], dtype='object')

In [17]:
# select the best feature from the pool of engineered ones

# make empty df
df = pd.DataFrame(index=kdd.index, columns=['nonBinary'])

for col in nonBinary:
    var = feat_eng(col)
    temp = check_features(Y,var)
    col_name = col + '_' + str(temp)[8:11]
    df[col_name] = var[temp]

In [18]:
df.drop('nonBinary', axis = 1, inplace = True)
df.head()

Unnamed: 0,AGE_Fir,INCOME_asi,HIT_asi,MALEMILI_asi,MALEVET_Fir,VIETVETS_Fou,WWIIVETS_log,LOCALGOV_log,STATEGOV_Thi,FEDGOV_Fir,...,RFA_22_S4A_asi,RFA_22_U1G_asi,RFA_2A_D_asi,MDMAUD_R_X_asi,MDMAUD_F_5_asi,MDMAUD_F_X_asi,MDMAUD_A_M_asi,MDMAUD_A_T_asi,MDMAUD_A_X_asi,GEOCODE2_A_asi
0,0,3,10,2,1,1,3.332205,2.484907,1,1,...,0,0,1,1,0,1,0,0,1,1
1,0,4,0,1,0,1,2.833213,2.197225,0,0,...,0,0,1,1,0,1,0,0,1,1
2,0,5,5,0,0,0,3.688879,1.94591,1,1,...,0,0,1,1,0,1,0,0,1,0
3,0,6,0,0,0,0,4.007333,1.098612,0,1,...,0,0,0,1,0,1,0,0,1,1
4,0,1,10,0,1,1,2.197225,1.791759,1,0,...,0,0,0,1,0,1,0,0,1,1


In [19]:
# similar above but use log of target variable
df_log = pd.DataFrame(index=kdd.index, columns=['nonBinary'])
for col in nonBinary:
    var = feat_eng(col)
    temp = check_features(np.log(Y),var)
    col_name = col + '_' + str(temp)[8:11]
    df_log[col_name] = var[temp]

In [20]:
df_log.drop('nonBinary', axis = 1, inplace = True)
df_log.shape

(4829, 571)

In [21]:
new_kdd = pd.concat([df, kdd[binary_cols], pd.DataFrame(Y)], axis = 1)
new_kdd_log = pd.concat([df_log, kdd[binary_cols], pd.DataFrame(np.log(Y))], axis = 1)

new_kdd.to_csv('kdd_reg_fe.csv', header = True, index = False)
new_kdd_log.to_csv('kdd_reg_fe_log.csv', header = True, index = False)

### Repeat the same but removing highly correlated features

In [22]:
kdd.drop(correlated_feat, axis = 1, inplace = True)

In [23]:
kdd.shape

(4829, 1245)

In [24]:
# generate X and Y for preditions
Y = np.ravel(kdd.TARGET_D)  # to flatten array
X = kdd.drop('TARGET_D', axis = 1)

In [25]:
# identify the binary columns
binary_cols = []
for col in X.columns:
    if len(X[col].unique()) == 2:
        if np.sum(X[col].unique() == np.array([0,1])) == 2:
            binary_cols.append(col)
binary_cols[0:5]

['MAILCODE', 'PEPSTRFL', 'maxadate', 'NOEXCH_0', 'NOEXCH_1']

In [26]:
# Identify non binary cols
nonBinary = []
for col in X.columns:
    if col in binary_cols:
        pass
    else:
        nonBinary.append(col)

In [27]:
# select the best feature from the engineered ones
df = pd.DataFrame(index=kdd.index, columns=['nonBinary'])
for col in nonBinary:
    var = feat_eng(col)
    temp = check_features(Y,var)
    col_name = col + '_' + str(temp)[8:11]
    df[col_name] = var[temp]

In [28]:
df.drop('nonBinary', axis = 1, inplace = True)

In [29]:
# similar above but use log of target variable
df_log = pd.DataFrame(index=kdd.index, columns=['nonBinary'])
for col in nonBinary:
    var = feat_eng(col)
    temp = check_features(np.log(Y),var)
    col_name = col + '_' + str(temp)[8:11]
    df_log[col_name] = var[temp]

In [30]:
df_log.drop('nonBinary', axis = 1, inplace = True)

In [31]:
new_kdd = pd.concat([df, kdd[binary_cols], pd.DataFrame(Y)], axis = 1)
new_kdd_log = pd.concat([df_log, kdd[binary_cols], pd.DataFrame(np.log(Y))], axis = 1)

new_kdd.to_csv('kdd_reg_fe_small.csv', header = True, index = False)
new_kdd_log.to_csv('kdd_reg_fe_log_small.csv', header = True, index = False)