In [47]:
# Render matplotlib figures inline in the notebook.
# NOTE(review): %pylab also populates the interactive namespace from numpy and
# matplotlib (see the message printed below), which hides where names come from.
# Consider `%matplotlib inline` plus explicit imports once it is confirmed that
# no other cell relies on the injected names.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [48]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
import statsmodels.api as sm

In [49]:
# Read the prepared Madrid rental dataset from Dataiku as a pandas DataFrame in memory.
# `limit=100000` presumably caps the number of rows loaded -- TODO confirm against the
# dataiku `get_dataframe` documentation and the real dataset size.
# NOTE(review): the handle is called `dataset_wage` although it points at a rental
# dataset; it looks like a leftover name -- confirm no other cell uses it before renaming.
dataset_wage = dataiku.Dataset("Madrid_Rental_Dataset_ALLDATA_prepared2")
df = dataset_wage.get_dataframe(limit=100000)

In [50]:
# Audit df column by column with dataiku's pandasutils; the recorded output lists,
# for each variable, its data type, cardinality, missing-value count and sample values.
pdu.audit(df)

Unnamed: 0,_a_variable,_b_data_type,_c_cardinality,_d_missings,_e_sample_values
0,Type Level,int64,6,0,"[2, 4]"
1,Pricing Level,int64,3,0,"[2, 1]"
2,Rent,int64,253,0,"[1300, 3000]"
3,Bedrooms,int64,9,0,"[2, 5]"
4,Sq.Mt,int64,279,0,"[72, 260]"
5,Floor,float64,28,0,"[3.0, 2.0]"
6,Outer,int64,2,0,"[1, 0]"
7,Elevator,int64,2,0,"[1, 0]"
8,Penthouse,int64,2,0,"[0, 1]"
9,Cottage,int64,2,0,"[0, 1]"


In [51]:
# Stepwise function
def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.05,
                       threshold_out=0.1,
                       verbose=True):
    """Perform forward-backward (stepwise) feature selection on OLS p-values.

    Each iteration runs one forward step and one backward step:

    * forward  - every feature not yet selected is added in turn to the current
      selection, an OLS model (with intercept) is fitted, and the candidate
      with the smallest p-value is included if that p-value < threshold_in;
    * backward - an OLS model is fitted on the current selection and the
      feature with the largest p-value is dropped if that p-value > threshold_out.

    The loop stops as soon as an iteration neither adds nor drops a feature.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - features to start with (column names of X); None
            (the default) or an empty list starts from no features. The
            argument itself is never modified.
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features, in the order they were included.

    Always set threshold_in < threshold_out to avoid infinite looping
    (otherwise a feature can be added and dropped again forever).
    """
    # Copy so the caller's list (or a shared default) is never mutated.
    included = [] if initial_list is None else list(initial_list)
    while True:
        changed = False

        # forward step
        # Walk the candidates in column order (not via a set) so that ties on
        # the p-value are broken the same way on every run.
        excluded = [column for column in X.columns if column not in included]
        if excluded:
            # Explicit float dtype: an empty Series would otherwise be object-typed.
            new_pval = pd.Series(index=excluded, dtype=float)
            for new_column in excluded:
                model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
                new_pval.loc[new_column] = model.pvalues.loc[new_column]
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                # idxmin() returns the *label*; Series.argmin() is positional
                # in current pandas and only used to return a label by accident.
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                changed = True
                if verbose:
                    print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        # Nothing can be dropped from an empty selection, so skip the fit.
        if included:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            # Pick the selected features by label rather than slicing off the
            # first row: this keeps the intercept out without assuming it was
            # actually added, or added in first position.
            pvalues = model.pvalues.loc[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                changed = True
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break
    return included

In [52]:
# Build the candidate feature matrix X: every column of df except the target "Rent"
# (no columns are removed for missing values here; only the target is dropped)
selected_fields=df.drop(labels=["Rent"],axis=1)
# Target vector y: the "Rent" column
y = df['Rent']

# Run the stepwise selection with its default arguments; the steps taken are printed
result = stepwise_selection(selected_fields, y)

will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


Add  Sq.Mt                          with p-value 0.0
Add  Pricing Level                  with p-value 7.52899e-75
Add  Cottage                        with p-value 2.43764e-23
Add  Floor                          with p-value 1.00896e-08
Add  Type Level                     with p-value 0.00016554
Add  Bedrooms                       with p-value 0.00341287
Add  Outer                          with p-value 0.00902218
Add  Duplex                         with p-value 0.0153292


In [53]:
# Print the list of feature names returned by the stepwise selection
print('resulting features:')
print(result)

resulting features:
[u'Sq.Mt', u'Pricing Level', u'Cottage', u'Floor', u'Type Level', u'Bedrooms', u'Outer', u'Duplex']
