In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('data.csv', index_col='index')
df.head()

Unnamed: 0_level_0,BuildingRatingDesc,BuildingClassDesc,StreetAddress,Quadrant,City,State,PostCode,Latitude,Longitude,MinRent,...,Rooms-Basement,Rooms-Double Vanities,Rooms-Satellite TV,Rooms-Linen Closet,Building-Trash Pickup - Door to Door,Building-Pet Care,Building-Renters Insurance Program,Rooms-Yard,Building-Fitness Programs,Note
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2 Star,C,26 Lee Ave,,Takoma Park,MD,209124543,38.979855,-77.006718,635.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,3 Star,C,214 Joshua Dr,,Martinsburg,WV,254044200,39.485398,-77.9507,496.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,3 Star,B,11100 Church St,,Fairfax,VA,22030,38.859164,-77.330435,2255.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,3 Star,C,3401 Pearl Dr,,Suitland,MD,207462127,38.841394,-76.926742,1186.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"???, a community of apartments in Suitland, MD..."
4,3 Star,B,3801 Connecticut Ave NW,NW,Washington,DC,20008,38.939361,-77.060139,1000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,??? offers incredibly spacious studio and one ...


In [2]:
count_row = df.shape[0]  # gives number of row count
count_col = df.shape[1]  # gives number of col count
print(count_row)
print(count_col)

2014
133


In [3]:
rating_to_number = {
    'BuildingRatingDesc': {f'{i} Star': i for i in range(10)},
    'BuildingClassDesc': {'A': 3, 'B': 2, 'C': 1, 'F': 0},
}
df = df.replace(rating_to_number)

def div_cols(df: pd.DataFrame) -> dict:
    df_division = {'number_disc': [], 'number_cont': [], 'string_disc': [], 'string_cont': [], 'binary': []}
    for i in df.columns:
        unique_count = df[i].nunique()
        # Numeric data
        if df[i].dtype=='int64' or df[i].dtype=='float':
            if unique_count <= 2:
                df_division['binary'].append(i)
            elif unique_count <= 10:
                df_division['number_disc'].append(i)
            else:
                df_division['number_cont'].append(i)
        # String data
        elif unique_count <= 10:
            df_division['string_disc'].append(i)
        else:
            df_division['string_cont'].append(i)
    return df_division

dd = div_cols(df)

dd = {k:sorted(v) for k,v in dd.items()}

dd

for f in dd['binary']:
    df[f] = df[f].fillna(value=-1).astype('Int64')

for f in dd['number_disc']:
    df[f] = df[f].fillna(value=-1).astype('Int64')

for f in dd['number_cont']:
    df[f] = df[f].fillna(value=-1)

In [4]:
lasso_features = set([f for f in df.columns if df[f].dtype == 'int64' or df[f].dtype == 'float64'])
lasso_features = list(lasso_features - set(['MaxRent', 'MinRent']))
lasso_features

['AYB',
 'UnitMinSqFt',
 'Longitude',
 'NUMUNITS',
 'UnitMaxSqFt',
 'YR_RMDL',
 'Latitude',
 'GBA',
 'EYB',
 'LotSqFtTotal',
 'STORIES']

In [5]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2)
imputed_train_lasso = df_train[lasso_features]
imputed_test_lasso = df_test[lasso_features]

In [6]:
from fancyimpute import IterativeImputer
def impute(train, method='iterative'):
    #fancy impute removes column names. 
    train_cols = list(train)
    # Use MICE to fill in each row's missing features
    if method == 'iterative':
        train = pd.DataFrame(IterativeImputer(verbose=False).fit_transform(train))
    elif method == 'mean':
        train = train.fillna(train.mean())
    train.columns = train_cols
    return train

Using TensorFlow backend.


In [7]:
from sklearn import ensemble
from sklearn import linear_model
from sklearn import neural_network
def find_regressors(module):
    return {name: getattr(module, name) for name in dir(module) if name.endswith('Regressor')}
rs = find_regressors(ensemble)  
rs

{'AdaBoostRegressor': sklearn.ensemble.weight_boosting.AdaBoostRegressor,
 'BaggingRegressor': sklearn.ensemble.bagging.BaggingRegressor,
 'ExtraTreesRegressor': sklearn.ensemble.forest.ExtraTreesRegressor,
 'GradientBoostingRegressor': sklearn.ensemble.gradient_boosting.GradientBoostingRegressor,
 'RandomForestRegressor': sklearn.ensemble.forest.RandomForestRegressor}

In [8]:
def test_regressor_max(r):
    X_train = imputed_train_lasso
    Y_train = df_train.MaxRent.values
    
    r.fit(X_train, Y_train)
    
    pY_train = r.predict(X_train)
    r_in = np.corrcoef(Y_train, pY_train)[0][1]
    
    
    X_test = imputed_test_lasso
    Y_test = df_test.MaxRent.values
    
    pY_test = r.predict(X_test)
    r_out = np.corrcoef(Y_test, pY_test)[0][1]
    
    return {
        'corrs': {'r_in': r_in, 'r_out': r_out},
        'pYs': {'pY_train': pY_train, 'pY_test': pY_test}
    }

results_max = {}
corrs_max={}
for name, regressor in rs.items():
    try:
        results_max[name] = test_regressor_max(regressor())
        corrs_max[name] = results_max[name]['corrs']
    except:
        results_max[name] = 'error'
        corrs_max[name] = 'error'



In [9]:
pd.DataFrame(corrs_max).T.sort_values('r_out')

Unnamed: 0,r_in,r_out
AdaBoostRegressor,0.779346,0.636221
BaggingRegressor,0.957686,0.683433
RandomForestRegressor,0.9634,0.686328
ExtraTreesRegressor,1.0,0.694909
GradientBoostingRegressor,0.900417,0.713643


In [10]:
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
init_notebook_mode(connected=True)

for name, regressor in rs.items():
    X_train = imputed_train_lasso
    Y_train = df_train.MaxRent.values
    r=regressor()
    r.fit(X_train, Y_train)

    X_test = imputed_test_lasso

    x = df_test['Latitude']
    y = df_test['Longitude']
    z = df_test.MaxRent.values
    real_Y = go.Scatter3d(
        x=x,
        y=y,
        z=z,
        name = 'Real',
        mode='markers',
        marker=dict(
            size=12,
            line=dict(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5
            ),
            opacity=0.8
        )
    )

    x2 = df_test['Latitude']
    y2 = df_test['Longitude']
    z2 = r.predict(X_test)
    predicted_Y = go.Scatter3d(
        x=x2,
        y=y2,
        z=z2,
        name = 'Predicted',
        mode='markers',
        marker=dict(
            color='rgb(127, 127, 127)',
            size=12,
            symbol='circle',
            line=dict(
                color='rgb(204, 204, 204)',
                width=1
            ),
            opacity=0.9
        )

    )
    data = [real_Y, predicted_Y]
    layout = go.Layout(
        title= "Scatter Plot Using " + name,
        scene = dict(
                        xaxis = dict(
                            title='Latitude'),
                        yaxis = dict(
                            title='Longitude'),
                        zaxis = dict(
                            title='Maximum Rent'),),
        margin=dict(
            l=100,
            r=5,
            b=5,
            t=100
        )
    )
    
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, filename = "Scatter Plot Using " + name)


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.




The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



In [11]:
def test_regressor_min(r):
    X_train = imputed_train_lasso
    Y_train = df_train.MinRent.values
    
    r.fit(X_train, Y_train)
    
    pY_train = r.predict(X_train)
    r_in = np.corrcoef(Y_train, pY_train)[0][1]
    
    
    X_test = imputed_test_lasso
    Y_test = df_test.MinRent.values
    
    pY_test = r.predict(X_test)
    r_out = np.corrcoef(Y_test, pY_test)[0][1]
    
    return {
        'corrs': {'r_in': r_in, 'r_out': r_out},
        'pYs': {'pY_train': pY_train, 'pY_test': pY_test}
    }

results_min = {}
corrs_min={}
for name, regressor in rs.items():
    results_min[name] = test_regressor_min(regressor())
    corrs_min[name] = results_min[name]['corrs']


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



In [12]:
pd.DataFrame(corrs_min).T.sort_values('r_out')

Unnamed: 0,r_in,r_out
AdaBoostRegressor,0.694912,0.591083
BaggingRegressor,0.958864,0.684192
ExtraTreesRegressor,1.0,0.686743
RandomForestRegressor,0.958329,0.698475
GradientBoostingRegressor,0.847173,0.710996


In [13]:
for name, regressor in rs.items():
    X_train = imputed_train_lasso
    Y_train = df_train.MinRent.values
    r=regressor()
    r.fit(X_train, Y_train)

    X_test = imputed_test_lasso

    x = df_test['Latitude']
    y = df_test['Longitude']
    z = df_test.MinRent.values
    real_Y = go.Scatter3d(
        x=x,
        y=y,
        z=z,
        name = 'Real',
        mode='markers',
        marker=dict(
            size=12,
            line=dict(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5
            ),
            opacity=0.8
        )
    )

    x2 = df_test['Latitude']
    y2 = df_test['Longitude']
    z2 = r.predict(X_test)
    predicted_Y = go.Scatter3d(
        x=x2,
        y=y2,
        z=z2,
        name = 'Predicted',
        mode='markers',
        marker=dict(
            color='rgb(127, 127, 127)',
            size=12,
            symbol='circle',
            line=dict(
                color='rgb(204, 204, 204)',
                width=1
            ),
            opacity=0.9
        )

    )
    data = [real_Y, predicted_Y]
    layout = go.Layout(
        title= "Scatter Plot Using " + name,
        scene = dict(
                        xaxis = dict(
                            title='Latitude'),
                        yaxis = dict(
                            title='Longitude'),
                        zaxis = dict(
                            title='Minimum Rent'),),
        margin=dict(
            l=100,
            r=5,
            b=5,
            t=100
        )
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, filename = "Scatter Plot Using " + name)


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.




The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.

