In [1]:
import pandas as pd
import numpy as np
import numpy
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.shape

(1460, 81)

In [4]:
print ("Train data shape:", train.shape)
print ("Test data shape:", test.shape)

Train data shape: (1460, 81)
Test data shape: (1459, 80)


In [5]:
def get_na_columns(data):
    total_na = data.isnull().sum(axis = 0)
    ser = total_na[total_na.values != 0]
    df = pd.DataFrame({'col_name':ser.index, '*# na values*':ser.values})
    return df

In [6]:
def random_impute(data, col_name):
    ind_na = data[pd.isnull(data[col_name]) == True].index
    new_vals = data[col_name].dropna().sample(n=len(ind_na), 
                                              replace=True, 
                                              random_state=0).values[0:len(ind_na)]
    data.loc[ind_na,col_name] = new_vals

In [7]:
def mean_impute(data, col_name):
    ind_na = data[pd.isnull(data[col_name]) == True].index
    new_vals = [data[col_name].dropna().mean()] * ind_na.size
    data.loc[ind_na,col_name] = new_vals

In [8]:
def clean_data(data):
    total_na = data.isnull().sum(axis = 0)
    list_of_na_cols = total_na[total_na.values != 0].index
    
    # Obtaining lists of numerical and categorical columns
    numerical_cols = set(data._get_numeric_data().columns)
    categorical_cols = set(data.columns) - numerical_cols
    
    # imputing categorical na columns
    for col_name in list_of_na_cols:
        if (col_name in list_of_na_cols) & (col_name in categorical_cols):
            random_impute(data, col_name)
            
    # imputing numerical na columns
    for col_name in list_of_na_cols:
        if (col_name in list_of_na_cols) & (col_name in numerical_cols):
            mean_impute(data, col_name)

In [9]:
y = train.SalePrice.values
#y = np.log(y)

In [10]:
train.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'Id', 'SalePrice'], axis = 1, inplace=True)
test.drop(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'Id'], axis = 1, inplace=True)

In [11]:
clean_data(train)
clean_data(test)

In [12]:
print(get_na_columns(train))
print(get_na_columns(test))

Empty DataFrame
Columns: [col_name, *# na values*]
Index: []
Empty DataFrame
Columns: [col_name, *# na values*]
Index: []


In [13]:
#train.head()

In [14]:
numerical_cols = set(train._get_numeric_data().columns)
categorical_cols = set(train.columns) - numerical_cols

In [15]:
print(train.shape)
print(test.shape)

(1460, 74)
(1459, 74)


In [16]:
print(train['Utilities'].unique())
print(test['Utilities'].unique())

['AllPub' 'NoSeWa']
['AllPub']


In [17]:
train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


In [18]:
#q1 = pd.get_dummies(train['Utilities'])
#q2 = pd.get_dummies(test['Utilities'])
#for w in q1:
#    if w in q2:
        #print(q1[w])

In [19]:
# i is used as same values can be in different 
# categorical columns, but dataframe cannot contain 
# same column names
def one_hot_enc(train_data, test_data, col_name, i):
    one_hot = pd.get_dummies(data[col_name])
    one_hot.rename(columns=lambda x: x + str(i), inplace=True)
    data.drop(col_name, axis = 1, inplace = True)
    data = data.join(one_hot)
    return data

In [20]:
# i is used as same values can be in different 
# categorical columns, but dataframe cannot contain 
# same column names
def one_hot_enc2(train_data, test_data, col_name, i):
    one_hot_train = pd.get_dummies(train_data[col_name])
    one_hot_test  = pd.get_dummies(test_data[col_name])
 
    one_hot_train.rename(columns=lambda x: x + str(i), inplace=True)
    train_data.drop(col_name, axis = 1, inplace = True)
    
    one_hot_test.rename(columns=lambda x: x + str(i), inplace=True)
    test_data.drop(col_name, axis = 1, inplace = True)
    
    common_cols = []
    for col in one_hot_train:
        if col in one_hot_test:
            common_cols.append(col)
            
    for col in common_cols:
        train_data = train_data.join(one_hot_train[col])
        test_data = test_data.join(one_hot_test[col])
    
    return train_data, test_data

In [21]:
i = 1
for col_name in categorical_cols:
    train, test = one_hot_enc2(train, test, col_name, i)
    i += 1

In [22]:
train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,New37,Oth37,WD37,1.5Fin38,1.5Unf38,1Story38,2.5Unf38,2Story38,SFoyer38,SLvl38
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,1,0,0,0,0,1,0,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,1,0,0,1,0,0,0,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,1,0,0,0,0,1,0,0
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,1,0,0,0,0,1,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,1,0,0,0,0,1,0,0


In [23]:
#y = train.SalePrice.values
#y = np.log(y)
#x = train.drop(['SalePrice', 'Id'], axis=1).values
x = train.values
#x_test = test.drop(['Id'], axis=1).values
x_test = test.values

In [24]:
import featuretools as ft

In [25]:
combi = train.append(test, ignore_index=True)

In [26]:
combi['id'] = combi.index.values

In [27]:
combi.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,Oth37,WD37,1.5Fin38,1.5Unf38,1Story38,2.5Unf38,2Story38,SFoyer38,SLvl38,id
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,1,0,0,0,0,1,0,0,0
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,0,1,0,0,1,0,0,0,0,1
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,1,0,0,0,0,1,0,0,2
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,1,0,0,0,0,1,0,0,3
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,1,0,0,0,0,1,0,0,4


In [28]:
es = ft.EntitySet(id = 'data')

In [29]:
es.entity_from_dataframe(entity_id = 'ent1', dataframe = combi, index = 'id')

Entityset: data
  Entities:
    ent1 [Rows: 2919, Columns: 255]
  Relationships:
    No relationships

In [30]:
#ft.list_primitives()[ft.list_primitives()['type']=='transform']
#['percentile','cum_min','cum_sum','cum_max','multiply','divide','cum_mean','mod','diff','cum_count','add','subtract']

In [31]:
def get_del_cols(df, treshold = 3):
    cols_to_del = []
    for i in range(df.shape[1]):
        if sum(df.iloc[:,i].values == 0) <= treshold:
            cols_to_del.append(i)
    return cols_to_del

In [32]:
q = get_del_cols(combi, 10)

In [33]:
sum(combi.iloc[:,1].values < 0.01)

0

In [34]:
%%time
feature_matrix, feature_names = ft.dfs(entityset=es, 
target_entity = 'ent1', 
trans_primitives = ['multiply'],#,'divide','add','subtract'],
max_depth = 2, 
verbose = 1, 
n_jobs = 3)

Built 32385 features
EntitySet scattered to workers in 2.825 seconds
Elapsed: 25:37 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks 


Exception ignored in: <generator object add_client at 0x0000018D8BFC6A98>
RuntimeError: generator ignored GeneratorExit


Wall time: 26min 22s


In [64]:
%%time
feature_matrix4, feature_names4 = ft.dfs(entityset=es, 
target_entity = 'ent1', 
trans_primitives = ['divide'],
max_depth = 2, 
verbose = 1, 
n_jobs = 4)

Built 64516 features
EntitySet scattered to workers in 4.639 seconds


Elapsed: 00:00 | Remaining: ? | Progress:   0%|          | Calculated: 0/11 chunks

Elapsed: 28:09 | Remaining: 4:41:35 | Progress:   9%|▉         | Calculated: 1/11 chunks

Elapsed: 28:17 | Remaining: 2:57:45 | Progress:  18%|█▊        | Calculated: 2/11 chunks

Elapsed: 28:25 | Remaining: 1:36:55 | Progress:  36%|███▋      | Calculated: 4/11 chunks

Elapsed: 56:02 | Remaining: 1:47:52 | Progress:  45%|████▌     | Calculated: 5/11 chunks

Elapsed: 56:12 | Remaining: 1:03:09 | Progress:  55%|█████▍    | Calculated: 6/11 chunks

Elapsed: 56:22 | Remaining: 26:36 | Progress:  73%|███████▎  | Calculated: 8/11 chunks  

Elapsed: 1:17:08 | Remaining: 24:52 | Progress:  82%|████████▏ | Calculated: 9/11 chunks

Elapsed: 1:17:54 | Remaining: 08:56 | Progress:  91%|█████████ | Calculated: 10/11 chunks

Elapsed: 1:17:56 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks

Wall time: 1h 19min 57s


In [67]:
import winsound 

winsound.PlaySound('sound.mp3',winsound.SND_FILENAME)

In [45]:
asdasdsdf SAVE to excel

NameError: name 'asdasdsdf' is not defined

In [51]:
feature_matrix.head()

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,NPkVill14 * Po19,2fmCon7 * AsbShng13,2Types31 * IR110,MiscVal * Twnhs7,Blmngtn14 * BsmtHalfBath,BrkFace13 * GrLivArea,CemntBd13 * Y30,Fireplaces * KitchenAbvGr,Pave24 * Typ16,Grvl24 * NPkVill14
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,0,0,0,0.0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,0,0,0,0,0.0,0,0,1,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,0,0,0,0.0,0,0,1,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,0,0,0,0.0,0,0,1,1,0
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,0,0,0,0.0,0,0,1,1,0


In [52]:
feature_matrix.to_csv('fm1.csv', index=False)

In [53]:
feature_matrix2.to_csv('fm2.csv', index=False)

In [61]:
feature_matrix3.to_csv('fm3.csv', index=False)

In [65]:
feature_matrix4.to_csv('fm4.csv', index=False)

In [56]:
np.savetxt("fn1.csv", feature_names, delimiter=",", fmt='%s')

In [57]:
np.savetxt("fn2.csv", feature_names2, delimiter=",", fmt='%s')

In [62]:
np.savetxt("fn3.csv", feature_names3, delimiter=",", fmt='%s')

In [66]:
np.savetxt("fn4.csv", feature_names4, delimiter=",", fmt='%s')

In [48]:
feature_matrix2.head(10)

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,NPkVill14 + Po19,2fmCon7 + AsbShng13,2Types31 + IR110,MiscVal + Twnhs7,Blmngtn14 + BsmtHalfBath,BrkFace13 + GrLivArea,CemntBd13 + Y30,Fireplaces + KitchenAbvGr,Pave24 + Typ16,Grvl24 + NPkVill14
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,0,0,0,0.0,1710,1,1,2,0
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,0,0,0,0,1.0,1262,1,2,2,0
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,0,1,0,0.0,1786,1,2,2,0
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,0,1,0,0.0,1717,1,2,2,0
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,0,1,0,0.0,2198,1,2,2,0
5,50,85.0,14115,5,5,1993,1995,0.0,732.0,0.0,...,0,0,1,700,0.0,1362,1,1,2,0
6,20,75.0,10084,8,5,2004,2005,186.0,1369.0,0.0,...,0,0,0,0,0.0,1694,1,2,2,0
7,60,70.049958,10382,7,6,1973,1973,240.0,859.0,32.0,...,0,0,1,350,0.0,2090,1,3,2,0
8,50,51.0,6120,7,5,1931,1950,0.0,0.0,0.0,...,0,0,0,0,0.0,1775,1,4,1,0
9,190,50.0,7420,5,6,1939,1950,0.0,851.0,0.0,...,0,1,0,0,0.0,1077,1,4,2,0


In [50]:
feature_names2

[<Feature: MSSubClass>,
 <Feature: LotFrontage>,
 <Feature: LotArea>,
 <Feature: OverallQual>,
 <Feature: OverallCond>,
 <Feature: YearBuilt>,
 <Feature: YearRemodAdd>,
 <Feature: MasVnrArea>,
 <Feature: BsmtFinSF1>,
 <Feature: BsmtFinSF2>,
 <Feature: BsmtUnfSF>,
 <Feature: TotalBsmtSF>,
 <Feature: 1stFlrSF>,
 <Feature: 2ndFlrSF>,
 <Feature: LowQualFinSF>,
 <Feature: GrLivArea>,
 <Feature: BsmtFullBath>,
 <Feature: BsmtHalfBath>,
 <Feature: FullBath>,
 <Feature: HalfBath>,
 <Feature: BedroomAbvGr>,
 <Feature: KitchenAbvGr>,
 <Feature: TotRmsAbvGrd>,
 <Feature: Fireplaces>,
 <Feature: GarageYrBlt>,
 <Feature: GarageCars>,
 <Feature: GarageArea>,
 <Feature: WoodDeckSF>,
 <Feature: OpenPorchSF>,
 <Feature: EnclosedPorch>,
 <Feature: 3SsnPorch>,
 <Feature: ScreenPorch>,
 <Feature: PoolArea>,
 <Feature: MiscVal>,
 <Feature: MoSold>,
 <Feature: YrSold>,
 <Feature: Fa1>,
 <Feature: Gd1>,
 <Feature: Po1>,
 <Feature: TA1>,
 <Feature: BrkCmn2>,
 <Feature: BrkFace2>,
 <Feature: None2>,
 <Feature:

In [None]:
learning_rate = np.arange(0.1,0.3,0.1)
n_estimators = np.arange(100,500,50)
max_depth = np.arange(2, 6, 1)
min_samples_split = [2]
#min_samples_split = [3]
min_samples_leaf = [1]
#min_samples_leaf = [3]
max_features = ['auto', 'sqrt', 1/3]

In [None]:
params = []
for lr in learning_rate:
    for n in n_estimators:
        for depth in max_depth:    
            for split in min_samples_split:
                for leaf in min_samples_leaf:
                    for mf in max_features:
                        params.append({'learning_rate' : lr,
                                       'n_estimators' : n, 
                                       'max_depth' : depth,
                                       'min_samples_split' : split,
                                       'min_samples_leaf' : leaf,
                                       'max_features' : mf})
# n_splits -> n_folds; random_state -> seed
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)

In [None]:
def rand_for(params):
    learning_rate = params['learning_rate']
    n_estimators = params['n_estimators']
    max_depth = params['max_depth']
    min_samples_split = params['min_samples_split']
    min_samples_leaf = params['min_samples_leaf']
    max_features = params['max_features']
    
    rf = GradientBoostingRegressor( learning_rate=learning_rate,
                                    n_estimators=n_estimators, 
                                    max_depth=max_depth, 
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    max_features=max_features
    )
    #accuracy = numpy.mean(numpy.sqrt(cross_val_score(rf, x, y, cv = skf, scoring='neg_mean_squared_log_error')))
    accuracy = []
    for train_index, test_index in kf.split(x, y):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        rf.fit(x_train, y_train)
        y_pred = rf.predict(x_test)
        accuracy.append(numpy.sqrt(numpy.mean((numpy.log(1+y_pred) - numpy.log(1+y_test))**2)))
    accuracy = numpy.mean(accuracy)
    output = []
    output.append({'l_rate' : learning_rate,
                   'n_est' : n_estimators, 
                   'max_dep' : max_depth, 
                   'min_sam_split' : min_samples_split,
                   'min_sam_leaf' : min_samples_leaf,
                   'max_feat' : max_features,
                   'accuracy' : round(accuracy, 5)})
    return(output)

In [None]:
for i in params:
    q = rand_for(i)
    print(q)

In [None]:
import ipyparallel as ipp
c = ipp.Client(profile='default')
from ipyparallel import Client
cluster = Client(profile='default')
lb_view = cluster.load_balanced_view()
# Sanity check
print(cluster.profile) # cluster name
print(len(lb_view))    # number of cores

In [None]:
# All libraries used in parallel should be shared to all cores
with cluster[:].sync_imports():
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import cross_val_score
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_log_error
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import KFold
    from sklearn.ensemble import GradientBoostingRegressor

# All common data used in parallel should be shared to all cores
shared_dict = {'x' : x,
               'y' : y,
               'skf' : skf,
               'kf' : kf}
cluster[:].push(shared_dict)

In [None]:
%%time
results = lb_view.map(rand_for, params).get()

In [None]:
for i in range(0,len(results)):
    for j in range(0,len(results[0])):
        print(results[i][j])
    print()

In [None]:
#rf = RandomForestRegressor(n_estimators=80, 
#                                    criterion="mse",
#                                    max_depth=None,
#                                    min_samples_split=3,
#                                    min_samples_leaf=2,
#                                    max_features=1/3,
#                                    n_jobs=-1)

rf = GradientBoostingRegressor( learning_rate=0.1,
                                n_estimators=150, 
                                max_depth=5, 
                                min_samples_split=2,
                                min_samples_leaf=1,
                                max_features=1/3
)

rf_fit = rf.fit(x,y)

In [None]:
predictions = rf_fit.predict(x_test)

In [None]:
predictions = np.exp(predictions)

In [None]:
train.columns

In [None]:
test.columns

In [None]:
predictions

In [None]:
y

In [None]:
from sklearn.metrics import mean_squared_error
print ('RMSE is: \n', mean_squared_error(y, predictions))

In [None]:
submission = pd.DataFrame()
submission['Id'] = test.Id

In [None]:
#feats = test.select_dtypes(
#        include=[np.number]).drop(['Id'], axis=1).interpolate()

In [None]:
#predictions = model.predict(feats)

In [None]:
#final_predictions = np.exp(predictions)

In [None]:
submission['SalePrice'] = predictions
submission.head()

In [None]:
#submission.to_csv('submission2.csv', index=False)