In [25]:
import pandas as pd
import numpy as np
import sklearn
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from IPython.display import display
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

from scipy.stats import skew
from scipy.special import boxcox1p

%load_ext autoreload
import matplotlib.pyplot as plt
from os import listdir
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
def get_cat_columns_by_type(df):
    out = []
    for colname, col_values in df.items():
        if is_string_dtype(col_values):
            out.append((colname,'string') )
        elif not is_numeric_dtype(col_values):
            out.append((colname,'categorical') )
    return out
       
def get_missing_values_percentage(df):
    missing_values_counts_list = df.isnull().sum()
    total_values = np.product(df.shape)
    total_missing = missing_values_counts_list.sum()
    # percent of data that is missing
    return (total_missing/total_values) * 100

def get_missing_columns(df1,df2):
    missing1 = []
    missing2 = []
    for colname in df1.columns:
        if colname not in df2.columns:
            missing2.append(colname)
    for colname in df2.columns:
        if colname not in df1.columns:
            missing1.append(colname)        
    return (missing1,missing2)


def handle_missing_values(df_in, na_dict=None, inplace=True):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
 
    if na_dict is None:
        na_dict = {}

    for colname, col_values in df.items():
        if colname in na_dict:
            if na_dict[colname] == 'drop_rows':
                df.dropna(subset=[colname], inplace=True)
                continue
            elif na_dict[colname] == 'drop_col':
                df.drop(colname, axis=1, inplace=True)
                continue

        if is_numeric_dtype(col_values):
            if pd.isnull(col_values).sum():
                df[colname+'_na'] = pd.isnull(col_values)
                filler = na_dict[colname] if colname in na_dict else col_values.median()
                df[colname] = col_values.fillna(filler)
                na_dict[colname] = filler
    return (df,na_dict)





In [43]:
PATH = "iowa/"

In [44]:
!dir {PATH}

Invalid switch - "".


In [45]:
listdir(PATH)

['sample_submission1.csv', 'test.csv', 'train.csv']

In [46]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
df = df_raw.copy()
df_id = df['Id']
test_id = df_test['Id']



In [47]:
df.drop("Id", axis = 1, inplace = True)
df_test.drop("Id", axis = 1, inplace = True)
df = df.drop(df[(df['GrLivArea']>4000) & (df['SalePrice']<300000)].index)
print(df.shape,df_test.shape)

ntrain = df.shape[0]
ntest = df_test.shape[0]


(1458, 80) (1459, 79)


In [48]:
combined = pd.concat((df, df_test)).reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [49]:
y = df['SalePrice']
y = np.log1p(y)

In [50]:
combined.shape

(2917, 80)

In [51]:
combined['TotalArea'] = combined['TotalBsmtSF'] + combined['1stFlrSF'] + combined['2ndFlrSF']
combined["TotalSF"] = combined["GrLivArea"] + combined["TotalBsmtSF"]
combined.shape,df.shape,df_test.shape,y.shape

((2917, 82), (1458, 80), (1459, 79), (1458,))

In [52]:
cor = combined.corr()
cor["SalePrice"].sort_values(ascending=False)

SalePrice        1.000000
TotalArea        0.832877
TotalSF          0.829042
OverallQual      0.795774
GrLivArea        0.734968
TotalBsmtSF      0.651153
GarageCars       0.641047
1stFlrSF         0.631530
GarageArea       0.629217
FullBath         0.562165
TotRmsAbvGrd     0.537769
YearBuilt        0.523608
YearRemodAdd     0.507717
GarageYrBlt      0.487156
MasVnrArea       0.482719
Fireplaces       0.469862
BsmtFinSF1       0.409384
LotFrontage      0.370584
WoodDeckSF       0.324758
OpenPorchSF      0.321142
2ndFlrSF         0.320532
HalfBath         0.284590
LotArea          0.268179
BsmtFullBath     0.228459
BsmtUnfSF        0.214460
BedroomAbvGr     0.168245
ScreenPorch      0.111415
PoolArea         0.099490
MoSold           0.046124
3SsnPorch        0.044568
BsmtFinSF2      -0.011422
BsmtHalfBath    -0.016881
MiscVal         -0.021203
LowQualFinSF    -0.025625
YrSold          -0.028882
OverallCond     -0.077948
MSSubClass      -0.084276
EnclosedPorch   -0.128646
KitchenAbvGr

In [53]:
combined = combined.drop('SalePrice',axis=1)

In [54]:
numeric_feats = combined.dtypes[combined.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = combined[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    combined[feat] = boxcox1p(combined[feat], lam)
    
#combined[skewed_features] = np.log1p(combined[skewed_features])

#skewness.head(10)


Skew in numerical features: 

There are 38 skewed numerical features to Box Cox transform


In [55]:
combined.isna().sum().sort_values(ascending=False)

PoolQC           2908
MiscFeature      2812
Alley            2719
Fence            2346
FireplaceQu      1420
LotFrontage       486
GarageCond        159
GarageFinish      159
GarageQual        159
GarageYrBlt       159
GarageType        157
BsmtCond           82
BsmtExposure       82
BsmtQual           81
BsmtFinType2       80
BsmtFinType1       79
MasVnrType         24
MasVnrArea         23
MSZoning            4
BsmtFullBath        2
BsmtHalfBath        2
Functional          2
Utilities           2
Electrical          1
TotalArea           1
KitchenQual         1
Exterior2nd         1
Exterior1st         1
TotalSF             1
GarageCars          1
                 ... 
OverallCond         0
YrSold              0
YearRemodAdd        0
YearBuilt           0
WoodDeckSF          0
TotRmsAbvGrd        0
Street              0
ScreenPorch         0
SaleCondition       0
RoofStyle           0
RoofMatl            0
PoolArea            0
PavedDrive          0
OverallQual         0
OpenPorchS

In [56]:

combined['MSSubClass'] = combined['MSSubClass'].apply(str)
combined['OverallCond'] = combined['OverallCond'].astype(str)
combined['YrSold'] = combined['YrSold'].astype(str)
combined['MoSold'] = combined['MoSold'].astype(str)
combined['YearBuilt'] = combined['YearBuilt'].astype(str)
combined['YearRemodAdd'] = combined['YearRemodAdd'].astype(str)

cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(combined[c].values)) 
    combined[c] = lbl.transform(list(combined[c].values))

combined = pd.get_dummies(combined)
combined,_ = handle_missing_values(combined,inplace=False)
get_missing_values_percentage(combined)

NameError: name 'is_numeric_dtype' is not defined

In [57]:
df = combined[:ntrain]
df_test = combined[ntrain:]
df.shape,df_test.shape

((1458, 398), (1459, 398))

In [18]:
X_train,X_valid,y_train,y_valid = train_test_split(df,y,test_size=0.15,
                                  stratify=df['OverallQual'],shuffle = True,random_state=20)


In [19]:
scaler = RobustScaler().fit(X_train)
X_train_robust = scaler.transform(X_train)
X_valid_robust = scaler.transform(X_valid)

In [20]:
lasso_model = Lasso(alpha =0.0005,max_iter=100000,random_state=100)
lasso_model.fit(X_train_robust,y_train)
predictions = lasso_model.predict(X_valid_robust)
print(mean_squared_error(np.log(y_valid),np.log(predictions)))
print("R2 score: {:.2f}".format(r2_score(y_valid,predictions)))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
def plot_pca_accuracy(model,train,test,ytrain,ytest,scorer,crange=None,
                      scaler=None):
    components = []
    scores = []
    if scaler is None:
        scaler_ = RobustScaler()
    else:
        scaler_ = scaler
        
    for c in range(crange[0],crange[1],crange[2]):
        pca = PCA(n_components=c,whiten=True, random_state=0).fit(train)
        Xt = pca.transform(train)
        Xv = pca.transform(test)
        scaler_ = scaler_.fit(Xt)
        Xtrain_transformed = scaler_.transform(Xt)
        Xvalid_transformed = scaler_.transform(Xv)
        model.fit(Xtrain_transformed,ytrain)
        preds = model.predict(Xvalid_transformed)
        if scorer is not None:
            score = scorer(ytest,preds)
        else:
            score = model.score(ytest,preds)
            
        scores.append(score)
        components.append(c)
    
    plt.plot(components,scores,label="train accuracy")
    plt.ylabel("Error")
    plt.xlabel("Components")
    
plot_pca_accuracy(Lasso(alpha =0.0005,max_iter=100000,random_state=100),
                  X_train,X_valid,y_train,y_valid,mean_squared_error,
                  (90,200,2))



In [None]:
pca = PCA(n_components=144, whiten=True, random_state=0).fit(X_train)
X_train_pca = pca.transform(X_train)
X_valid_pca = pca.transform(X_valid)

In [None]:
pca.components_.shape


In [None]:
pca.explained_variance_ratio_[2]

In [None]:
cumsum = np.cumsum(pca.explained_variance_ratio_)
dimensions = np.argmax(cumsum >= 0.99) + 1
dimensions

In [21]:
lasso_model_pca = Lasso(alpha =0.0005,max_iter=100000,random_state=100)
lasso_model_pca.fit(X_train_pca,y_train)
predictions = lasso_model_pca.predict(X_valid_pca)
print(mean_squared_error(np.log(y_valid),np.log(predictions)))
print("R2 score: {:.2f}".format(r2_score(y_valid,predictions)))

NameError: name 'X_train_pca' is not defined

In [None]:
df_test_pca = pca.transform(df_test)
predictions = np.expm1(lasso_model_pca.predict(df_test_pca))
df_submit = pd.DataFrame({'Id':test_id, 'SalePrice': pd.Series(predictions)},
              columns=['Id', 'SalePrice'])
df_submit.head()

In [22]:
df_submit.to_csv('mysubmission.csv',index=False,float_format='%.6f')

NameError: name 'df_submit' is not defined