In [91]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [92]:
train_df = pd.read_csv('./data/regression_train.csv')
test_df = pd.read_csv('./data/regression_test.csv')
concat_df = pd.concat([train_df, test_df])

In [93]:
concat_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8125,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,3,2009,WD,Normal,174000
1,80,RL,75.0,9750,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,10,2006,WD,Normal,135500
2,160,FV,,5105,Pave,,IR2,Lvl,AllPub,FR2,...,0,,,,0,3,2007,WD,Normal,148800
3,20,RL,50.0,9405,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,6,2009,WD,Normal,118000
4,50,RL,78.0,10496,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdWo,,0,1,2007,WD,Normal,143000


In [94]:
concat_df.shape

(1459, 80)

In [95]:
series = concat_df.isnull().sum()
series_filtered = series[series > 0]

### Handle null values

#### 1. threshold for columns (if null values is over 50 % of column)

In [96]:
total_rows = concat_df.shape[0]
threshold = 0.5
series_filtered_mask = (series_filtered / total_rows) > threshold

columns_to_remove = series_filtered[series_filtered_mask].keys()

concat_df.drop(columns_to_remove, inplace=True, axis=1)


print(f'removed {columns_to_remove}')

removed Index(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')


#### 2. Threshold for rows (if row has over 10 missing values)

In [97]:
null_mask =  ~(concat_df.isnull().sum(axis=1) > 10)
filtered_df = concat_df[null_mask]

print(f'removed {concat_df.shape[0] - filtered_df.shape[0]} rows')

removed 6 rows


In [98]:
filtered_df['SaleType']


0      WD
1      WD
2      WD
3      WD
4      WD
       ..
454    WD
455    WD
456    WD
457    WD
458    WD
Name: SaleType, Length: 1453, dtype: object

#### Change nans to most frequent

In [99]:
filtered_df.dtypes.unique()


array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [100]:
mask = filtered_df.dtypes == 'O'
columns_to_string = filtered_df.select_dtypes('O')

for column in columns_to_string:

    most_frequent = filtered_df[column].value_counts().keys()[0]

    filtered_df[column] = filtered_df[column].fillna(most_frequent)
    
    
    



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[column] = filtered_df[column].fillna(most_frequent)


ValueError: Expected 2D array, got 1D array instead:
array=['RL' 'RL' 'FV' ... 'RL' 'RL' 'RL'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [101]:
filtered_df


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8125,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,3,2009,WD,Normal,174000
1,80,RL,75.0,9750,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,10,2006,WD,Normal,135500
2,160,FV,,5105,Pave,IR2,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,3,2007,WD,Normal,148800
3,20,RL,50.0,9405,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,6,2009,WD,Normal,118000
4,50,RL,78.0,10496,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,1,2007,WD,Normal,143000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,20,RL,70.0,7931,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,7,2009,WD,Normal,132500
455,50,RM,50.0,5925,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,5,2009,WD,Abnorml,37900
456,60,RL,,11170,Pave,IR2,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,0,4,2006,WD,Normal,250000
457,20,RL,,16635,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,6,2009,WD,Normal,215000
