In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model

In [2]:
#reading data:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Investigating the data:

In [3]:
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
#Investigating data, "train" csv file:
train.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [5]:
#Investigating data, "test" csv file:
test.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal


In [6]:
#checking the shapes of our data:
print('Train shape = ' + str(train.shape) + '\nTest shape = ' + str(test.shape))#

Train shape = (1460, 81)
Test shape = (1459, 80)


In [7]:
#The SalePrice not included in test:

for ttest in train.columns:
    if ttest not in test.columns:
        print(ttest)
        
len(test.columns), len(train.columns)

SalePrice


(80, 81)

In [8]:
#Check for nulls:   Alley
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

### Data sets contents:

**train:** 
- Has (1460 rows) & (81 columns).
- Of which 19 contain null values.
- extra column "SalePrice(y)"

**test:** 
- Has (1459 rows) & (80 columns).
- Of which 19 contain null values.
- Of which 33 contain null values.



# Cleaning data:

In [9]:
#Check nulls:

null_col = train.columns[train.isna().any()].tolist()
for col in null_col:
    print("Column name : {} \n \
           Number of null : {} \n \
           Unique values: {} ".format(col, train[col].isna().sum(), train[col].value_counts().sum()))

Column name : LotFrontage 
            Number of null : 259 
            Unique values: 1201 
Column name : Alley 
            Number of null : 1369 
            Unique values: 91 
Column name : MasVnrType 
            Number of null : 8 
            Unique values: 1452 
Column name : MasVnrArea 
            Number of null : 8 
            Unique values: 1452 
Column name : BsmtQual 
            Number of null : 37 
            Unique values: 1423 
Column name : BsmtCond 
            Number of null : 37 
            Unique values: 1423 
Column name : BsmtExposure 
            Number of null : 38 
            Unique values: 1422 
Column name : BsmtFinType1 
            Number of null : 37 
            Unique values: 1423 
Column name : BsmtFinType2 
            Number of null : 38 
            Unique values: 1422 
Column name : Electrical 
            Number of null : 1 
            Unique values: 1459 
Column name : FireplaceQu 
            Number of null : 690 
            Unique valu

In [10]:
#Replacing missing objects with NA, and floats with 0:

null_col = train.columns[train.isna().any()].tolist()

for col in null_col:
    if train[col].dtype == "float64":
        train[col].fillna(value=0, inplace=True)
    if train[col].dtype == "object":
        train[col].fillna(value="NA", inplace=True)
    print("Column name : {} \n \
           Number of null : {} \n \
           Unique values: {} \n \
           Dtypes: {}".format(col, train[col].isna().sum(), train[col].value_counts().sum(), train[col].dtype))

Column name : LotFrontage 
            Number of null : 0 
            Unique values: 1460 
            Dtypes: float64
Column name : Alley 
            Number of null : 0 
            Unique values: 1460 
            Dtypes: object
Column name : MasVnrType 
            Number of null : 0 
            Unique values: 1460 
            Dtypes: object
Column name : MasVnrArea 
            Number of null : 0 
            Unique values: 1460 
            Dtypes: float64
Column name : BsmtQual 
            Number of null : 0 
            Unique values: 1460 
            Dtypes: object
Column name : BsmtCond 
            Number of null : 0 
            Unique values: 1460 
            Dtypes: object
Column name : BsmtExposure 
            Number of null : 0 
            Unique values: 1460 
            Dtypes: object
Column name : BsmtFinType1 
            Number of null : 0 
            Unique values: 1460 
            Dtypes: object
Column name : BsmtFinType2 
            Number of null : 0

In [11]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [12]:
#Check nulls again:

null_col = train.columns[train.isna().any()].tolist()
for col in null_col:
    print("Column name : {} \n \
           Number of null : {} \n \
           Unique values: {} ".format(col, train[col].isna().sum(), train[col].value_counts().sum()))

In [83]:
#checking the highest correlation with target(y):
train_dummies = pd.get_dummies(train)
high_corr = [train_dummies.corr()["SalePrice"] > 0.6] or [train_dummies.corr()["SalePrice"] < -0.6]
# list(elem for elem in high_corr if elem == True)
df_ = pd.DataFrame(high_corr)
Trues = []
for col in df_.columns:
    if df_[col].values == True:
        print(df_[col])

SalePrice    True
Name: OverallQual, dtype: bool
SalePrice    True
Name: TotalBsmtSF, dtype: bool
SalePrice    True
Name: 1stFlrSF, dtype: bool
SalePrice    True
Name: GrLivArea, dtype: bool
SalePrice    True
Name: GarageCars, dtype: bool
SalePrice    True
Name: GarageArea, dtype: bool
SalePrice    True
Name: SalePrice, dtype: bool


In [14]:
Take SalePrice from train and assign it to target.
target = train.SalePrice
train.drop("SalePrice",axis=1, inplace=True)
target.head()