In [1]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Shell command (IPython `!` escape): create a directory in the current shell.
# NOTE(review): the path `-\.kaggle` looks like a mangled `~\.kaggle`
# (presumably the Kaggle CLI config folder in the user's home) — confirm.
!mkdir -\.kaggle

A subdirectory or file -\.kaggle already exists.


In [3]:
# Download the competition archive with the Kaggle CLI; the CLI skips the
# download when a more recent local copy already exists (use --force to override).
!kaggle competitions download -c house-prices-advanced-regression-techniques

house-prices-advanced-regression-techniques.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
# Load the train and test tables from CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

**Now, the first step is to explore our data and find the features that are useful for our model.**

In [5]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [7]:
def duplicate_finder(df):
    '''
    Remove fully duplicated rows from ``df`` in place and return it.

    All columns are compared and the first occurrence of each duplicated
    row is the one that is kept. The index is not reset, so surviving rows
    keep their original labels.
    '''
    # The defaults of drop_duplicates are exactly "all columns, keep first".
    df.drop_duplicates(inplace=True)
    return df

In [8]:
def null_finder1(df, threshold=0.5):
    '''
    Drop, in place, every column whose share of null values is too high.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    threshold : float, default 0.5
        A column is dropped when its fraction of null rows (nulls divided
        by the total number of rows) is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, without the sparse columns.
    '''
    # Mean of the boolean null mask == fraction of null rows per column.
    # (Counting values on the null-only rows, as was done before, always
    # yields 0 because ``count`` skips nulls, so nothing was ever dropped.)
    null_fraction = df.isnull().mean()
    too_sparse = (null_fraction > threshold).to_numpy()
    df.drop(columns=df.columns[too_sparse], inplace=True)
    return df

def null_finder2(df, threshold=0.25):
    '''
    Fill sparsely missing numeric columns with their mean, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    threshold : float, default 0.25
        A numeric column is imputed only when its fraction of null rows
        (nulls divided by the total number of rows) is strictly below this
        value. Columns at or above it are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Non-numeric columns are never modified.
    '''
    for column in df.columns:
        if not is_numeric_dtype(df[column]):
            continue
        # Mean of the boolean null mask == fraction of null rows.
        null_fraction = df[column].isnull().mean()
        # Lower bound skips complete columns: there is nothing to fill.
        if 0 < null_fraction < threshold:
            # Assign back instead of a chained ``fillna(inplace=True)``,
            # which operates on a temporary under pandas Copy-on-Write.
            df[column] = df[column].fillna(df[column].mean())
    return df

def null_finder3(df):
    '''
    Drop, in place, every remaining column that contains at least one null
    value, then return the same DataFrame.
    '''
    df.dropna(axis="columns", how="any", inplace=True)
    return df

In [9]:
def categorical_handler(df):
    '''
    Return a new frame with the object-dtype (categorical) columns of ``df``
    placed in front of all other columns. Values are left unchanged.

    NOTE(review): this function was documented as converting categorical
    features to numerical ones, and it used to build ``pd.get_dummies`` of
    the object columns, but that result was never used, so no encoding has
    ever been applied. The unused computation is removed here and the
    returned frame is identical to before. Switching to real one-hot
    encoding would change the columns that every later step receives and
    needs to be done for train and test together.
    '''
    categorical = df.select_dtypes(include=["object"])
    numerical = df.select_dtypes(exclude=["object"])
    return pd.concat([categorical, numerical], axis=1)

In [10]:
def categorical_remover(df):
    '''
    Return only the columns of ``df`` whose dtype is not ``object``,
    in their original order.
    '''
    kept_columns = df.select_dtypes(exclude=["object"]).columns
    return df[kept_columns]

In [11]:
# Cleaning pipeline for a raw data frame
def cleaner(df):
    '''
    Run every cleaning step on ``df`` in a fixed order and return the result.

    Order: duplicate removal, the three null-handling passes, the
    categorical handler and finally the categorical remover. Each step
    receives the frame returned by the previous one.
    '''
    pipeline = (
        duplicate_finder,
        null_finder1,
        null_finder2,
        null_finder3,
        categorical_handler,
        categorical_remover,
    )
    for step in pipeline:
        df = step(df)
    return df

In [12]:
df_train = cleaner(df_train)

In [13]:
df_test = cleaner(df_test)

In [14]:
df_train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [15]:
df_test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,730.0,140,0,0,0,120,0,0,6,2010
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,312.0,393,36,0,0,0,0,12500,6,2010
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,482.0,212,34,0,0,0,0,0,3,2010
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,470.0,360,36,0,0,0,0,0,6,2010
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,506.0,0,82,0,0,144,0,0,1,2010


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X = df_train.drop('SalePrice', axis = 1)
y = df_train['SalePrice']

In [18]:
# Fixed random_state so the split, and therefore the reported metrics,
# are reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
# Fit the scaler on the training split only and scale that split.
scalar = StandardScaler()
scaled_feature = scalar.fit_transform(X_train)
# Keep the original row labels so X_train stays aligned with y_train.
X_train = pd.DataFrame(scaled_feature, columns=X_train.columns, index=X_train.index)

In [21]:
# Scale the hold-out split with the already-fitted scaler. Rebuild a DataFrame
# (same columns and row labels) so the model is given named features at
# predict time instead of a bare array.
X_test = pd.DataFrame(scalar.transform(X_test), columns=X_test.columns, index=X_test.index)

In [22]:
from sklearn.linear_model import LinearRegression

In [23]:
lr = LinearRegression()

In [24]:
lr.fit(X_train, y_train)

LinearRegression()

In [25]:
pred = lr.predict(X_test)

In [26]:
from sklearn import metrics

In [27]:
metrics.mean_absolute_error(y_test, pred)

22070.19408202512

In [28]:
metrics.r2_score(y_test, pred)

0.8389558611744601

In [29]:
# Scale the cleaned test set with the already-fitted scaler. Rebuild a
# DataFrame (same columns and row labels) so the model is given named
# features at predict time instead of a bare array.
test = pd.DataFrame(scalar.transform(df_test), columns=df_test.columns, index=df_test.index)

In [30]:
predictions = lr.predict(test)

In [34]:
len(predictions)

1459

In [71]:
series1 = df_test['Id']
# Give the predictions the same row labels as df_test so that a later
# index-aligned combination pairs every Id with its own prediction, even if
# df_test's index is not a plain 0..n-1 range.
series2 = pd.Series(predictions, index=df_test.index, name='SalePrice')

In [73]:
# Column labels come from the Series' own names ('Id' and 'SalePrice').
# `names=` was dropped: it only labels hierarchical levels created by `keys`.
df_final = pd.concat([series1, series2], axis=1)

In [74]:
df_final.set_index('Id', inplace= True)

In [75]:
df_final.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,114679.897839
1462,148630.096131
1463,167681.067256
1464,198885.682765
1465,197841.704951


In [76]:
df_final.to_csv('predictions.csv')