In [9]:
import pandas as pd


In [10]:
data = pd.read_csv("./data/Housing.csv")
print(f'Rows = {data.shape[0]}\t Cols= {data.shape[1]}')
data.head()

Rows = 545	 Cols= 13


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


# Data cleaning

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


**Inference:**  
1. No NULL values.
2. We have categorical variables too.

# Data Wrangling

Lets look at what kind of values does the features have.

In [12]:
cols = data.columns
uniques = [data[x].unique() for x in cols]

features = pd.DataFrame([[len(x) for x in uniques],uniques],columns=cols, index=['Count','Values']).transpose()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(features)

                 Count                                             Values
price              219  [13300000, 12250000, 12215000, 11410000, 10850...
area               284  [7420, 8960, 9960, 7500, 8580, 16200, 8100, 57...
bedrooms             6                                 [4, 3, 5, 2, 6, 1]
bathrooms            4                                       [2, 4, 1, 3]
stories              4                                       [3, 4, 2, 1]
mainroad             2                                          [yes, no]
guestroom            2                                          [no, yes]
basement             2                                          [no, yes]
hotwaterheating      2                                          [no, yes]
airconditioning      2                                          [yes, no]
parking              4                                       [2, 3, 0, 1]
prefarea             2                                          [yes, no]
furnishingstatus     3           [furn

**Inference:**  
1. We have binary and ordinal categorical variables, will need to encode them.

## Encoding Categorical variables

Label encoding both ordinal and binary categorical variables.

In [13]:
encodes = {
    "mainroad": {
        "yes": 1,
        "no": -1
    },
    "guestroom": {
        "yes": 1,
        "no": -1
    },
    "basement": {
        "yes": 1,
        "no": -1
    },
    "hotwaterheating": {
        "yes": 1,
        "no": -1
    },
    "airconditioning": {
        "yes": 1,
        "no": -1
    },
    "prefarea": {
        "yes": 1,
        "no": -1
    },
    "furnishingstatus": {
        "furnished": 2,
        "semi-furnished": 1,
        "unfurnished": 0
    },
}

In [14]:
for feature, encode in encodes.items():
    data[feature].replace(encode.keys(), encode.values(), inplace=True)
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,-1,-1,-1,1,2,1,2
1,12250000,8960,4,4,4,1,-1,-1,-1,1,3,-1,2
2,12250000,9960,3,2,2,1,-1,1,-1,-1,2,1,1
3,12215000,7500,4,2,2,1,-1,1,-1,1,3,1,2
4,11410000,7420,4,1,2,1,1,1,-1,1,2,-1,2


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   price             545 non-null    int64
 1   area              545 non-null    int64
 2   bedrooms          545 non-null    int64
 3   bathrooms         545 non-null    int64
 4   stories           545 non-null    int64
 5   mainroad          545 non-null    int64
 6   guestroom         545 non-null    int64
 7   basement          545 non-null    int64
 8   hotwaterheating   545 non-null    int64
 9   airconditioning   545 non-null    int64
 10  parking           545 non-null    int64
 11  prefarea          545 non-null    int64
 12  furnishingstatus  545 non-null    int64
dtypes: int64(13)
memory usage: 55.5 KB


In [16]:
cols = data.columns
uniques = [data[x].unique() for x in cols]

features = pd.DataFrame([[len(x) for x in uniques],uniques],columns=cols, index=['Count','Values']).transpose()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(features)

                 Count                                             Values
price              219  [13300000, 12250000, 12215000, 11410000, 10850...
area               284  [7420, 8960, 9960, 7500, 8580, 16200, 8100, 57...
bedrooms             6                                 [4, 3, 5, 2, 6, 1]
bathrooms            4                                       [2, 4, 1, 3]
stories              4                                       [3, 4, 2, 1]
mainroad             2                                            [1, -1]
guestroom            2                                            [-1, 1]
basement             2                                            [-1, 1]
hotwaterheating      2                                            [-1, 1]
airconditioning      2                                            [1, -1]
parking              4                                       [2, 3, 0, 1]
prefarea             2                                            [1, -1]
furnishingstatus     3                

**Inference:**  
1. Everything is in proper numeric format and ready to be used as input for algorithms.
2. The range of values for "area" feature is too large compared to other numerical features. Scaling this feature may improve our perfomance.

## Scaling

Since scaling techniques are largely influenced by outliers let's check if "area" contains any outliers.

# Train-test split

In [17]:
from sklearn.model_selection import train_test_split

FEATURES  = ['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 'basement',
             'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']
TARGET = ['price']

X = data.filter(FEATURES, axis=1)
Y = data.filter(TARGET, axis=1)
print(f'Featuers = {X.shape} \t Target = {Y.shape}')

Featuers = (545, 12) 	 Target = (545, 1)


In [18]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=42)
print(f'Train X: {x_train.shape} \t Train Y: {y_train.shape}')
print(f'Test X: {x_test.shape} \t Test Y: {y_test.shape}')

Train X: (381, 12) 	 Train Y: (381, 1)
Test X: (164, 12) 	 Test Y: (164, 1)


# Linear Regression

### Model

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Train

In [20]:
model = LinearRegression()
model.fit(x_train, y_train)

### Predict

In [21]:
y_preds = model.predict(x_test)

### Evaluate

In [22]:
print(f"MSE = {mean_squared_error(y_test, y_preds)}")
print(f"precision score = {model.score(x_test, y_preds)}")

MSE = 1535047758428.0496
precision score = 1.0


In [23]:
import matplotlib.pyplot as plt