In [1]:
# import the libraries needed for splitting the data and fitting the model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Load the housing data set from disk into a pandas DataFrame.
data = pd.read_csv(filepath_or_buffer='Housing.csv')

In [3]:
# Work on an independent (deep) copy so later edits never touch `data`.
housing = data.copy(deep=True)

In [4]:
# Summarise the housing df: row count, column names, non-null counts and dtypes.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [5]:
# Sneak a peek at the first rows of the housing df.
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [6]:
# Build two sub-dataframes from the housing df:
#   target   - the price column, i.e. the value we want to predict
#   features - every other column, i.e. the inputs used to predict the target
target = housing['price']
features = housing.drop('price', axis='columns')

In [7]:
# Show the first rows of both sub-dataframes, target first and then features.
print(target.head(), features.head(), sep="\n")

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64
   area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  7420         4          2        3      yes        no       no   
1  8960         4          4        4      yes        no       no   
2  9960         3          2        2      yes        no      yes   
3  7500         4          2        2      yes        no      yes   
4  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  


In [8]:
# Small illustration of the unpacking that train_test_split relies on:
# the first value goes to the first name, the second value to the second name.
x, y = 5, 10

# One value per line, x first and then y.
print(x, y, sep="\n")

5
10


In [9]:
# Preview the raw return value of the split: a list holding the two partitions.
train_test_split(
    target,
    test_size=0.3,
    random_state=11,
)

[331    3920000
 508    2590000
 247    4550000
 291    4200000
 71     6755000
         ...   
 332    3920000
 269    4375000
 337    3920000
 91     6419000
 80     6629000
 Name: price, Length: 381, dtype: int64,
 136    5740000
 184    5110000
 375    3640000
 70     6790000
 104    6195000
         ...   
 221    4767000
 200    4900000
 309    4130000
 121    5950000
 433    3290000
 Name: price, Length: 164, dtype: int64]

In [10]:
# Split the target into 70% / 30% partitions.
# The partition sized by test_size (30%) is returned second, so it is
# unpacked into target_init_test; the remaining 70% goes to target_train.
target_train, target_init_test = train_test_split(
    target,
    test_size=0.3,
    random_state=11,
)

In [11]:
# Was the 70% split successful?
target_train.info()
# Yes: the output reports 381 of the 545 rows (about 70%).

<class 'pandas.core.series.Series'>
Index: 381 entries, 331 to 80
Series name: price
Non-Null Count  Dtype
--------------  -----
381 non-null    int64
dtypes: int64(1)
memory usage: 6.0 KB


In [12]:
# The held-out 30% partition,
# which is later split into two 15% parts (validation and test).
target_init_test.info()

<class 'pandas.core.series.Series'>
Index: 164 entries, 136 to 433
Series name: price
Non-Null Count  Dtype
--------------  -----
164 non-null    int64
dtypes: int64(1)
memory usage: 2.6 KB


# The code above split the target data in a 70:30 ratio


#

# 

# The following code splits the features in the same 70:30 ratio
# (training : initial test data)

In [13]:
# Preview what the features look like after a 70/30 split (result not stored).
train_test_split(
    features,
    train_size=0.7,
    random_state=11,
)

[     area  bedrooms  bathrooms  stories mainroad guestroom basement  \
 331  7260         3          2        1      yes       yes      yes   
 508  4400         2          1        1      yes        no       no   
 247  8400         4          1        4      yes        no       no   
 291  2953         3          1        2      yes        no      yes   
 71   6000         4          2        4      yes        no       no   
 ..    ...       ...        ...      ...      ...       ...      ...   
 332  5500         4          1        2      yes       yes      yes   
 269  3900         3          1        2      yes        no       no   
 337  2145         4          2        1      yes        no      yes   
 91   6750         2          1        1      yes       yes      yes   
 80   6000         3          1        2      yes        no       no   
 
     hotwaterheating airconditioning  parking prefarea furnishingstatus  
 331              no              no        3       no      

In [14]:
# Split the features 70/30 into training and initial-test partitions.
# test_size and random_state deliberately mirror the target split
# (test_size = 0.3, random_state = 11): features and target are split in
# separate calls, so their rows only stay paired when both calls use the
# same size parameter and the same seed.
features_train, features_init_test = train_test_split(features, test_size = 0.3, random_state = 11)

# Cheap guard against the two splits drifting apart in size.
assert len(features_train) == len(target_train), "features/target training splits differ in size"

In [15]:
# Details of the expected 70% training features.
features_train.info()
# The output shows 381 rows, the same count as the training target.

<class 'pandas.core.frame.DataFrame'>
Index: 381 entries, 331 to 80
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   area              381 non-null    int64 
 1   bedrooms          381 non-null    int64 
 2   bathrooms         381 non-null    int64 
 3   stories           381 non-null    int64 
 4   mainroad          381 non-null    object
 5   guestroom         381 non-null    object
 6   basement          381 non-null    object
 7   hotwaterheating   381 non-null    object
 8   airconditioning   381 non-null    object
 9   parking           381 non-null    int64 
 10  prefarea          381 non-null    object
 11  furnishingstatus  381 non-null    object
dtypes: int64(5), object(7)
memory usage: 38.7+ KB


# Splitting the 30% partitions


# target_init_test
# -----target_valid & target_test



# features_init_test
# -----features_valid & features_test

In [16]:
# Preview: halve the 30% target hold-out into two equal parts (result not stored).
train_test_split(
    target_init_test,
    train_size=0.5,
    test_size=0.5,
    random_state=11,
)

[366    3675000
 86     6510000
 444    3220000
 128    5873000
 174    5250000
         ...   
 188    5075000
 421    3360000
 540    1820000
 162    5460000
 31     8400000
 Name: price, Length: 82, dtype: int64,
 281    4270000
 307    4165000
 483    2940000
 472    3010000
 44     7560000
         ...   
 353    3780000
 104    6195000
 203    4900000
 536    1960000
 39     7910000
 Name: price, Length: 82, dtype: int64]

In [17]:
# Halve the 30% target hold-out: the first half becomes the validation
# target, the second half the test target.
target_valid, target_test = train_test_split(
    target_init_test,
    train_size=0.5,
    test_size=0.5,
    random_state=11,
)

# Prices - Target
# 70% - Training
# 15% - Validation
# 15% - Testing

# On to the features

In [18]:
# Preview: halve the 30% feature hold-out into two equal parts (result not stored).
train_test_split(
    features_init_test,
    train_size=0.5,
    test_size=0.5,
    random_state=11,
)

[     area  bedrooms  bathrooms  stories mainroad guestroom basement  \
 366  3630         2          1        1      yes        no      yes   
 86   6670         3          1        3      yes        no      yes   
 444  3120         3          1        2       no        no       no   
 128  5500         3          1        3      yes       yes       no   
 174  3800         3          1        2      yes       yes      yes   
 ..    ...       ...        ...      ...      ...       ...      ...   
 188  5720         2          1        2      yes        no       no   
 421  4750         2          1        1      yes        no       no   
 540  3000         2          1        1      yes        no      yes   
 162  6600         4          2        2      yes       yes      yes   
 31   7000         3          1        4      yes        no       no   
 
     hotwaterheating airconditioning  parking prefarea furnishingstatus  
 366              no              no        0       no      

In [19]:
# Halve the 30% feature hold-out with the same settings used for the target:
# the first half becomes the validation features, the second the test features.
features_valid, features_test = train_test_split(
    features_init_test,
    train_size=0.5,
    test_size=0.5,
    random_state=11,
)

# housing - 100% - 545 rows
# training - 70% - 381 rows
# testing - 15% - 82 rows
# validation - 15% - 82 rows

In [20]:
# Training features: row count, column names and dtypes.
features_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 381 entries, 331 to 80
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   area              381 non-null    int64 
 1   bedrooms          381 non-null    int64 
 2   bathrooms         381 non-null    int64 
 3   stories           381 non-null    int64 
 4   mainroad          381 non-null    object
 5   guestroom         381 non-null    object
 6   basement          381 non-null    object
 7   hotwaterheating   381 non-null    object
 8   airconditioning   381 non-null    object
 9   parking           381 non-null    int64 
 10  prefarea          381 non-null    object
 11  furnishingstatus  381 non-null    object
dtypes: int64(5), object(7)
memory usage: 38.7+ KB


In [21]:
# Test features: row count, column names and dtypes.
features_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82 entries, 281 to 39
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   area              82 non-null     int64 
 1   bedrooms          82 non-null     int64 
 2   bathrooms         82 non-null     int64 
 3   stories           82 non-null     int64 
 4   mainroad          82 non-null     object
 5   guestroom         82 non-null     object
 6   basement          82 non-null     object
 7   hotwaterheating   82 non-null     object
 8   airconditioning   82 non-null     object
 9   parking           82 non-null     int64 
 10  prefarea          82 non-null     object
 11  furnishingstatus  82 non-null     object
dtypes: int64(5), object(7)
memory usage: 8.3+ KB


In [22]:
# Validation features: row count, column names and dtypes.
features_valid.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82 entries, 366 to 31
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   area              82 non-null     int64 
 1   bedrooms          82 non-null     int64 
 2   bathrooms         82 non-null     int64 
 3   stories           82 non-null     int64 
 4   mainroad          82 non-null     object
 5   guestroom         82 non-null     object
 6   basement          82 non-null     object
 7   hotwaterheating   82 non-null     object
 8   airconditioning   82 non-null     object
 9   parking           82 non-null     int64 
 10  prefarea          82 non-null     object
 11  furnishingstatus  82 non-null     object
dtypes: int64(5), object(7)
memory usage: 8.3+ KB


In [23]:
# Log-transform the prices with log1p and turn each split into a NumPy array.
#
# Each array is rebuilt from the untouched `target` Series, selecting rows by
# the index labels of the matching feature split, instead of transforming
# target_train / target_valid / target_test in place. Re-running this cell
# therefore gives the same result rather than applying log1p a second time,
# and every target value is paired with its feature row by label.
# Requires the feature splits to still be DataFrames (they carry the index).
target_train = np.log1p(target.loc[features_train.index]).values
target_valid = np.log1p(target.loc[features_valid.index]).values
target_test = np.log1p(target.loc[features_test.index]).values

In [24]:
# Disabled code kept as a bare string literal: it would convert the feature
# splits to NumPy arrays. Nothing here is executed, so the features remain
# DataFrames; the cell only echoes the string itself.
'''
features_test = features_test.values
features_train = features_train.values
features_valid = features_valid.values
'''

'\nfeatures_test = features_test.values\nfeatures_train = features_train.values\nfeatures_valid = features_valid.values\n'

In [25]:
# List the feature column names.
features_train.columns

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom',
       'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea',
       'furnishingstatus'],
      dtype='object')

In [26]:
# Check the dtypes again to see which feature columns are numeric (int64).
features_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 381 entries, 331 to 80
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   area              381 non-null    int64 
 1   bedrooms          381 non-null    int64 
 2   bathrooms         381 non-null    int64 
 3   stories           381 non-null    int64 
 4   mainroad          381 non-null    object
 5   guestroom         381 non-null    object
 6   basement          381 non-null    object
 7   hotwaterheating   381 non-null    object
 8   airconditioning   381 non-null    object
 9   parking           381 non-null    int64 
 10  prefarea          381 non-null    object
 11  furnishingstatus  381 non-null    object
dtypes: int64(5), object(7)
memory usage: 38.7+ KB


In [27]:
# Columns used by the baseline model (the int64 feature columns).
baseline_features = [
    'area',
    'bedrooms',
    'bathrooms',
    'stories',
    'parking',
]

In [28]:
# Restrict the training and validation features to the baseline columns.
features_train_bl = features_train.loc[:, baseline_features]
features_valid_bl = features_valid.loc[:, baseline_features]

# Plain NumPy matrix of the baseline training features, used to fit the model.
features_train_bl_yi = features_train_bl.to_numpy()

In [29]:
# Baseline linear regression on the baseline training features.
# target_train holds log1p-transformed prices, so the model is fitted on
# that log scale.
model = LinearRegression()
model.fit(X=features_train_bl_yi, y=target_train)