In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression ,Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error

In [22]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [23]:
train.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [24]:
test.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,Ready To Move,Brookefield,2 BHK,Roeekbl,1225,2.0,2.0,
1,Plot Area,Ready To Move,Akshaya Nagar,9 Bedroom,,2400,9.0,2.0,
2,Plot Area,18-Apr,Hennur Road,4 Bedroom,Saandtt,1650,5.0,2.0,
3,Super built-up Area,Ready To Move,Kodichikkanahalli,3 BHK,Winerri,1322,3.0,1.0,
4,Super built-up Area,Ready To Move,Konanakunte,2 BHK,AmageSa,1161,2.0,1.0,


In [26]:
train.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [27]:
test.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [28]:
train.shape,test.shape

((13320, 9), (1480, 9))

In [29]:
data = pd.concat([train,test])
data = data.reset_index(drop = True)

In [30]:
data.describe(include = 'all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
area_type,14800,4.0,Super built-up Area,9736.0,,,,,,,
availability,14800,82.0,Ready To Move,11743.0,,,,,,,
location,14799,1358.0,Whitefield,591.0,,,,,,,
size,14782,32.0,2 BHK,5739.0,,,,,,,
society,8672,2832.0,GrrvaGr,92.0,,,,,,,
total_sqft,14800,2221.0,1200,940.0,,,,,,,
bath,14720,,,,2.69871,1.34393,1.0,2.0,2.0,3.0,40.0
balcony,14122,,,,1.58214,0.819147,0.0,1.0,2.0,2.0,3.0
price,13320,,,,112.566,148.972,8.0,50.0,72.0,120.0,3600.0


In [32]:
data.isna().sum()

area_type          0
availability       0
location           1
size              18
society         6128
total_sqft         0
bath              80
balcony          678
price           1480
dtype: int64

In [64]:
#Percent of data
data.isna().sum()/data.shape[0]

area_type               0.000000
availability            0.000000
location                0.000000
size                    0.001216
society                 0.414054
total_sqft              0.000000
bath                    0.000000
balcony                 0.000000
price                   0.100000
BHK                     0.000000
total_sqft_processed    0.000000
dtype: float64

In [65]:
data['size'].value_counts()

2 BHK         5739
3 BHK         4788
4 Bedroom      918
4 BHK          673
3 Bedroom      613
1 BHK          592
2 Bedroom      368
5 Bedroom      338
6 Bedroom      208
1 Bedroom      122
7 Bedroom       95
8 Bedroom       94
5 BHK           61
9 Bedroom       54
6 BHK           35
7 BHK           18
10 Bedroom      15
1 RK            15
9 BHK           13
8 BHK            8
11 Bedroom       2
11 BHK           2
10 BHK           2
43 Bedroom       1
16 BHK           1
16 Bedroom       1
13 BHK           1
18 Bedroom       1
14 BHK           1
27 BHK           1
19 BHK           1
12 Bedroom       1
Name: size, dtype: int64

In [66]:
data['BHK'] = data['size'].apply(lambda x : str(x).split(' ')[0])

In [67]:
data['BHK'].value_counts()

2      6107
3      5401
4      1591
1       729
5       399
6       243
7       113
8       102
9        67
nan      18
10       17
11        4
16        2
19        1
27        1
13        1
43        1
12        1
18        1
14        1
Name: BHK, dtype: int64

In [68]:
data.loc[data["BHK"]=="nan","BHK"] = np.NAN

In [69]:
data["BHK"].isna().sum()

18

In [70]:
data["total_sqft"].value_counts()

1200            940
1100            240
1500            238
2400            224
600             200
               ... 
3589              1
3045              1
100Sq. Meter      1
1525.84           1
3913              1
Name: total_sqft, Length: 2221, dtype: int64

In [71]:
def check_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [72]:
data[~data["total_sqft"].apply(check_float)]['total_sqft'].value_counts().to_csv("sqft.csv")

In [73]:
data[data["total_sqft"].apply(check_float)]['total_sqft']

0        1056
1        2600
2        1440
3        1521
4        1200
         ... 
14795    1246
14796    1660
14797    1216
14798     996
14799    1150
Name: total_sqft, Length: 14529, dtype: object

In [74]:
def check_float_convert(x):
    tok = x.split("-")
    if (len(tok))==2:
        return (float(tok[0]) + float(tok[1]))/2
    else:
        try:
            return float(x)
        except:
            return np.NAN

In [75]:
data["total_sqft_processed"] = data["total_sqft"].apply(check_float_convert)

In [76]:
data.isna().sum()

area_type                  0
availability               0
location                   0
size                      18
society                 6128
total_sqft                 0
bath                       0
balcony                    0
price                   1480
BHK                       18
total_sqft_processed      48
dtype: int64

In [77]:
data['location'].mode()[0]

'Whitefield'

In [78]:
data['location'] = data['location'].fillna(data['location'].mode()[0])

In [79]:
data['BHK'] = data['BHK'].fillna(data['BHK'].mode()[0])

In [80]:
data['total_sqft_processed'].median()

1280.0

In [81]:
data['total_sqft_processed'] = data['total_sqft_processed'].fillna(data['total_sqft_processed'].median())

In [82]:
data['balcony'] = data['balcony'].fillna(0)

In [83]:
data['bath'] = data['bath'].fillna(0)

In [84]:
data.isna().sum()

area_type                  0
availability               0
location                   0
size                      18
society                 6128
total_sqft                 0
bath                       0
balcony                    0
price                   1480
BHK                        0
total_sqft_processed       0
dtype: int64

In [85]:
def checkAvailabilty(x):
    if x=='Ready To Move':
        return 1
    else:
        return 0
        

In [86]:
data['availability'] = data['availability'].apply(checkAvailabilty)

In [87]:
data['area_type'].value_counts()

Super built-up  Area    9736
Built-up  Area          2688
Plot  Area              2279
Carpet  Area              97
Name: area_type, dtype: int64

In [88]:
data_sel = data[['area_type','availability','location','BHK','total_sqft_processed','bath','balcony','price']]

In [89]:
data_sel['BHK'] = data_sel['BHK'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sel['BHK'] = data_sel['BHK'].astype('int')


In [90]:
num_col = data_sel.select_dtypes(include=np.number)

In [91]:
cat_col = data_sel.select_dtypes(exclude=np.number)

In [92]:
cat_col_processed = pd.get_dummies(cat_col)

In [93]:
scale = StandardScaler()
num_col_scaled = pd.DataFrame(scale.fit_transform(num_col.drop(columns="price")),columns=num_col.drop(columns="price").columns)

In [94]:
data_prerocessed = pd.concat([cat_col_processed,num_col_scaled, num_col['price'] ],axis = 1)

In [97]:
train_preprocessed = data_prerocessed[data_prerocessed['price'].notna()]

In [98]:
test_preprocessed =  data_prerocessed[data_prerocessed['price'].isna()]

In [99]:
train_preprocessed

Unnamed: 0,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area,location_ Anekal,location_ Banaswadi,location_ Basavangudi,location_ Bhoganhalli,location_ Devarabeesana Halli,location_ Devarachikkanahalli,...,"location_white field,kadugodi",location_whitefiled,"location_yelahanka, north",location_yettagodi Road,availability,BHK,total_sqft_processed,bath,balcony,price
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,-0.623383,-0.417921,-0.504970,-0.588650,39.07
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.914167,0.868447,1.709415,1.721311,120.00
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.145392,-0.097995,-0.504970,1.721311,62.00
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,0.145392,-0.030511,0.233159,-0.588650,95.00
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,-0.623383,-0.297949,-0.504970,-0.588650,51.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13315,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,1.682943,1.579115,0.971287,-1.743630,231.00
13316,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,0.914167,1.701587,1.709415,-1.743630,400.00
13317,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,-0.623383,-0.347104,-0.504970,-0.588650,60.00
13318,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,0.914167,2.608876,0.971287,-0.588650,488.00


In [100]:
test_preprocessed

Unnamed: 0,area_type_Built-up Area,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area,location_ Anekal,location_ Banaswadi,location_ Basavangudi,location_ Bhoganhalli,location_ Devarabeesana Halli,location_ Devarachikkanahalli,...,"location_white field,kadugodi",location_whitefiled,"location_yelahanka, north",location_yettagodi Road,availability,BHK,total_sqft_processed,bath,balcony,price
13320,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,-0.623383,-0.277120,-0.504970,0.566331,
13321,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0.0,4.758044,0.701819,4.661928,0.566331,
13322,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.914167,0.076964,1.709415,0.566331,
13323,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,0.145392,-0.196306,0.233159,-0.588650,
13324,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,-0.623383,-0.330441,-0.504970,-0.588650,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14795,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,-0.623383,-0.259624,-0.504970,-0.588650,
14796,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,0.145392,0.085295,0.233159,0.566331,
14797,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.0,-0.623383,-0.284619,-0.504970,0.566331,
14798,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,-0.623383,-0.467909,-0.504970,-0.588650,


In [101]:
train_preproceed.to_csv("train_preprocessed.csv", index = False)
test_preproceed.to_csv("test_preproceed.csv", index = False)