In [1]:
cd drive/My Drive/Colab Notebooks/ineuron/houseRent/

/content/drive/My Drive/Colab Notebooks/ineuron/houseRent


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(rc={'figure.figsize':(15,5)})
from tqdm import tqdm

In [3]:
df = pd.read_csv('housing_train.csv')
df.head(2)

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,image_url,description,lat,long,state
0,7039061606,https://bham.craigslist.org/apa/d/birmingham-h...,birmingham,https://bham.craigslist.org,1195,apartment,1908,3,2.0,1,1,1,0,0,0,laundry on site,street parking,https://images.craigslist.org/00L0L_80pNkyDeG0...,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
1,7041970863,https://bham.craigslist.org/apa/d/birmingham-w...,birmingham,https://bham.craigslist.org,1120,apartment,1319,3,2.0,1,1,1,0,0,0,laundry on site,off-street parking,https://images.craigslist.org/00707_uRrY9CsNMC...,Find Your Way to Haven Apartment Homes Come ho...,33.3755,-86.8045,al


In [4]:
print("Number of rows in data::",df.shape[0])
print("Number of features::",df.shape[1])
print("Feature names ::",df.columns.values)

Number of rows in data:: 265190
Number of features:: 22
Feature names :: ['id' 'url' 'region' 'region_url' 'price' 'type' 'sqfeet' 'beds' 'baths'
 'cats_allowed' 'dogs_allowed' 'smoking_allowed' 'wheelchair_access'
 'electric_vehicle_charge' 'comes_furnished' 'laundry_options'
 'parking_options' 'image_url' 'description' 'lat' 'long' 'state']


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265190 entries, 0 to 265189
Data columns (total 22 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       265190 non-null  int64  
 1   url                      265190 non-null  object 
 2   region                   265190 non-null  object 
 3   region_url               265190 non-null  object 
 4   price                    265190 non-null  int64  
 5   type                     265190 non-null  object 
 6   sqfeet                   265190 non-null  int64  
 7   beds                     265190 non-null  int64  
 8   baths                    265190 non-null  float64
 9   cats_allowed             265190 non-null  int64  
 10  dogs_allowed             265190 non-null  int64  
 11  smoking_allowed          265190 non-null  int64  
 12  wheelchair_access        265190 non-null  int64  
 13  electric_vehicle_charge  265190 non-null  int64  
 14  come

# From above info we can observe that, some feature values are missing like in description ,lat,long etc.

Since price is discrete variable it is regression problem

In [None]:
# since url feature are mostly useless we will drop the url containing features
df.drop(['url','region_url','image_url'],axis=1,inplace=True)
df.head(1)

Unnamed: 0,id,region,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,description,lat,long,state
0,7039061606,birmingham,1195,apartment,1908,3,2.0,1,1,1,0,0,0,laundry on site,street parking,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al


In [None]:
# checking the duplicate data
df[df.duplicated(['id'])]

Unnamed: 0,id,region,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,description,lat,long,state


In [None]:
# here it doesnot contain any duplicated data so we will drop the id column as well
df.drop('id',axis=1,inplace=True)

In [None]:
# printing columns containg null values along with number of null values it contains
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

laundry_options    54311
parking_options    95135
description            2
lat                 1419
long                1419
state                  1
dtype: int64

Here we can see, laundry options and parking options has many missing value and state and description has small number of missing values



#Filling missing values


Filling lat and long feature using mean lat long on the region basis and assigning this value to the missing value

In [None]:
region_lat_long_mean = df.groupby('region')['lat','long'].mean()
region_lat_long_mean.head(2)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,lat,long
region,Unnamed: 1_level_1,Unnamed: 2_level_1
SF bay area,37.653862,-122.139984
akron / canton,41.095203,-81.514517


In [None]:
# Filling the mean lat and long values
lat_long_miss_ind = df[df['lat'].isnull()].index
for ind in lat_long_miss_ind:
  reg = df.loc[ind]['region']
  lat = region_lat_long_mean.loc[reg]['lat']
  long = region_lat_long_mean.loc[reg]['long']
  df.loc[ind,'lat'] = lat
  df.loc[ind,'long'] = long

In state column there was only one data missing and looking its description we found its state was ohio . So we manually replace it with 'oh'

In [None]:
df.at[265189,'state'] = 'oh'

We can two missing description feature and since we cannot add description to it we will remove these two rows.

In [None]:
# Checking the feature with with their corresponding null values
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

laundry_options    54311
parking_options    95135
description            2
dtype: int64

Since State and Region features have many categories and turning them into one hot vectors results in high dimensional sparse vectors. So we will convert the state and region features into numerical Price features by caculating average of price of particular state and region. 

In [None]:
region_median = df.groupby('region',as_index=False)['price'].median()
state_median = df.groupby('state',as_index=False)['price'].median()

#renaming the price column as it will be same while merging to same dataframe
region_median = region_median.rename(columns={'price':'region_median'})
state_median = state_median.rename(columns={'price':'state_median'})

print(region_median[region_median['region']=='birmingham'])

        region  region_median
21  birmingham          925.0


In [None]:
# Merging the two new features of medain_region_price and median_state_price and dropping state and region column
new = df.merge(region_median,on='region',how='left')
df = new.merge(state_median,on='state',how='left')
df.drop(['region','state'],axis=1,inplace=True)
df.head(1)

Unnamed: 0,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,description,lat,long,region_median,state_median
0,1195,apartment,1908,3,2.0,1,1,1,0,0,0,laundry on site,street parking,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,925.0,820.0


In [None]:
# Storing the dataframe copy. It will be used in the last cells of this notebook
original_df = df.copy()
original_df.drop('description',axis=1,inplace=True)

In [None]:
print(original_df.shape)

(265190, 17)


In [None]:
#scaling the numerical data using standarization
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
columns_to_scale = ['sqfeet', 'lat', 'long', 'region_median', 'state_median']
df[columns_to_scale] = standardScaler.fit_transform(df[columns_to_scale])

In [None]:
#Converting type column into one hot vector 
df = pd.get_dummies(df,columns=['type'])
df.head(1)

Unnamed: 0,price,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,description,lat,long,region_median,state_median,type_apartment,type_assisted living,type_condo,type_cottage/cabin,type_duplex,type_flat,type_house,type_in-law,type_land,type_loft,type_manufactured,type_townhouse
0,1195,0.0353,3,2.0,1,1,1,0,0,0,laundry on site,street parking,Apartments In Birmingham AL Welcome to 100 Inv...,-0.668731,0.327312,-0.54759,-0.979522,1,0,0,0,0,0,0,0,0,0,0,0


In [None]:
df.shape

(265190, 29)

#For Filling Laundary and Parking Options

In [None]:
print("The percentage of missing values in laundry options:{:.2f} %".format((df[df['laundry_options'].isnull()].shape[0]/df.shape[0])*100))

The percentage of missing values in laundry options:20.48 %


In [None]:
print("The percentage of missing values in parking options:{:.2f} %".format((df[df['parking_options'].isnull()].shape[0]/df.shape[0])*100))

The percentage of missing values in parking options:35.87 %


# Using machine learning to fill the null values in Laundry_options

In [None]:
#Splitting into train and test set
df_new = df.copy()
df_new.drop(['description','parking_options'],axis=1,inplace=True)
test_laundry = df_new[df_new['laundry_options'].isnull()] # keeping null values rows of laundry_options into test set
train_laundry = df_new.drop(test_laundry.index,axis=0) 

In [None]:
train_laundry.head(2)

Unnamed: 0,price,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,lat,long,region_median,state_median,type_apartment,type_assisted living,type_condo,type_cottage/cabin,type_duplex,type_flat,type_house,type_in-law,type_land,type_loft,type_manufactured,type_townhouse
0,1195,0.0353,3,2.0,1,1,1,0,0,0,laundry on site,-0.668731,0.327312,-0.54759,-0.979522,1,0,0,0,0,0,0,0,0,0,0,0
1,1120,0.009767,3,2.0,1,1,1,0,0,0,laundry on site,-0.677046,0.321669,-0.54759,-0.979522,1,0,0,0,0,0,0,0,0,0,0,0


# Using machine learning to predict the missing values of Laundry options and parking options.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,r2_score,accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.metrics import mean_squared_error
import time
from collections import Counter
import pickle

In [None]:
#splitting train_laundry data
X_train,X_test,y_train,y_test = train_test_split(train_laundry.drop('laundry_options',axis=1),train_laundry['laundry_options'],test_size=0.2)

# Using KNN For Laundry Options

In [None]:
import time


start = time.time()
knn = KNeighborsClassifier()
parameters = {'n_neighbors':[1,3,5,9,10]}
clf = GridSearchCV(knn, parameters)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
# plot_confusion_matrix(y_test,y_pred)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
end = time.time()
print("Total execution time is : ",(end-start)/60)

                    precision    recall  f1-score   support

   laundry in bldg       0.74      0.74      0.74      5493
   laundry on site       0.80      0.82      0.81      7784
no laundry on site       0.44      0.39      0.41       515
       w/d hookups       0.82      0.82      0.82     10070
       w/d in unit       0.86      0.85      0.86     18314

          accuracy                           0.82     42176
         macro avg       0.73      0.72      0.73     42176
      weighted avg       0.82      0.82      0.82     42176

[[ 4080   416    75   235   687]
 [  414  6351    70   315   634]
 [   84    84   201    61    85]
 [  247   390    51  8292  1090]
 [  719   731    65  1209 15590]]
Total execution time is :  2.088326994578044


In [None]:
print("Best score::",clf.best_score_)
print("Best estimator::",clf.best_estimator_)
print("Best paramater::",clf.best_params_)

Best score:: 0.8086222547444516
Best estimator:: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
Best paramater:: {'n_neighbors': 1}


In [None]:
laundry_pred_knn = clf.predict(test_laundry.drop('laundry_options',axis=1))
Counter(laundry_pred_knn )

Counter({'laundry in bldg': 6137,
         'laundry on site': 9036,
         'no laundry on site': 773,
         'w/d hookups': 13962,
         'w/d in unit': 24403})

In [None]:
test_laundry['laundry_options'] = laundry_pred_knn
test_laundry

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,price,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,lat,long,region_median,state_median,type_apartment,type_assisted living,type_condo,type_cottage/cabin,type_duplex,type_flat,type_house,type_in-law,type_land,type_loft,type_manufactured,type_townhouse
10,890,-0.001893,2,2.0,0,0,1,0,0,0,w/d in unit,-0.666966,0.327369,-0.547590,-0.979522,1,0,0,0,0,0,0,0,0,0,0,0
49,1250,0.010591,3,2.0,0,0,1,0,0,0,w/d hookups,-0.680877,0.320466,-0.547590,-0.979522,1,0,0,0,0,0,0,0,0,0,0,0
62,1260,0.178046,3,2.0,1,1,1,0,0,0,laundry on site,-0.669402,0.322694,-0.547590,-0.979522,1,0,0,0,0,0,0,0,0,0,0,0
64,850,0.002181,3,1.0,0,0,1,0,0,0,w/d hookups,-0.628711,0.327358,-0.547590,-0.979522,0,0,0,0,0,0,1,0,0,0,0,0
65,1050,0.009984,4,2.0,0,0,1,0,0,0,w/d in unit,-0.628711,0.327358,-0.547590,-0.979522,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265157,855,-0.006965,2,1.0,0,0,1,0,0,0,laundry in bldg,0.467346,0.547201,-0.991098,-0.807691,1,0,0,0,0,0,0,0,0,0,0,0
265161,1155,0.001791,2,2.0,0,0,1,0,0,0,w/d in unit,0.504065,0.536078,-0.991098,-0.807691,1,0,0,0,0,0,0,0,0,0,0,0
265163,1060,-0.006272,2,2.0,0,0,1,0,0,0,laundry in bldg,0.512627,0.545859,-0.991098,-0.807691,1,0,0,0,0,0,0,0,0,0,0,0
265174,654,-0.021834,1,1.0,0,0,1,0,0,0,w/d hookups,0.511285,0.542048,-0.991098,-0.807691,1,0,0,0,0,0,0,0,0,0,0,0


# For Parking Options

In [None]:
df_new2 = df.copy()
df_new2.drop(['description','laundry_options'],axis=1,inplace=True)
test_parking = df_new2[df_new2['parking_options'].isnull()] # keeping null values rows of parking_options into test set
train_parking = df_new2.drop(test_parking.index,axis=0) 

In [None]:
#splitting train_laundry data
X_train,X_test,y_train,y_test = train_test_split(train_parking.drop('parking_options',axis=1),train_parking['parking_options'],test_size=0.2)

In [None]:
start = time.time()
knn = KNeighborsClassifier()
parameters = {'n_neighbors':[1,3,5,9,10]}
clf2 = GridSearchCV(knn, parameters)
clf2.fit(X_train,y_train)
y_pred = clf2.predict(X_test)
# plot_confusion_matrix(y_test,y_pred)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
end = time.time()
print("Total execution time is : ",(end-start)/60)

                    precision    recall  f1-score   support

   attached garage       0.74      0.72      0.73      5568
           carport       0.81      0.84      0.82      5732
   detached garage       0.63      0.66      0.65      2541
        no parking       0.68      0.65      0.67       409
off-street parking       0.88      0.87      0.88     17674
    street parking       0.61      0.61      0.61      2065
     valet parking       0.60      0.55      0.57        22

          accuracy                           0.81     34011
         macro avg       0.71      0.70      0.70     34011
      weighted avg       0.81      0.81      0.81     34011

[[ 4019   301   371    19   720   134     4]
 [  263  4808   163    10   437    50     1]
 [  278   170  1679     8   359    47     0]
 [   27    10     8   265    69    30     0]
 [  724   568   365    67 15406   541     3]
 [  121    89    61    18   509  1267     0]
 [    6     1     0     0     2     1    12]]
Total execution time 

In [None]:
print("Best score::",clf.best_score_)
print("Best estimator::",clf.best_estimator_)
print("Best paramater::",clf.best_params_)

Best score:: 0.8086222547444516
Best estimator:: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
Best paramater:: {'n_neighbors': 1}


In [None]:
parking_pred_knn = clf2.predict(test_parking.drop('parking_options',axis=1))
Counter(parking_pred_knn )

Counter({'attached garage': 11811,
         'carport': 12336,
         'detached garage': 6858,
         'no parking': 2147,
         'off-street parking': 54860,
         'street parking': 7065,
         'valet parking': 58})

In [None]:
test_parking['parking_options'] = parking_pred_knn

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:

# Index for final test set
test_rows_index  = test_laundry.index.union(test_parking.index)

# Filling the null values with our predicted values in the dataframe

In [None]:
df.iloc[list(test_parking.index),11]   = parking_pred_knn

In [None]:
df.iloc[list(test_laundry.index),10]   = laundry_pred_knn

In [None]:
df.shape

(265190, 29)

# Since we have not done full analysis and removes outliers we will just use the unscaled and categorical text data

In [None]:
df.columns

Index(['price', 'sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed',
       'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge',
       'comes_furnished', 'laundry_options', 'parking_options', 'description',
       'lat', 'long', 'region_median', 'state_median', 'type_apartment',
       'type_assisted living', 'type_condo', 'type_cottage/cabin',
       'type_duplex', 'type_flat', 'type_house', 'type_in-law', 'type_land',
       'type_loft', 'type_manufactured', 'type_townhouse'],
      dtype='object')

In [None]:
# adding the missing values in the original_df(not scaled and have categorical text data)
original_df['laundry_options'] = df['laundry_options']
original_df['parking_options'] = df['parking_options']

In [None]:
original_df

Unnamed: 0,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,lat,long,region_median,state_median
0,1195,apartment,1908,3,2.0,1,1,1,0,0,0,laundry on site,street parking,33.422600,-86.706500,925.0,820.0
1,1120,apartment,1319,3,2.0,1,1,1,0,0,0,laundry on site,off-street parking,33.375500,-86.804500,925.0,820.0
2,825,apartment,1133,1,1.5,1,1,1,0,0,0,laundry on site,street parking,33.422600,-86.706500,925.0,820.0
3,800,apartment,927,1,1.0,1,1,1,0,0,0,laundry on site,street parking,33.422600,-86.706500,925.0,820.0
4,785,apartment,1047,2,1.0,1,1,1,0,0,0,laundry on site,street parking,33.422600,-86.706500,925.0,820.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265185,0,apartment,1061,2,2.0,1,1,1,0,0,0,w/d in unit,detached garage,40.049500,-83.066900,738.0,875.0
265186,1069,apartment,1020,2,1.5,1,1,1,0,0,0,w/d hookups,detached garage,39.840800,-83.080400,738.0,875.0
265187,1507,apartment,1660,2,1.5,1,1,1,0,0,0,w/d in unit,detached garage,40.056400,-83.041700,738.0,875.0
265188,1001,apartment,1220,3,1.5,1,1,1,0,0,0,w/d hookups,off-street parking,40.045100,-82.456400,738.0,875.0


In [None]:
#saving the dataframe
original_df.to_csv('processed_data.csv',index=False)