In [1]:
# Import the required libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm 
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets.samples_generator import make_blobs

In [2]:
#Load in South American Real Estate "For Sale" Listing Data Set and drop Unamed index columns to create data set 'df'
df1 = pd.read_csv('Imputed_Dataset.csv')
df1.drop(df1.columns[df1.columns.str.contains('Unnamed',case = False)],axis = 1, inplace = True)
df1.dropna()
df1=df1.reset_index(drop=True)
df1.info()
df1.dtypes
df1.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463724 entries, 0 to 463723
Data columns (total 20 columns):
lat                409210 non-null float64
lon                409210 non-null float64
rooms              463724 non-null float64
bedrooms           463724 non-null float64
bathrooms          463724 non-null float64
surface_total      463724 non-null float64
surface_covered    463724 non-null float64
price              463723 non-null float64
log_price          463724 non-null float64
id                 463723 non-null object
start_date         463723 non-null object
end_date           463723 non-null object
created_on         463723 non-null object
property_type      463723 non-null object
l1                 463723 non-null object
l2                 463723 non-null object
l3                 441216 non-null object
title              463723 non-null object
description        463717 non-null object
price_class        463723 non-null object
dtypes: float64(9), object(11)
memory us

Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,log_price,id,start_date,end_date,created_on,property_type,l1,l2,l3,title,description,price_class
0,,,4.0,3.0,2.0,198.0,150.0,385000.0,12.860999,EWeY8jVhb8ielLeKtfibVw==,2019-09-15,2020-01-20,2019-09-15,Casa,Argentina,Bs.As. G.B.A. Zona Norte,Tigre,"Venta. Casa en una planta, 4 amb. con galería ...",MAM.(2) Hermosa propiedad en Barrio San Franci...,High
1,,,4.0,3.0,2.0,198.0,150.0,385000.0,12.860999,IrEeG8ewIcfLVGSb14kH1w==,2019-09-15,2020-01-20,2019-09-15,Casa,Argentina,Bs.As. G.B.A. Zona Norte,Tigre,Venta. Casa en una planta.Lote interno. Barrio...,MAM. Hermosa propiedad en Barrio San Francisco...,High
2,-27.371199,-55.898454,7.0,3.0,2.0,173.0,173.0,195000.0,12.180755,hPuiyAjuBI92uEQdFoY4Fw==,2019-09-15,1970-01-01,2019-09-15,Casa,Argentina,Misiones,Posadas,Casa - Posadas,Inmueble centrico ideal para Local comercial ...,Average
3,,,3.0,3.0,1.0,49.0,40.0,85000.0,11.350407,9vIWv494LqC6crAkfpLeDg==,2019-09-15,2020-03-03,2019-09-15,Casa,Argentina,Santa Fe,Rosario,Cabaña en la Isla sobre riacho Los Marinos - V...,Lote de 20 metros de frente por 130 metros de ...,Low
4,-32.951146,-60.571979,3.0,3.0,1.0,49.0,40.0,85000.0,11.350407,aydrtgoZ/frzXDcV20McuA==,2019-09-15,2020-03-03,2019-09-15,Casa,Argentina,Entre Ríos,Victoria,Cabaña en la Isla sobre riacho Los Marinos - V...,Lote de 20 metros de frente por 130 metros de ...,Low


In [3]:
# Check on property types: there are 12 unique property types, including missing values, which will be dropped from analysis.
df1.property_type.unique()

array(['Casa', 'Departamento', 'Local comercial', 'PH', 'Lote', 'Otro',
       'Garaje', 'Oficina', 'Depósito', 'Casa de campo', 'Finca',
       'Parqueadero', nan], dtype=object)

In [4]:
# Check on country types: there are 5 unique South American countries and missing vlaues, which will be excluded from analysis.
df1.l1.unique()

array(['Argentina', 'Uruguay', 'Colombia', 'Ecuador', 'Perú', nan],
      dtype=object)

In [5]:
# In dataset df, drop specific columns that is not needed in Linear Regression
# Create data set 'df1' without remaining attributes
df1 = df1.drop(['lat', 'lon', 'start_date','end_date', 'created_on','l2','l3', 'title','description','id','log_price'], axis=1)
df1.head(5)

Unnamed: 0,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,property_type,l1,price_class
0,4.0,3.0,2.0,198.0,150.0,385000.0,Casa,Argentina,High
1,4.0,3.0,2.0,198.0,150.0,385000.0,Casa,Argentina,High
2,7.0,3.0,2.0,173.0,173.0,195000.0,Casa,Argentina,Average
3,3.0,3.0,1.0,49.0,40.0,85000.0,Casa,Argentina,Low
4,3.0,3.0,1.0,49.0,40.0,85000.0,Casa,Argentina,Low


In [6]:
#Defining variable data types process data for SVM modeling
string_vars = ['price_class','property_type','l1']
df1[string_vars] = df1[string_vars].astype(str)

In [7]:
# Get Dummies Variables process property type values for SVM modeling
dummies = pd.get_dummies(df1.property_type)
dummies

Unnamed: 0,Casa,Casa de campo,Departamento,Depósito,Finca,Garaje,Local comercial,Lote,Oficina,Otro,PH,Parqueadero,nan
0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
463719,0,0,1,0,0,0,0,0,0,0,0,0,0
463720,0,0,0,0,0,0,0,0,0,1,0,0,0
463721,0,0,1,0,0,0,0,0,0,0,0,0,0
463722,0,0,1,0,0,0,0,0,0,0,0,0,0


In [8]:
# Join dummies dataset with original dataset to create a combined data set, 'mergedf1'
mergedf1 = pd.concat([df1,dummies], axis = 'columns')
mergedf1 = mergedf1.drop(['property_type'], axis = 1)
mergedf1

Unnamed: 0,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,l1,price_class,Casa,Casa de campo,...,Depósito,Finca,Garaje,Local comercial,Lote,Oficina,Otro,PH,Parqueadero,nan
0,4.0,3.0,2.0,198.0,150.0,385000.0,Argentina,High,1,0,...,0,0,0,0,0,0,0,0,0,0
1,4.0,3.0,2.0,198.0,150.0,385000.0,Argentina,High,1,0,...,0,0,0,0,0,0,0,0,0,0
2,7.0,3.0,2.0,173.0,173.0,195000.0,Argentina,Average,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,3.0,1.0,49.0,40.0,85000.0,Argentina,Low,1,0,...,0,0,0,0,0,0,0,0,0,0
4,3.0,3.0,1.0,49.0,40.0,85000.0,Argentina,Low,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463719,2.0,2.0,1.0,63.0,62.0,210000.0,Uruguay,High,0,0,...,0,0,0,0,0,0,0,0,0,0
463720,4.0,2.0,2.0,255.0,128.0,150000.0,Uruguay,Low,0,0,...,0,0,0,0,0,0,1,0,0,0
463721,4.0,3.0,2.0,114.0,114.0,98500.0,Uruguay,Average,0,0,...,0,0,0,0,0,0,0,0,0,0
463722,2.0,2.0,1.0,54.0,45.0,105000.0,Uruguay,Average,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Create dummy variable for country values and process data for SVM modeling
dummies2 = pd.get_dummies(mergedf1.l1)
dummies2

Unnamed: 0,Argentina,Colombia,Ecuador,Perú,Uruguay,nan
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
...,...,...,...,...,...,...
463719,0,0,0,0,1,0
463720,0,0,0,0,1,0
463721,0,0,0,0,1,0
463722,0,0,0,0,1,0


In [10]:
# Join dummies dataset with original dataset, delete unnecessary variables l1 and price to create data set 'mergedf2'
mergedf2 = pd.concat([mergedf1,dummies2], axis = 'columns')
mergedf2 = mergedf2.drop(['l1','price'], axis = 1)
mergedf2 = mergedf2.dropna()
mergedf2

Unnamed: 0,rooms,bedrooms,bathrooms,surface_total,surface_covered,price_class,Casa,Casa de campo,Departamento,Depósito,...,Otro,PH,Parqueadero,nan,Argentina,Colombia,Ecuador,Perú,Uruguay,nan.1
0,4.0,3.0,2.0,198.0,150.0,High,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,4.0,3.0,2.0,198.0,150.0,High,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,7.0,3.0,2.0,173.0,173.0,Average,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,3.0,3.0,1.0,49.0,40.0,Low,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,3.0,3.0,1.0,49.0,40.0,Low,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463719,2.0,2.0,1.0,63.0,62.0,High,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
463720,4.0,2.0,2.0,255.0,128.0,Low,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
463721,4.0,3.0,2.0,114.0,114.0,Average,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
463722,2.0,2.0,1.0,54.0,45.0,Average,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Create random sample of 31,000 rows for data set mergedf2
mergedf2 = mergedf2.sample(n=31000,random_state=6)

In [12]:
# Assign arrays test SVM models using data set mergedf2
X= mergedf2.drop(['price_class'], axis = 1)
X=np.asarray(X)
y=np.asarray(mergedf2['price_class'])
X.shape

(31000, 24)

In [13]:
#Split data into 80:20 train and test data sets, and normalize the data for faster SVM model processing time
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=6)
X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)

In [31]:
# Build SVM Linear model to classify price_class (Low, Average, High) using attributes:  total room, bedrooms, bathrooms,surface total
# surface covered, and property types
from sklearn.svm import SVC
classifier = svm.SVC(kernel='linear', gamma = 'auto', C=2)
classifier.fit(X_train, y_train)
y_predict=classifier.predict(X_test)

In [32]:
print (classification_report(y_test, y_predict))

              precision    recall  f1-score   support

     Average       0.40      0.60      0.48      2098
        High       0.56      0.49      0.52      2095
         Low       0.52      0.32      0.39      2007

    accuracy                           0.47      6200
   macro avg       0.49      0.47      0.47      6200
weighted avg       0.49      0.47      0.47      6200



In [34]:
# Build SVM 'rbf' model to classify price_class (Low, Average, High) using attributes: total rooms, bedrooms, bathrooms, surface total,
# surface covered, and property types.
from sklearn.svm import SVC
classifier2 = svm.SVC(kernel ='rbf', gamma = 'auto', C=2)
classifier2.fit(X_train,y_train)
y_predict2=classifier2.predict(X_test)
print (classification_report(y_test, y_predict2))

              precision    recall  f1-score   support

     Average       0.42      0.54      0.47      2098
        High       0.59      0.49      0.54      2095
         Low       0.49      0.43      0.46      2007

    accuracy                           0.49      6200
   macro avg       0.50      0.49      0.49      6200
weighted avg       0.50      0.49      0.49      6200



In [15]:
# Build SVM 'sigmoid' model to classify price_class (Low, Average, High) using attributes: total rooms, bedrooms, bathrooms, surface total,
# surface covered, and property types.
from sklearn.svm import SVC
classifier3 = svm.SVC(kernel ='sigmoid', gamma='auto', C= 2)
classifier3.fit(X_train,y_train), 
y_predict3=classifier3.predict(X_test)
print (classification_report(y_test, y_predict3))

              precision    recall  f1-score   support

     Average       0.31      0.14      0.19      2098
        High       0.43      0.45      0.44      2095
         Low       0.37      0.56      0.44      2007

    accuracy                           0.38      6200
   macro avg       0.37      0.38      0.36      6200
weighted avg       0.37      0.38      0.36      6200



In [16]:
# Build SVM 'poly' model to classify price_class(Low, Average, High) using attributes: total rooms, bedrooms, bathrooms, surface total,
# surface covered, and property types.
from sklearn.svm import SVC
classifier4 = svm.SVC(kernel ='poly', degree = 2, gamma='auto', C= 2)
classifier4.fit(X_train,y_train), 
y_predict4=classifier4.predict(X_test)
print (classification_report(y_test, y_predict4))

              precision    recall  f1-score   support

     Average       0.40      0.63      0.49      2098
        High       0.57      0.33      0.42      2095
         Low       0.47      0.38      0.42      2007

    accuracy                           0.45      6200
   macro avg       0.48      0.45      0.44      6200
weighted avg       0.48      0.45      0.44      6200

