In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [3]:
def load_data(file_path):
    return pd.read_csv(file_path)

df = load_data("data.csv") 


In [4]:
print(f"Number of Rows: {df.shape[0]} \nNumber of Columns: {df.shape[1]}")
print("-- Data Overview --")



Number of Rows: 4600 
Number of Columns: 18
-- Data Overview --


In [5]:
print(df.head(2))

            date      price  bedrooms  bathrooms  sqft_living  sqft_lot  \
0  5/2/2014 0:00   313000.0       3.0        1.5       1340.0    7912.0   
1  5/2/2014 0:00  2384000.0       5.0        2.5       3650.0    9050.0   

   floors  waterfront  view  condition  sqft_above  sqft_basement  yr_built  \
0     1.5         0.0   0.0          3      1340.0            0.0    1955.0   
1     2.0         0.0   4.0          5      3370.0          280.0    1921.0   

   yr_renovated                street       city  statezip country  
0          2005  18810 Densmore Ave N  Shoreline  WA 98133     USA  
1             0       709 W Blaine St    Seattle  WA 98119     USA  


In [6]:
print(df.tail(2))

                date     price  bedrooms  bathrooms  sqft_living  sqft_lot  \
4598  7/10/2014 0:00  203400.0       4.0        2.0       2090.0    6630.0   
4599  7/10/2014 0:00  220600.0       3.0        2.5       1490.0    8102.0   

      floors  waterfront  view  condition  sqft_above  sqft_basement  \
4598     1.0         0.0   0.0          3      1070.0         1020.0   
4599     2.0         0.0   0.0          4      1490.0            0.0   

      yr_built  yr_renovated             street       city  statezip country  
4598    1974.0             0  5148 S Creston St    Seattle  WA 98178     USA  
4599    1990.0             0  18717 SE 258th St  Covington  WA 98042     USA  


In [7]:
print(df.describe())

              price     bedrooms    bathrooms   sqft_living      sqft_lot  \
count  4.597000e+03  4595.000000  4598.000000   4598.000000  4.598000e+03   
mean   5.521190e+05     3.401088     2.160668   2139.498913  1.485607e+04   
std    5.639834e+05     0.908983     0.783920    963.387024  3.589177e+04   
min    0.000000e+00     0.000000     0.000000    370.000000  6.380000e+02   
25%    3.238333e+05     3.000000     1.750000   1460.000000  5.001000e+03   
50%    4.610000e+05     3.000000     2.250000   1980.000000  7.683000e+03   
75%    6.550000e+05     4.000000     2.500000   2620.000000  1.100375e+04   
max    2.659000e+07     9.000000     8.000000  13540.000000  1.074218e+06   

            floors   waterfront         view    condition   sqft_above  \
count  4598.000000  4595.000000  4598.000000  4600.000000  4597.000000   
mean      1.511853     0.007182     0.240757     3.451739  1827.509463   
std       0.537905     0.084449     0.778558     0.677230   862.138540   
min       

In [8]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4597 non-null   float64
 2   bedrooms       4595 non-null   float64
 3   bathrooms      4598 non-null   float64
 4   sqft_living    4598 non-null   float64
 5   sqft_lot       4598 non-null   float64
 6   floors         4598 non-null   float64
 7   waterfront     4595 non-null   float64
 8   view           4598 non-null   float64
 9   condition      4600 non-null   int64  
 10  sqft_above     4597 non-null   float64
 11  sqft_basement  4597 non-null   float64
 12  yr_built       4594 non-null   float64
 13  yr_renovated   4600 non-null   int64  
 14  street         4597 non-null   object 
 15  city           4596 non-null   object 
 16  statezip       4597 non-null   object 
 17  country        4594 non-null   object 
dtypes: float

In [None]:
def fill_na_mode(columns):
    for col in columns:
        df[col] = df[col].fillna(df[col].mode()[0]) 

categorical_columns = ['street', 'city', 'statezip', 'country']
fill_na_mode(categorical_columns)

In [10]:
def fill_na_mean(columns):
    for col in columns:
        df[col] = df[col].fillna(df[col].mean())  

numerical_columns = ['bedrooms', 'bathrooms', 'floors', 'waterfront', 'view', 'yr_built']
fill_na_mode(numerical_columns)

In [11]:
float_columns = ['price', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']
fill_na_mean(float_columns)  

In [12]:
def convert_to_int(columns):
    for col in columns:
        df[col] = df[col].astype('int64')
int_columns = ['bedrooms', 'bathrooms', 'floors', 'waterfront', 'view', 'yr_built', 'price',
               'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']
convert_to_int(int_columns)

In [13]:
df = df.drop(columns=['street', 'country'], axis=1)

In [14]:
def encode_columns(columns):
    le = LabelEncoder()
    for col in columns:
        df[col] = le.fit_transform(df[col])

encode_columns(['city', 'statezip'])

In [15]:
X = df.drop('price', axis=1) 
y = df['price'] 

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
def dataEncoder(cols):
    for i in cols:
        dataLabelEncoder = LabelEncoder()
        df[i] = dataLabelEncoder.fit_transform(df[i])

columns = ['city','statezip']
dataEncoder(columns)
df.to_csv(r'encoded-data.csv', index = False, header = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           4600 non-null   object
 1   price          4600 non-null   int64 
 2   bedrooms       4600 non-null   int64 
 3   bathrooms      4600 non-null   int64 
 4   sqft_living    4600 non-null   int64 
 5   sqft_lot       4600 non-null   int64 
 6   floors         4600 non-null   int64 
 7   waterfront     4600 non-null   int64 
 8   view           4600 non-null   int64 
 9   condition      4600 non-null   int64 
 10  sqft_above     4600 non-null   int64 
 11  sqft_basement  4600 non-null   int64 
 12  yr_built       4600 non-null   int64 
 13  yr_renovated   4600 non-null   int64 
 14  city           4600 non-null   int64 
 15  statezip       4600 non-null   int64 
dtypes: int64(15), object(1)
memory usage: 575.1+ KB


In [23]:
trainData, testData = train_test_split(df, test_size=0.2, shuffle=False)
trainData.shape
testData.shape

(920, 16)

In [25]:
train_x = trainData.iloc()[:, 2:]
test_x  = testData.iloc()[:, 2:]

In [26]:
train_y = trainData.iloc()[:, 2]
test_y  = testData.iloc()[:, 2]

In [27]:
train_x.head(2)
train_y.head(2)
test_x.head(2)
test_y.head(2)

3680    4
3681    3
Name: bedrooms, dtype: int64

In [28]:
model_svc = SVC()
model_svc.fit(train_x, train_y)

print(model_svc)

SVC()


In [30]:
pickle.dump(model_svc, open('model_svc.pkl', 'wb'))
model_svc = pickle.load(open('model_svc.pkl', 'rb'))
model_predictions = model_svc.predict(test_x)
model_accuracy_score = accuracy_score(test_y, model_predictions)

In [32]:
print(round(model_accuracy_score,3))
testdata_predict = testData.copy(deep=True)
pd.options.mode.chained_assignment = None

0.489


In [33]:
testdata_predict['Prediction'] = model_predictions
model_accuracy_score = accuracy_score(testdata_predict['price'], testdata_predict['Prediction'])

In [34]:
print("-- Model Accuracy Score: ", end='')
print(round(model_accuracy_score,3))

-- Model Accuracy Score: 0.0
