In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#split training and test
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import OneHotEncoder

In [26]:
data = pd.read_csv("data.csv")
data.info()
if not isinstance(data, pd.DataFrame):
    raise TypeError("Expected 'data' to be a pandas DataFrame")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [28]:
# Setting up fields
columns = data.columns.tolist()
features = np.array([col for col in columns if col != 'price'])

# Splitting data
x_features = data[features]
x_target = data['price']


# Handle missing values
x_features = x_features.ffill()
x_target = x_target.fillna(x_target.mean())

# Identify categorical columns
categorical_cols = x_features.select_dtypes(include=['object', 'category']).columns.tolist()

# Identify date columns and convert them to numerical
date_cols = x_features.select_dtypes(include=['datetime', 'datetime64']).columns.tolist()
for date_col in date_cols:
    x_features[date_col] = pd.to_datetime(x_features[date_col]).astype(int) / 10**9  # Convert to seconds since epoch

# Apply OneHotEncoder to categorical columns
ohe = OneHotEncoder(sparse_output=False, drop='first')  # Dropping the first category to avoid multicollinearity
encoded_categorical = ohe.fit_transform(x_features[categorical_cols])

# Convert encoded columns to DataFrame and set proper column names
encoded_categorical_df = pd.DataFrame(encoded_categorical, columns=ohe.get_feature_names_out(categorical_cols))

# Drop original categorical columns and concatenate the encoded columns
x_features = x_features.drop(columns=categorical_cols)
x_features = pd.concat([x_features, encoded_categorical_df], axis=1)

In [29]:
#Split training from test
x_train,x_test,y_train,y_test = tts(x_features,x_target , test_size=0.3,random_state=5)

In [46]:
from sklearn.linear_model import LinearRegression 

# Train the model using LinearRegression
clf = LinearRegression()
clf.fit(x_train, y_train)

# Evaluate the model
accuracy = clf.score(x_test, y_test)
print("Linear Regression Accuracy: {} %".format(int(round(accuracy * 100))))

Linear Regression Accuracy: -16 %


In [47]:
from sklearn.linear_model import Ridge

# Train the model using Ridge regression
clf1 = Ridge()
clf1.fit(x_train, y_train)

# Evaluate the Ridge regression model
ridge_accuracy = clf1.score(x_test, y_test)
print("Ridge Regression Accuracy: {} %".format(int(round(ridge_accuracy * 100))))

Ridge Regression Accuracy: 62 %


In [48]:
from sklearn.ensemble import RandomForestRegressor

# Train the model using RandomForest regression
clf2 = RandomForestRegressor(random_state=5)
clf2.fit(x_train, y_train)

# Evaluate the RandomForest regression model
rf_accuracy = clf2.score(x_test, y_test)
print("Random Forest Regression Accuracy: {} %".format(int(round(rf_accuracy * 100))))

Random Forest Regression Accuracy: 66 %


In [49]:
from sklearn.tree import DecisionTreeRegressor

# Train the model using DecisionTree regression
clf4 = DecisionTreeRegressor(random_state=5)
clf4.fit(x_train, y_train)

# Evaluate the DecisionTree regression model
dt_accuracy = clf4.score(x_test, y_test)
print("Decision Tree Regression Accuracy: {} %".format(int(round(dt_accuracy * 100))))

Decision Tree Regression Accuracy: 46 %


In [50]:
from sklearn.neighbors import KNeighborsRegressor

# Train the model using KNeighbors regression
clf5 = KNeighborsRegressor()
clf5.fit(x_train, y_train)

# Evaluate the KNeighbors regression model
knn_accuracy = clf5.score(x_test, y_test)
print("KNeighbors Regression Accuracy: {} %".format(int(round(knn_accuracy * 100))))

KNeighbors Regression Accuracy: -16 %
