In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None) # Show all columns
#pd.set_option('display.max_rows', None)     # Show all rows
pd.set_option('display.max_colwidth', None) # Show full width of column content

df = pd.read_csv("../data/processed/clean_house_data.csv")
df.head()


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country,sale_year,sale_month
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA,2014,5
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA,2014,5
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA,2014,5
3,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA,2014,5
4,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA,2014,5


In [None]:
from sklearn.preprocessing import LabelEncoder

le_city = LabelEncoder()
df['city_encoded'] = le_city.fit_transform(df['city'])

# Optional: encode statezip too
le_state = LabelEncoder()
df['country_encoded'] = le_state.fit_transform(df['country'])

# Drop original categorical columns
#df = df.drop(columns=['city', 'country' , 'statezip' ,'street'])


In [7]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country,sale_year,sale_month,city_encoded,country_encoded
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA,2014,5,36,0
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA,2014,5,35,0
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA,2014,5,18,0
3,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA,2014,5,3,0
4,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA,2014,5,31,0


In [8]:
X = df.drop(columns=['city', 'country' , 'statezip' ,'street'])

y = df['price']

X.shape, y.shape

((4600, 17), (4600,))

In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   price            4600 non-null   float64
 1   bedrooms         4600 non-null   float64
 2   bathrooms        4600 non-null   float64
 3   sqft_living      4600 non-null   int64  
 4   sqft_lot         4600 non-null   int64  
 5   floors           4600 non-null   float64
 6   waterfront       4600 non-null   int64  
 7   view             4600 non-null   int64  
 8   condition        4600 non-null   int64  
 9   sqft_above       4600 non-null   int64  
 10  sqft_basement    4600 non-null   int64  
 11  yr_built         4600 non-null   int64  
 12  yr_renovated     4600 non-null   int64  
 13  sale_year        4600 non-null   int64  
 14  sale_month       4600 non-null   int64  
 15  city_encoded     4600 non-null   int64  
 16  country_encoded  4600 non-null   int64  
dtypes: float64(4),

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3680, 17), (920, 17), (3680,), (920,))

In [15]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score




models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    
    results.append([name, train_r2, test_r2, test_mae])

results_df = pd.DataFrame(
    results,
    columns=["Model", "Train R2", "Test R2", "Test MAE"]
)

results_df

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.