In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training and test splits from CSV files in the working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [3]:
# Display the column names available in the training frame.
df_train.columns

Index(['address', 'sold_price', 'summary', 'type', 'year_built', 'heating',
       'cooling', 'parking', 'bedrooms', 'bathrooms', 'full_bathrooms',
       'total_interior_livable_area', 'total_spaces', 'garage_spaces',
       'region', 'elementary_school', 'elementary_school_score',
       'elementary_school_distance', 'middle_school', 'middle_school_score',
       'middle_school_distance', 'high_school', 'high_school_score',
       'high_school_distance', 'flooring', 'heating_features',
       'cooling_features', 'appliances_included', 'laundry_features',
       'parking_features', 'tax_assessed_value', 'annual_tax_amount',
       'listed_on', 'listed_price', 'last_sold_on', 'last_sold_price', 'city',
       'zip', 'state', 'id'],
      dtype='object')

In [4]:
# Columns fed to the model as numeric features (order defines the feature order).
numeric_cols = [
    'bathrooms',
    'full_bathrooms',
    'total_interior_livable_area',
    'total_spaces',
    'garage_spaces',
    'elementary_school_score',
    'elementary_school_distance',
    'middle_school_score',
    'middle_school_distance',
    'high_school_score',
    'high_school_distance',
    'tax_assessed_value',
    'listed_price',
    'last_sold_price',
    'year_built',
    'annual_tax_amount',
]

# Column holding the regression target.
target_cols = ['sold_price']
# Column holding the row identifier.
id_cols = ['id']

Start with numeric columns only:

In [5]:
# Impute missing numeric values with the per-column mean of the TRAINING data.
# The same training means fill the test set, so both splits share one
# imputation rule and no statistics are estimated from the test data.
numeric_means = df_train[numeric_cols].mean(axis=0)
train_num_df = df_train[numeric_cols].fillna(numeric_means)
test_num_df = df_test[numeric_cols].fillna(numeric_means)

In [6]:
# Convert the imputed frames to plain NumPy arrays for scikit-learn.
X_train = train_num_df.to_numpy()
# Selecting with a list of columns keeps the target two-dimensional.
Y_train = df_train[target_cols].to_numpy()
X_test = test_num_df.to_numpy()

Standardize the numeric features:

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
# Learn the per-feature mean and standard deviation from the training data only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# Apply the training statistics to the test data. Re-fitting on the test set
# would scale it differently from the data the model is trained on.
X_test = sc.transform(X_test)

One-hot encode cities:

In [9]:
# Take the city column of each split as text, then stack train above test so
# both splits can be encoded against one shared set of categories.
cities_train, cities_test = (
    frame[["city"]].astype(str) for frame in (df_train, df_test)
)
all_cities = pd.concat((cities_train, cities_test))

In [10]:
# One-hot encode the combined city column, then split the rows back by
# position: the first len(cities_train) rows are training, the rest are test.
n_train_rows = len(cities_train)
ohe_cities = pd.get_dummies(all_cities[["city"]])
train_ohe_cities = ohe_cities.iloc[:n_train_rows]
test_ohe_cities = ohe_cities.iloc[n_train_rows:]

In [11]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [12]:
# Append the one-hot city columns to the right of the scaled numeric features.
X_train_city = np.concatenate([X_train, train_ohe_cities.values], axis=1)
X_test_city = np.concatenate([X_test, test_ohe_cities.values], axis=1)

Let's pick a linear model:

In [13]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [14]:
# Lasso regression with a very small L1 penalty. max_iter must be an integer:
# scikit-learn documents it as int, and releases that validate parameters
# reject a float such as 1e4.
lr_model = Lasso(alpha=1e-5, max_iter=10_000)

# 3-fold cross-validated RMSE. The scorer returns negated RMSE, so the sign is
# flipped after averaging over the folds.
cv_neg_rmse = cross_val_score(
    lr_model,
    X_train_city,
    Y_train,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
print(-np.mean(cv_neg_rmse))

0.25262041470910473


In [15]:
# Fit on the full training set and predict on the same rows (fit returns the
# estimator itself, so the calls can be chained).
Y_train_pred = lr_model.fit(X_train_city, Y_train).predict(X_train_city)

# Training RMSE, shown as the cell output.
train_mse = mean_squared_error(Y_train, Y_train_pred)
np.sqrt(train_mse)

0.2390567183136403

In [16]:
# Predict the target for the test feature matrix with the fitted model.
Y_pred = lr_model.predict(X_test_city)

In [18]:
# Build the submission frame: test ids plus the predicted sold_price, written
# to CSV without the index column.
df_submit = df_test[["id"]].assign(sold_price=Y_pred)
df_submit.to_csv("sample_submission.csv", index=False)