In [41]:
import pandas as pd
df = pd.read_csv("outputs/final_df.csv", low_memory=False)

In [42]:
df['calendar_updated'] = pd.to_datetime(df['calendar_updated'])
df['first_review'] = pd.to_datetime(df['first_review'])
df['last_review'] = pd.to_datetime(df['last_review'])
df['host_since'] = pd.to_datetime(df['host_since'])

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from joblib import dump, load
from lightgbm import LGBMRegressor
import torch
import torchvision.transforms as transforms
from image_reg import abb_dataset, Net

desc_tfidf = TfidfVectorizer(stop_words='english')
neigh_tfidf = TfidfVectorizer(stop_words='english')
host_tfidf = TfidfVectorizer(stop_words='english')

X = df.drop(['price', 'id', 'listing_url', 'scrape_id',
             'last_scraped',
             'picture_url', 'host_id', 'host_url', 'host_name',
             'host_thumbnail_url', 'host_picture_url', 'calendar_last_scraped',
             ], axis=1)
X['calendar_updated'] = pd.to_numeric(pd.to_datetime(X['calendar_updated']))
X['host_since'] = pd.to_numeric(pd.to_datetime(X['host_since']))
X['first_review'] = pd.to_numeric(pd.to_datetime(X['first_review']))
X['last_review'] = pd.to_numeric(pd.to_datetime(X['last_review']))

y = df['price']
y = y.str.replace('$', '', regex=False).str.replace('.', '', regex=False).str.replace(',','', regex=False).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

desc_vec = desc_tfidf.fit_transform(X_train['description'].replace(np.nan, ' '))
print('Transformed train description')
desc_vec_test = desc_tfidf.transform(X_test['description'].replace(np.nan, ' '))
print('Transformed test description')
desc_reg = LGBMRegressor(random_state=0).fit(desc_vec, y_train)
X_train['desc_pred'] = desc_reg.predict(desc_vec)
X_test['desc_pred'] = desc_reg.predict(desc_vec_test)
print('Trained desc regressor')

neigh_vec = neigh_tfidf.fit_transform(X_train['neighborhood_overview'].replace(np.nan, ' '))
print('Transformed train neighborhood')
neigh_vec_test = neigh_tfidf.transform(X_test['neighborhood_overview'].replace(np.nan, ' '))
print('Transformed test neighborhood')
neigh_reg = LGBMRegressor(random_state=0).fit(neigh_vec, y_train)
X_train['neigh_pred'] = neigh_reg.predict(neigh_vec)
X_test['neigh_pred'] = neigh_reg.predict(neigh_vec_test)
print('Trained neigh regressor')

host_vec = host_tfidf.fit_transform(X_train['host_about'].replace(np.nan, ' '))
print('Transformed train host')
host_vec_test = host_tfidf.transform(X_test['host_about'].replace(np.nan, ' '))
print('Transformed test host')
host_reg = LGBMRegressor(random_state=0).fit(host_vec, y_train)
X_train['host_pred'] = host_reg.predict(host_vec)
X_test['host_pred'] = host_reg.predict(host_vec_test)
print('Trained host regressor')

net = Net()
net.load_state_dict(torch.load('../models/cnn'))

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 4

trainset = abb_dataset(csv_file='data/faces/face_landmarks.csv',
                        root='./outputs', train=True,
                        download=True, transform=transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

testset = abb_dataset(csv_file='data/faces/face_landmarks.csv',
                    root='./outputs', train=False,
                    download=True, transform=transform)

testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)
X_train['host_pred'] = net(trainloader)
X_test['host_pred'] = net(testloader)
_, predicted = torch.max(outputs, 1)

X_train.drop(['description', 'neighborhood_overview', 'host_about'], axis=1, inplace=True)

X_test.drop(['description', 'neighborhood_overview', 'host_about'], axis=1, inplace=True)

enc = OneHotEncoder(handle_unknown='ignore')

X_train = enc.fit_transform(X_train)
X_test = enc.transform(X_test)

print('Encoded X')

reg = LGBMRegressor(random_state=0)
reg.fit(X_train, y_train)
print('Trained final model')

Transformed train description
Transformed test description
Trained desc regressor
Transformed train neighborhood
Transformed test neighborhood
Trained neigh regressor
Transformed train host
Transformed test host
Trained host regressor
Encoded X
Trained final model


In [47]:
dump(desc_reg, 'models/desc_reg.joblib')
dump(neigh_reg, 'models/neigh_reg.joblib')
dump(host_reg, 'models/host_reg.joblib')
dump(enc, 'models/enc.joblib')
dump(reg, 'models/final_reg.joblib')

['models/final_reg.joblib']

In [45]:
reg.score(X_test, y_test)

0.24254563551258135