# Импортируем библиотеки

In [None]:
import ast
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.backend import clear_session
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(42)

# Предобработка данных

In [None]:
# Load the raw dataset; the relative path is resolved against the working directory.
df = pd.read_csv('data.csv')

In [None]:
df.head()

In [None]:
df.shape

Доля пропущенных значений по столбцам (в процентах):

In [None]:
# Share of missing values in every column, in percent.
df.isna().sum() / len(df) * 100

Удаляем строки без целевой переменной:

In [None]:
# Keep only the rows whose target value is present.
df = df.loc[df['target'].notna()]

In [None]:
df.shape

Очищаем целевую переменную:

In [None]:
def target_format(target):
    """Convert a raw target string into an integer.

    Every character other than the ASCII digits 0-9 is discarded and the
    remaining digits are parsed as one base-10 number.
    """
    digits_only = re.sub('[^0-9]', '', target)
    return int(digits_only)

In [None]:
# Parse every target string into an integer, then show summary statistics.
df['target'] = df['target'].apply(target_format)
df['target'].describe()

Очищаем признаки:

In [None]:
def features_float_format(feature):
    """Parse a loosely formatted numeric feature into a float.

    Args:
        feature: Raw cell value: ``0`` (the placeholder written by
            ``fillna(0)``), a number, or a string such as ``'3 Baths'``,
            ``'1,947 sqft'``, ``'2,5'`` or ``'1 1/2 stories'``.

    Returns:
        The parsed value as a float. ``0`` is returned unchanged, and ``0``
        is also returned when no number can be extracted from the string.
    """
    if feature == 0:
        return feature
    if not isinstance(feature, str):
        # Already numeric: the text clean-up below only works on strings.
        return float(feature)

    # '1 1/2' -> '1.5' (occurs in the `stories` feature)
    feature = feature.replace('1 1/2', '1.5')
    # Remove thousands separators first ('1,947' -> '1947'); otherwise the
    # decimal-comma rule below would turn them into '1.947'.
    feature = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', feature)
    # Keep only digits and decimal marks, then normalise ',' to '.'.
    feature = re.sub(r'[^0-9,.]', '', feature)
    feature = feature.replace(',', '.')

    try:
        return float(feature)
    except ValueError:
        # Empty string or several decimal marks: no usable number.
        return 0

def status_format(status):
    """Normalise a listing status string into a compact lowercase label.

    Args:
        status: Raw status text (may be an empty string).

    Returns:
        The lowercased status with non-letters and one- or two-letter words
        removed, whitespace collapsed and no leading or trailing spaces.
        Any status starting with 'coming soon' is reduced to exactly
        'coming soon'.
    """
    status = status.lower()
    # For 'coming soon ...' statuses keep only the label and drop the date.
    if status.startswith('coming soon'):
        return 'coming soon'

    status = re.sub(r'[^a-z]', ' ', status)  # keep letters only
    status = re.sub(r'\b\w{1,2}\b', '', status)  # drop 1- and 2-letter words
    status = re.sub(r'\s+', ' ', status)  # collapse whitespace runs
    # Strip the edges so that e.g. 'A Active' and 'Active' yield the same
    # label instead of ' active' vs 'active'.
    return status.strip()

def fireplace_format(fireplace):
    """Turn a fireplace description into a 0/1 flag.

    A value equal to ``0`` is returned unchanged. Otherwise the text is
    lowercased and the result is 0 when it contains the substring 'no'
    anywhere, and 1 when it does not.
    """
    if fireplace == 0:
        return fireplace
    return 0 if 'no' in fireplace.lower() else 1

In [None]:
# Categorical columns, presumably the ones meant for one-hot encoding.
# NOTE(review): this list is not referenced by the code visible in this notebook.
dummy_features = ['status','state'] 
# Columns that are dropped from the frame once features have been extracted.
drop_features = ['street', 'mls-id', 'MlsId', 'schools', 'homeFacts', 'city', 'zipcode'] 

In [None]:
# Missing statuses become empty strings, then every status is normalised.
df['status'] = df['status'].fillna('').apply(status_format)

In [None]:
# Frequency of each raw property type.
df['propertyType'].value_counts()

Приблизительно 50% домов - 'single family', поэтому оставляем только этот признак:

In [None]:
def propertyType_format(propertyType):
    """Flag single-family listings.

    The text is lowercased and every character outside a-z is replaced by a
    space. Returns 1 when the result starts with 'single family', else 0.
    """
    normalized = re.sub('[^a-z]', ' ', propertyType.lower())
    return 1 if normalized.startswith('single family') else 0

In [None]:
# Missing property types become empty strings, then the 0/1 flag is computed.
df['propertyType'] = df['propertyType'].fillna('').apply(propertyType_format)

In [None]:
# Both pool columns hold yes/no text; turn each into a 0/1 flag
# (missing values are treated as 'no').
for pool_column in ('private pool', 'PrivatePool'):
    df[pool_column] = df[pool_column].fillna('no').apply(
        lambda pool: 1 if pool.lower() == 'yes' else 0
    )

# A listing has a pool if either source column says so; keep a single column.
df['PrivatePool'] = df['private pool'] | df['PrivatePool']
df.drop(columns=['private pool'], inplace=True)

In [None]:
# Numeric features arrive as free-form text; missing values become 0 and
# everything else is parsed into a float.
for numeric_column in ('baths', 'sqft', 'beds', 'stories'):
    df[numeric_column] = df[numeric_column].fillna(0).apply(features_float_format)

# The fireplace description is reduced to a 0/1 flag.
df['fireplace'] = df['fireplace'].fillna(0).apply(fireplace_format)

In [None]:
# Each cell holds a Python-literal dict stored as text. It is parsed with
# ast.literal_eval instead of eval: eval would execute any expression found
# in the CSV, whereas literal_eval only accepts literals.
df['homeFacts'] = df['homeFacts'].apply(ast.literal_eval)

In [None]:
def homeFacts_format(homeFacts):
    """Extract the construction year from a parsed ``homeFacts`` record.

    Args:
        homeFacts: Dict that may hold an ``'atAGlanceFacts'`` list of
            ``{'factLabel': ..., 'factValue': ...}`` dicts.

    Returns:
        The value of the first ``'Year built'`` fact as an int, or 0 when the
        list is missing or empty, has no such fact, or the value cannot be
        converted to an integer.
    """
    facts = homeFacts.get('atAGlanceFacts') or []
    # Look the fact up by label rather than assuming it is the first entry.
    for fact in facts:
        if fact.get('factLabel') == 'Year built':
            try:
                return int(fact.get('factValue'))
            except (TypeError, ValueError):
                # None, an empty string or non-numeric text.
                return 0
    return 0

In [None]:
# Derive the construction year from the parsed homeFacts records.
df['yearBuilt'] = df['homeFacts'].apply(homeFacts_format)

In [None]:
# Each cell holds a Python literal stored as text. It is parsed with
# ast.literal_eval instead of eval: eval would execute any expression found
# in the CSV, whereas literal_eval only accepts literals.
df['schools'] = df['schools'].apply(ast.literal_eval)

In [None]:
def rating_format(schools):
    """Average the school ratings attached to a listing.

    Args:
        schools: Parsed ``schools`` cell: a sequence whose first element is a
            dict with an optional ``'rating'`` list of strings such as
            ``'4/10'`` or ``'7'``.

    Returns:
        The mean of the parsed ratings, or 0 when there are none. A rating
        that ``features_float_format`` cannot parse counts as 0 in the mean.
    """
    if not schools:
        return 0
    ratings = schools[0].get('rating') or []
    if not ratings:
        # np.mean([]) would return nan and emit a RuntimeWarning.
        return 0

    # 'x/10' -> 'x'; without this the digit filter would read '4/10' as 410.
    parsed = [features_float_format(item.replace('/10', '')) for item in ratings]
    return np.mean(parsed)

In [None]:
# Mean school rating per listing; undefined means are replaced with 0.
df['rating'] = df['schools'].apply(rating_format).fillna(0)

In [None]:
def distance_format(schools):
    """Average the school distances attached to a listing.

    Args:
        schools: Parsed ``schools`` cell: a sequence whose first element is a
            dict with an optional ``'data'`` dict, which in turn may hold a
            ``'Distance'`` list of strings.

    Returns:
        The mean of the parsed distances, or 0 when there are none. A
        distance that ``features_float_format`` cannot parse counts as 0.
    """
    if not schools:
        return 0
    school_data = schools[0].get('data') or {}
    distances = school_data.get('Distance') or []
    if not distances:
        # np.mean([]) would return nan and emit a RuntimeWarning.
        return 0
    return np.mean([features_float_format(item) for item in distances])

In [None]:
# Mean distance to schools per listing; undefined means are replaced with 0.
df['distance'] = df['schools'].apply(distance_format).fillna(0)

In [None]:
df.drop(drop_features, axis=1, inplace=True)  # drop the columns (axis=1, not rows) that are no longer needed

In [None]:
# One-hot encode each categorical column and remove the original column.
for categorical in ('status', 'state'):
    df = pd.concat([df, pd.get_dummies(df[categorical])], axis=1)
    df.drop(categorical, axis=1, inplace=True)

# Статистический анализ

In [None]:
df.head()

In [None]:
df.describe(include='all')

In [None]:
# Pairwise correlation matrix of the frame's columns.
corr = df.corr()

# Diverging seaborn palette, returned as a matplotlib colormap.
cmap = sns.diverging_palette(5, 250, as_cmap=True)

def magnify():
    """Return CSS table styles: small header font, zero cell padding and
    enlarged text for hovered headers and cells."""
    return [
        {"selector": "th", "props": [("font-size", "7pt")]},
        {"selector": "td", "props": [('padding', "0em 0em")]},
        {"selector": "th:hover", "props": [("font-size", "12pt")]},
        {
            "selector": "tr:hover td:hover",
            "props": [('max-width', '200px'), ('font-size', '12pt')],
        },
    ]

# Render the correlation matrix as a colour-shaded table.
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is its replacement.
corr.style.background_gradient(cmap, axis=1)\
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})\
    .set_caption("Hover to magify")\
    .format(precision=2)\
    .set_table_styles(magnify())

In [None]:
# Histogram of log(target + 1) with 15 bins.
np.log(df['target']+1).hist(bins = 15)

In [None]:
# Histogram of log(sqft + 1) with 15 bins.
np.log(df['sqft']+1).hist(bins = 15)

In [None]:
# Histogram of log(beds + 1) with 15 bins.
np.log(df['beds']+1).hist(bins = 15)

In [None]:
# Histogram of log(stories + 1) with 15 bins.
np.log(df['stories']+1).hist(bins = 15)

In [None]:
# Histogram of the mean school rating (default binning).
df['rating'].hist()

In [None]:
# Histogram of log(distance + 1) (default binning).
np.log(df['distance']+1).hist()

In [None]:
# First ten entries of the target's correlations, sorted in descending order.
df.corr()['target'].sort_values(ascending=False).head(10)

In [None]:
# Scatter plot of the target against the mean school rating.
plt.scatter(df['rating'],df['target'])

# Построение моделей

Разобьём данные на обучающую и тестовую выборки:

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['target']),
    df['target'],
    test_size=0.2,
    random_state=42,
)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

Сделаем функцию для оценки точности предсказаний:

In [None]:
def evaluate(clf, X_train, y_train, X_test, y_test):
    """Print fit-quality metrics of a fitted model for both data splits.

    For the train and the test set this prints the RMSE divided by the mean
    of the true values, followed by ``clf.score`` on each set.
    """
    predicted_train = clf.predict(X_train)
    predicted_test = clf.predict(X_test)

    # RMSE relative to the mean of the true values
    rmse_train = np.sqrt(mean_squared_error(y_train, predicted_train))
    print('train rmse / mean:', rmse_train / np.mean(y_train))
    rmse_test = np.sqrt(mean_squared_error(y_test, predicted_test))
    print('test rmse / mean:', rmse_test / np.mean(y_test))

    # Value reported by the model's own score method
    score_train = clf.score(X_train, y_train)
    print('train R^2', score_train)
    score_test = clf.score(X_test, y_test)
    print('test R^2', score_test)

In [None]:
# Baseline: linear regression fitted on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
evaluate(lr,X_train, y_train, X_test, y_test)

Коэффициент детерминации ниже 0.1, а значит модель очень плохо предсказывает результат. Попробуем различные деревья решений:

In [None]:
# Decision tree limited to depth 7, with a fixed seed.
tree = DecisionTreeRegressor(max_depth  = 7 ,random_state = 42)
tree.fit(X_train,y_train)

In [None]:
evaluate(tree,X_train, y_train, X_test, y_test)

In [None]:
# Decision tree limited to depth 8: fit on the training split and print its metrics.
tree8 = DecisionTreeRegressor(max_depth  = 8 ,random_state = 42)
tree8.fit(X_train,y_train)
evaluate(tree8,X_train, y_train, X_test, y_test)

In [None]:
# Decision tree limited to depth 6: fit on the training split and print its metrics.
tree6 = DecisionTreeRegressor(max_depth  = 6 ,random_state = 42)
tree6.fit(X_train,y_train)
evaluate(tree6,X_train, y_train, X_test, y_test)

Итог: при максимальной глубине меньше 7 модель недообучается, а при максимальной глубине больше 7 - переобучается. 

Попробуем различные ансамбли:

In [None]:
# Bagging ensemble of 8 estimators built from `tree`.
# random_state is fixed so the result does not depend on the state of the
# global NumPy generator, i.e. on which cells ran before this one.
bagging_trees = BaggingRegressor(tree, n_estimators=8, random_state=42)
bagging_trees.fit(X_train, y_train)

In [None]:
evaluate(bagging_trees,X_train, y_train, X_test, y_test)

In [None]:
# Random forest with 8 trees and a fixed seed.
random_forest = RandomForestRegressor(n_estimators = 8, random_state = 42)
random_forest.fit(X_train,y_train)

In [None]:
evaluate(random_forest,X_train, y_train, X_test, y_test)

In [None]:
# AdaBoost ensemble of 8 estimators built from `tree`.
# random_state is fixed so the result does not depend on the state of the
# global NumPy generator, i.e. on which cells ran before this one.
adaboost = AdaBoostRegressor(tree, n_estimators=8, random_state=42)
adaboost.fit(X_train, y_train)

In [None]:
evaluate(adaboost,X_train, y_train, X_test, y_test)

Итог: random forest справляется лучше всех, имея коэффициент детерминации 0.566 на тестовой выборке.