In [91]:
import tqdm 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import r2_score, make_scorer, mean_squared_error 


In [92]:

# Read the raw train/test CSVs; the raw frames stay untouched and all
# later processing happens on the working copies.
training, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
df_train, df_test = training.copy(), test.copy()

In [93]:
# Separate the target column, tag every row with the set it came from, and
# stack both sets so preprocessing is applied to them uniformly.
target = df_train['SalePrice']
df_train = df_train.drop(columns='SalePrice').assign(training_set=True)
df_test = df_test.assign(training_set=False)
df_full = pd.concat([df_train, df_test])

In [94]:
# Remove the 'Id' column from the combined frame.
df_full = df_full.drop(columns=['Id'])

In [95]:
# corrmat = training.corr()
# plt.subplots(figsize=(10,10))
# sns.heatmap(corrmat,square=True, cmap="YlGnBu");

In [96]:
# Log-transform the target.
# It is derived from the untouched raw frame rather than from `target`
# itself, so re-running this cell cannot apply the logarithm twice.
target = np.log(training['SalePrice'])

# Percentage of missing values per column, sorted from most to least missing.
perc_na = (df_full.isnull().sum()/len(df_full))*100
ratio_na = perc_na.sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :ratio_na})


In [98]:
# Fill missing values in the int64/float64 columns with each column's median.
numeric_variables = df_full.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_medians = df_full[numeric_variables].median()
df_full[numeric_variables] = df_full[numeric_variables].fillna(numeric_medians)

In [99]:
# Fill missing values in the remaining (non-numeric, non-bool) columns with
# the literal string "None".
categorial_variables = df_full.select_dtypes(exclude=['int64', 'float64', 'bool']).columns.tolist()
df_full[categorial_variables] = df_full[categorial_variables].fillna("None")

In [100]:
# Recompute the per-column missing percentage after imputation and keep only
# the columns that still have a non-zero ratio.
perc_na = df_full.isnull().sum().div(len(df_full)).mul(100)
ratio_na = perc_na.sort_values(ascending=False)
missing_data = ratio_na.to_frame('missing_ratio')
missing_data = missing_data.loc[missing_data['missing_ratio'] != 0]


In [101]:

# One-hot encode the combined frame with pandas' default column selection.
df_full = pd.get_dummies(data=df_full)

In [102]:
# Split the combined frame back into its two sets using the boolean marker
# column, then drop the marker from both.
df_train = df_full.loc[df_full['training_set']].drop(columns='training_set')
df_test = df_full.loc[~df_full['training_set']].drop(columns='training_set')

In [103]:
# Display the shapes of both frames side by side.
df_train.shape, df_test.shape

((1460, 310), (1459, 310))

In [104]:
# Hold out part of the labelled rows for evaluation; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    target,
    random_state=42,
)

### Random Forest

In [105]:
# Random forest regressor.
# random_state is fixed so the fitted forest (and the scores reported from it)
# can be reproduced, matching the seed used for the split and the other models.
rf_model = RandomForestRegressor(
    n_estimators=110,
    max_depth=15,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)

In [106]:
# Fit the forest on the training split and predict the held-out rows.
y_predict = rf_model.fit(X_train, y_train).predict(X_test)

###  XGBoost

In [107]:
import xgboost as xgb
# XGBoost regressor with library-default hyperparameters and a fixed seed.
# NOTE(review): not fitted or referenced again in the visible cells — confirm
# it is still needed.
xgb_regressor = xgb.XGBRegressor(random_state=42)

In [108]:
# XGBoost regressor with hand-picked hyperparameters.
# Uses `random_state` / `verbosity` instead of the deprecated `seed` / `silent`
# aliases, which newer XGBoost releases warn about or ignore.
xgb_opt = xgb.XGBRegressor(
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=1.5,
    n_estimators=7500,
    random_state=42,
    verbosity=0,
)

In [109]:
# Fit the tuned XGBoost model on the training split and predict the held-out rows.
xgb_opt_predict = xgb_opt.fit(X_train, y_train).predict(X_test)

  if getattr(data, 'base', None) is not None and \


### Lasso Regressor

In [110]:
from sklearn.linear_model import Lasso
# Lasso regression with a fixed seed.
lasso_regr = Lasso(
    alpha=0.001,
    max_iter=30000,
    random_state=42,
)

In [111]:
# Fit the Lasso model on the training split and predict the held-out rows.
lasso_opt_predict = lasso_regr.fit(X_train, y_train).predict(X_test)

### Сравним

In [112]:
# R^2 and mean squared error on the held-out split for each model, computed
# on the same scale as `target`.
(rf_r2, rf_mse), (xgb_r2, xgb_mse), (lasso_r2, lasso_mse) = [
    (r2_score(y_test, predicted), mean_squared_error(y_test, predicted))
    for predicted in (y_predict, xgb_opt_predict, lasso_opt_predict)
]

In [113]:

# Collect the metrics into one table: a row per metric, a column per model.
d_i = ['R2', 'Mean Squared Error']
d = {
    'RandomForest': [rf_r2, rf_mse],
    'XGBoost': [xgb_r2, xgb_mse],
    'Lasso': [lasso_r2, lasso_mse],
}
df_results = pd.DataFrame(d, index=d_i)
df_results

Unnamed: 0,RandomForest,XGBoost,Lasso
R2,0.884738,0.886499,0.899444
Mean Squared Error,0.019959,0.019654,0.017412
