In [None]:
import math
import pandas as pd
import numpy as np
from scipy.stats import stats, norm, skew
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasRegressor
from scipy.special import boxcox1p
import lightgbm as lgb
import xgboost as xgb
from fastprogress import master_bar, progress_bar
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV

%matplotlib inline
np.random.seed(2)
plt.style.use('ggplot') 
matplotlib.rcParams['figure.figsize'] = (10,10)
plt.rcParams['font.size'] = 20 #font size
plt.rcParams['axes.linewidth'] = 1.5 #axis setting

In [None]:
# Read the training and test tables from the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ("./train_radius.csv", "./test_radius.csv"))

# Report (rows, columns) of each table, train first
for loaded_frame in (train, test):
    print(loaded_frame.shape)

In [None]:
# Columns removed from both tables; the training table additionally loses 'Id'.
# Note: drop() raises on missing labels, so this cell cannot be re-run as-is.
shared_drop_cols = ["SQUARE", "X", "Y"]
train = train.drop(columns=['Id'] + shared_drop_cols)
test = test.drop(columns=shared_drop_cols)

# Replace GBA and LIVING_GBA by log(1 + x) in both tables
for log_col in ("GBA", "LIVING_GBA"):
    train[log_col] = np.log1p(train[log_col])
    test[log_col] = np.log1p(test[log_col])

train.columns

In [None]:
# Display the dtype pandas assigned to each remaining training column
train.dtypes

In [None]:
# Text columns whose missing values are replaced by the literal string 'None'
cols_with_none_as_nan = [
    "HEAT",
    "AC",
    "STYLE",
    "STRUCT",
    "GRADE",
    "CNDTN",
    "EXTWALL",
    "ROOF",
    "INTWALL",
    "FULLADDRESS",
    "CITY",
    "STATE",
    "NATIONALGRID",
    "ASSESSMENT_SUBNBHD",
    "CENSUS_BLOCK",
    "QUADRANT"
    ]


# fill missing text fields with a default string.
# Explicit copies: these frames are modified below and in later cells, and
# must not be treated as slices of `train` / `test` (SettingWithCopyWarning).
object_columns = train.select_dtypes(include=[object]).copy()
test_object_columns = test.select_dtypes(include=[object]).copy()

# for these columns the string 'None' will be inserted in place of nan
for col in cols_with_none_as_nan:
    object_columns[col] = object_columns[col].fillna('None')
    test_object_columns[col] = test_object_columns[col].fillna('None')

# Report any text columns that still contain missing values
remaining_fix = object_columns.isnull().sum()
print('Fixes remaining on train set\n', remaining_fix[remaining_fix>0])

remaining_fix = test_object_columns.isnull().sum()
print('Fixes remaining on test set\n',remaining_fix[remaining_fix>0])

In [None]:
# Numeric (int/float) columns of each table. Explicit copies are taken because
# later cells assign into these frames; without the copy pandas may treat them
# as slices of `train` / `test` and emit SettingWithCopyWarning.
numeric_columns = train.select_dtypes(include=[int, float]).copy()

remaining_fix = numeric_columns.isnull().sum()
print('Fixes remaining on train set\n',remaining_fix[remaining_fix>0])

test_numeric_columns = test.select_dtypes(include=[int, float]).copy()

remaining_fix = test_numeric_columns.isnull().sum()
print('Fixes remaining on test set\n',remaining_fix[remaining_fix>0])

In [None]:
# Numeric columns whose missing values are replaced by 0
cols_with_zero_as_nan = ['CMPLX_NUM',
                         'YR_RMDL',
                         'KITCHENS',
                         'NUM_UNITS',
                         'STORIES',
                         "AYB",
                         "GBA",
                         "LIVING_GBA",
                         "GBA_ROOMS",
                         "LIVING_GBA_ROOMS",
                         "LANDAREA_ROOMS"
                        ]

# Numeric columns whose missing values are replaced by the training mean
cols_with_mean_as_nan = [
                        ]


# for these columns a zero will be inserted in place of nan
for col in cols_with_zero_as_nan:
    numeric_columns[col] = numeric_columns[col].fillna(0)
    test_numeric_columns[col] = test_numeric_columns[col].fillna(0)

# for these columns the mean will be inserted in place of nan.
# The training mean is used for BOTH tables so the test set is imputed with
# the same statistic the model was trained on.
for col in cols_with_mean_as_nan:
    train_mean = numeric_columns[col].mean()
    numeric_columns[col] = numeric_columns[col].fillna(train_mean)
    test_numeric_columns[col] = test_numeric_columns[col].fillna(train_mean)


remaining_fix = numeric_columns.isnull().sum()
print('Fixes remaining on train set\n',remaining_fix[remaining_fix>0])

remaining_fix = test_numeric_columns.isnull().sum()
print('Fixes remaining on test set\n',remaining_fix[remaining_fix>0])

# One-hot encode the text columns of train and test TOGETHER. Encoding them
# separately yields different dummy columns per table (and, with
# drop_first=True, possibly a different dropped baseline category), so the
# train and test feature matrices would not line up.
# NOTE(review): every text column is encoded, including ones such as
# FULLADDRESS that look high-cardinality — confirm this is intended.
combined_object_columns = pd.concat([object_columns, test_object_columns], keys=['train', 'test'])
combined_object_dm = pd.get_dummies(combined_object_columns, drop_first=True, dummy_na=True)
object_dm = combined_object_dm.loc['train']
test_object_dm = combined_object_dm.loc['test']

In [None]:
def fix_skewness(dataframe, features=None):
    """Apply a Box-Cox(1 + x) transform (lambda = 0.15) to skewed columns, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric frame; the selected columns are overwritten.
    features : iterable of str, optional
        Columns to transform. When omitted, every column whose absolute
        skewness (NaNs ignored) exceeds 0.75 is selected and a report is
        printed. Pass the list returned for the training frame when
        transforming the test frame, so both tables get the same transform.

    Returns
    -------
    list of str
        The columns that were transformed ('PRICE' is never transformed).
    """
    if features is None:
        skewed_feats = dataframe.apply(lambda x: abs(skew(x.dropna()))).sort_values(ascending=False)
        skewness = pd.DataFrame({'Skew': skewed_feats})
        # Keep only columns above the threshold; NaN skews are dropped too
        skewness = skewness[skewness > 0.75].dropna()
        print("There are {} skewed numerical features to transform".format(skewness.shape[0]))
        print("\nSkew > .75 in numerical features: \n")
        print(skewness)
        features = list(skewness.index)

    # The target is handled separately; columns absent from this frame are skipped
    features = [feat for feat in features if feat != 'PRICE' and feat in dataframe.columns]

    # Fix skewness
    lam = 0.15
    for feat in features:
        dataframe[feat] = boxcox1p(dataframe[feat], lam)
    print('Fixed skewness')
    return features

# Select the skewed columns on the training data only, then apply exactly the
# same transform to the test data (selecting independently per table could
# transform a feature in one table but not in the other).
skewed_features = fix_skewness(numeric_columns)
fix_skewness(test_numeric_columns, features=skewed_features);

In [None]:
# Gather every label that occurs in any text column of either table.
# Values are cast to str so remaining NaNs become the label 'nan' and the
# encoder only ever sees one type.
label_chunks = [
    frame[c].astype(str).unique()
    for frame in (object_columns, test_object_columns)
    for c in frame.columns
]
strings = np.unique(np.concatenate(label_chunks))

print(len(strings), 'distinct labels generated')

# A single encoder shared by all columns and both tables
labeler = LabelEncoder()
labeler.fit(strings)

# Cast to str before transforming, matching how the encoder was fitted;
# columns that still hold float NaN would otherwise not match any fitted label.
for c in object_columns.columns:
    object_columns[c] = labeler.transform(object_columns[c].astype(str))
    test_object_columns[c] = labeler.transform(test_object_columns[c].astype(str))

object_columns[0:5]

In [None]:
# Training table: encoded text columns + numeric columns, with PRICE replaced
# by log(1 + PRICE)
train_ds = object_columns.join(numeric_columns)
train_ds = train_ds.assign(PRICE=np.log1p(train_ds["PRICE"]))

# Test table built the same way (no target to transform)
test_ds = test_object_columns.join(test_numeric_columns)

# Correlation of every column with the (log) target
corrmat = train_ds.corr()['PRICE']
print(corrmat)

# Keep columns with a non-zero correlation; columns whose correlation is NaN
# fail the comparison and are dropped as well. PRICE itself is retained in train.
best_columns = corrmat[corrmat.abs() > 0.0].index

# Restrict to the selected columns and append the one-hot dummy columns
train_ds = pd.concat([train_ds[best_columns], object_dm], axis=1)
test_ds = pd.concat([test_ds[best_columns.drop('PRICE')], test_object_dm], axis=1)

In [None]:
# Split features and target BY NAME. The dummy columns were concatenated after
# PRICE, so the last column of train_ds is a dummy indicator, not the target;
# slicing by position would train on the wrong target and leave PRICE among
# the features.
X_train = train_ds.drop(columns=['PRICE']).values
y_train = train_ds[['PRICE']].values  # column vector, shape (n_samples, 1)
X_test = test_ds.values

print('Training set features shape', X_train.shape)
print('Training set labels shape', y_train.shape)
print('Test set shape', test_ds.shape)

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training features only, then apply the same scaling
# to both feature matrices (results are wrapped in DataFrames).
transformer = RobustScaler()
X_train = pd.DataFrame(transformer.fit_transform(X_train))
X_test = pd.DataFrame(transformer.transform(X_test))

In [None]:
def train_model(depth, learning_rate, n_estimators, model_type="xgb"):
    """Build a gradient-boosting regressor and score it with `rmsle_cv`.

    Parameters
    ----------
    depth : int
        Maximum tree depth (`max_depth`).
    learning_rate : float
        Boosting learning rate.
    n_estimators : int
        Number of boosting rounds.
    model_type : {'xgb', 'lgb'}
        'xgb' builds an `xgb.XGBRegressor`, 'lgb' an `lgb.LGBMRegressor`.

    Returns
    -------
    The per-fold scores returned by `rmsle_cv`; their mean and std are printed.

    Raises
    ------
    ValueError
        If `model_type` is neither 'xgb' nor 'lgb'.
    """
    # Validate first: previously an unknown type left `model` unbound and
    # failed later with an unrelated UnboundLocalError.
    if model_type not in ('xgb', 'lgb'):
        raise ValueError("unknown model_type {!r}; expected 'xgb' or 'lgb'".format(model_type))

    # NOTE(review): `silent` may be deprecated/ignored by recent xgboost and
    # lightgbm releases — confirm against the installed versions.
    if model_type == 'xgb':
        model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                                 learning_rate=learning_rate, max_depth=depth,
                                 min_child_weight=1.7817, n_estimators=n_estimators,
                                 reg_alpha=0.4640, reg_lambda=0.8571,
                                 subsample=0.5213, silent=1,
                                 random_state =7, nthread = -1)
    else:
        model = lgb.LGBMRegressor(colsample_bytree=0.4603, min_gain_to_split=0.0468,
                                 learning_rate=learning_rate, max_depth=depth,
                                 min_child_weight=1.7817, n_estimators=n_estimators,
                                 reg_alpha=0.4640, reg_lambda=0.8571,
                                 subsample=0.5213, silent=1,
                                 random_state =7, nthread = -1)

    score = rmsle_cv(model)
    print(model_type, " score: depth={:d} lr={:.2f} est={:d} -> mean:{:.5f} std:{:.4f}".format(depth, learning_rate, n_estimators, score.mean(), score.std()))
    return score

# Validation function
n_folds = 5

def rmsle_cv(model):
    """Cross-validated RMSE of `model` on the global X_train / y_train.

    Uses an `n_folds`-fold KFold with shuffling and a fixed random_state.
    Because the target is log-transformed, the RMSE is an RMSLE on the
    original price scale.

    Returns an array with one RMSE per fold.
    """
    # Pass the KFold splitter itself. Calling .get_n_splits() on it returns the
    # plain integer n_folds, which makes cross_val_score fall back to an
    # unshuffled split and silently drops shuffle=True / random_state=42.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, X_train, y_train.flatten(),
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)


def rmsle(y, y_pred):
    """Return the root of the mean squared error between `y` and `y_pred`."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

# Grid search over depth, learning rate and number of estimators for the
# 'lgb' model; every combination is scored through train_model.
# NOTE(review): range(100, 500, 100) / 100 yields learning rates 1.0, 2.0,
# 3.0 and 4.0 — confirm this is the intended range.
grid_rows = []
for depth in range(2, 6):
    for lr_percent in range(100, 500, 100):
        learning_rate = lr_percent / 100
        for n_estimators in range(4000, 6000, 500):
            score = train_model(depth, learning_rate, n_estimators, model_type='lgb')
            grid_rows.append([depth, learning_rate, n_estimators, score.mean(), score.std()])

# check results
result = pd.DataFrame(
    grid_rows,
    columns=['depth', 'learning_rate', 'n_estimators', 'score_mean', 'score_std'],
)

# Row position of the lowest mean score
best = np.argmin(result['score_mean'].values)
print('Best params = \n', result.iloc[best])

In [1]:
# Train LightGBM with fixed hyper-parameters. The grid-search winner could be
# used instead via result.iloc[best]['depth' / 'learning_rate' / 'n_estimators'].
depth, learning_rate, n_estimators = 5, 0.5, 1000

lgb_params = dict(
    num_leaves=8,
    colsample_bytree=0.4603,
    learning_rate=learning_rate,
    max_depth=depth,
    min_child_weight=1.7817,
    n_estimators=n_estimators,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    silent=1,
    random_state=6,
    nthread=-1,
)
model_lgb = lgb.LGBMRegressor(**lgb_params)

# Cross-validated score of the untrained configuration
score = rmsle_cv(model_lgb)
print("lgb score: depth={:d} lr={:.2f} est={:d} -> mean:{:.5f} std:{:.4f}".format(depth, learning_rate, n_estimators, score.mean(), score.std()))

# Fit on the full training set and measure the in-sample error
model_lgb.fit(X_train, y_train.flatten())
y_pred = model_lgb.predict(X_train)

print('RMSLE LGB = ', rmsle(y_train, y_pred))

NameError: name 'lgb' is not defined

In [None]:
print(model_lgb)

# Predicted vs. actual on the training set; both axes are mapped back from
# the log scale with expm1
plt.scatter(np.expm1(y_train), np.expm1(y_pred))
plt.xlabel('actual')
plt.ylabel('predicted')
plt.grid()
plt.show()

# Feature importances of the LightGBM model, largest first
predictors = [x for x in train_ds.columns if x not in ["PRICE", "Id"]]
feat_imp = pd.Series(model_lgb.feature_importances_, predictors).sort_values(ascending=False)
top_features = feat_imp.head(20)
sns.set_palette("husl")
# x / y are passed by keyword: current seaborn no longer accepts them positionally
sns.barplot(x=top_features.values, y=top_features.index)
# Title matches the number of bars actually drawn (20)
plt.title('Top 20 Feature Importances')
plt.xlabel('Feature Importance Score')
plt.show()

# Grid search over depth, learning rate and number of estimators for the
# 'lgb' model; every combination is scored through train_model.
# NOTE(review): this repeats the 'lgb' grid search run earlier in the
# notebook with identical ranges — confirm it is needed (or was meant for 'xgb').
# NOTE(review): the learning rates tried are 1.0, 2.0, 3.0 and 4.0.
search_rows = []
for depth in range(2, 6):
    for lr_percent in range(100, 500, 100):
        learning_rate = lr_percent / 100
        for n_estimators in range(4000, 6000, 500):
            score = train_model(depth, learning_rate, n_estimators, model_type='lgb')
            search_rows.append([depth, learning_rate, n_estimators, score.mean(), score.std()])

# check results
result = pd.DataFrame(
    search_rows,
    columns=['depth', 'learning_rate', 'n_estimators', 'score_mean', 'score_std'],
)

# Row position of the lowest mean score
best = np.argmin(result['score_mean'].values)
print('Best params = \n', result.iloc[best])

In [None]:
# Train XGBoost with fixed hyper-parameters. The grid-search winner could be
# used instead via result.iloc[best]['depth' / 'learning_rate' / 'n_estimators'].
depth, learning_rate, n_estimators = 5, 0.5, 1000

xgb_params = dict(
    learning_rate=learning_rate,
    max_depth=depth,
    min_child_weight=1.7817,
    n_estimators=n_estimators,
    silent=1,
    random_state=7,
    nthread=-1,
)
model_xgb = xgb.XGBRegressor(**xgb_params)

# Cross-validated score of the untrained configuration
score = rmsle_cv(model_xgb)
print("XGB score: depth={:d} lr={:.2f} est={:d} -> mean:{:.5f} std:{:.4f}".format(depth, learning_rate, n_estimators, score.mean(), score.std()))

# Fit on the full training set and measure the in-sample error
model_xgb.fit(X_train, y_train.flatten())
y_pred = model_xgb.predict(X_train)

print('RMSLE XGB = ', rmsle(y_train, y_pred))

In [None]:
# Predicted vs. actual on the training set; both axes are mapped back from
# the log scale with expm1
plt.scatter(np.expm1(y_train), np.expm1(y_pred))
plt.xlabel('actual')
plt.ylabel('predicted')
plt.grid()
# Reference diagonal (perfect prediction) from 0 to 10,000,000
plt.plot([0, 10_000_000], [0, 10_000_000])
plt.show()

# Feature importances of the XGBoost model, largest first. This cell follows
# the XGBoost fit, so the importances come from model_xgb (it previously
# re-plotted model_lgb's importances).
predictors = [x for x in train_ds.columns if x not in ["PRICE", "Id"]]
feat_imp = pd.Series(model_xgb.feature_importances_, predictors).sort_values(ascending=False)
top_features = feat_imp.head(20)
sns.set_palette("husl")
# x / y are passed by keyword: current seaborn no longer accepts them positionally
sns.barplot(x=top_features.values, y=top_features.index)
# Title matches the number of bars actually drawn (20)
plt.title('Top 20 Feature Importances')
plt.xlabel('Feature Importance Score')
plt.show()

In [None]:
# Simple ensemble: mean of the LightGBM and XGBoost training-set predictions
lgb_train_pred = model_lgb.predict(X_train)
xgb_train_pred = model_xgb.predict(X_train)
avg_predict = (lgb_train_pred + xgb_train_pred) / 2
y_pred = avg_predict

# Predicted vs. actual, mapped back from the log scale with expm1
plt.figure(figsize=(8, 8))
plt.scatter(np.expm1(y_train), np.expm1(y_pred))
plt.xlim(0, 200_000_000)
plt.ylim(0, 200_000_000)
plt.xlabel('actual', fontsize=26)
plt.ylabel('predicted', fontsize=26)
# Reference diagonal from 0 to 10,000,000
diagonal = [(0, 0), (10_000_000, 10_000_000)]
plt.plot(diagonal, diagonal)
plt.show()

print('RMSLE averaged = ', rmsle(y_train, y_pred))

In [None]:
# Submission: average of the XGBoost and LightGBM test predictions.
# Each model predicts once; the results are reused for the preview and the average.
xgb_test_pred = model_xgb.predict(X_test)
lgb_test_pred = model_lgb.predict(X_test)
print(np.expm1(xgb_test_pred[0:5]))
print(np.expm1(lgb_test_pred[0:5]))
avg_predict = (xgb_test_pred + lgb_test_pred) / 2
# Undo the log1p applied to the target
subm_predict = np.expm1(avg_predict)

# The first column of `test` is used as the identifier (presumably 'Id' —
# verify); iloc keeps that column's own dtype instead of the object dtype
# produced by test.values.
dsubm_predict = pd.DataFrame({
    'Id': test.iloc[:, 0].values,
    'PRICE': subm_predict,
})

dsubm_predict.to_csv('submission.csv', index=False)
dsubm_predict[0:10]

In [None]:
# Submission: LightGBM predictions only.
lgb_test_pred = model_lgb.predict(X_test)
print(np.expm1(lgb_test_pred[0:5]))
avg_predict = lgb_test_pred
# Undo the log1p applied to the target. The result must be stored in
# subm_predict: it was previously assigned to dsubm_predict and immediately
# overwritten, so a stale subm_predict from another cell was written instead.
subm_predict = np.expm1(avg_predict)

# The first column of `test` is used as the identifier (presumably 'Id' — verify)
dsubm_predict = pd.DataFrame({
    'Id': test.iloc[:, 0].values,
    'PRICE': subm_predict,
})

# NOTE(review): this overwrites any 'submission.csv' written by an earlier cell
dsubm_predict.to_csv('submission.csv', index=False)
dsubm_predict[0:10]

In [None]:
# Display the most recently built submission table
dsubm_predict