# Load train and test datasets already modified after R analysis

After R correlation analysis with multiple regression between all features and Sale Price, we have found that the best model is the one that includes the following features:
MSZoning + LotArea + Street + LandContour + LotConfig + 
                   LandSlope + Neighborhood + Condition1 + BldgType + 
                   HouseStyle + OverallQual + OverallCond + YearBuilt + YearRemodAdd + 
                   RoofStyle + Foundation + BsmtQual + BsmtCond + 
                   BsmtExposure + BsmtUnfSF + TotalBsmtSF + Heating + HeatingQC + 
                   CentralAir + FullBath + BedroomAbvGr + KitchenQual + Functional + 
                   GarageType + GarageArea + SaleType + SaleCondition + FireplaceQu + PoolQC

In [None]:
# Import libraries

# Pandas 
import pandas as pd
from pandas import Series,DataFrame 

# Numpy and Matplotlib
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
#sns.set_style('whitegrid')
%matplotlib inline

# Machine Learning 
from sklearn import preprocessing

# Read the R-modified training set and the test set; the first CSV column is used as the index.
df_trainfull, df_test = (
    pd.read_csv(csv_name, index_col=0) for csv_name in ('train_v1.csv', 'test.csv')
)
df_trainfull.head()

In [None]:
# Preview the first rows of the test set
df_test.head()

# Convert categorical to numerical features
From this dataset, convert the categorical to numerical features

In [None]:
# Isolate the object-dtype columns to see which features are categorical
cat_df_trainfull = df_trainfull.select_dtypes(include='object')
cat_df_trainfull

In [None]:
# Convert the categorical columns of the training set into indicator (dummy) columns.
# NOTE(review): the test set is encoded by a separate get_dummies call, so the two frames
# can end up with different dummy columns — confirm every column used later exists in both.
df_trainfull_cat_to_num = pd.get_dummies(df_trainfull)
df_trainfull_cat_to_num

In [None]:
# Write the encoded training set to disk so it can be reloaded in the analysis section.
# NOTE(review): export_csv only holds to_csv's return value and is not used in the visible cells.
export_csv = df_trainfull_cat_to_num.to_csv ('trainfull_all_num.csv', header=True)

In [None]:
# Convert the categorical columns of the test set into indicator (dummy) columns.
# NOTE(review): encoded independently of the training set, so a category missing from
# test.csv produces no column here — confirm the feature columns used later are present.
df_test_cat_to_num = pd.get_dummies(df_test)
df_test_cat_to_num

In [None]:
# Write the encoded test set to disk so it can be reloaded in the analysis section.
# NOTE(review): export_csv only holds to_csv's return value and is not used in the visible cells.
export_csv = df_test_cat_to_num.to_csv ('test_all_num.csv', header=True)

# Analysis of the new dataset

In [None]:
# Reload the all-numeric datasets from disk; missing values in the test set are replaced by 0.
train = pd.read_csv('trainfull_all_num.csv', header=0)
test = pd.read_csv('test_all_num.csv', header=0).fillna(0)
train.head()

In [None]:
# Preview the first rows of the reloaded, NaN-filled test set
test.head()

In [None]:
# Keep the test-set Ids as a plain Python list; they become the 'Id' column of the submission
id_col = test['Id'].values.tolist()

In [None]:
# List the dtype of every column in the training set
types = train.dtypes
print(types)

In [None]:
# List the dtype of every column in the test set (rebinds `types` from the previous cell)
types = test.dtypes
print(types)

In [None]:
# Rank every column by its correlation with the target, strongest positive first
pd.set_option('display.max_rows', train.shape[0] + 1)
corr = train.corr()
saleprice_corr = corr['SalePrice'].sort_values(ascending=False)
print(saleprice_corr, '\n')


From this correlation we can try two models with different feature sets:
* 1- corr > 0.5 :OverallQual, GarageArea, TotalBsmtSF, FullBath, BsmtQual_Ex, YearBuilt, YearRemodAdd, KitchenQual_Ex        
* 2- corr > 0.6 :OverallQual, GarageArea, TotalBsmtSF

### Analysis of correlated features

In [None]:
# OverallQual: median sale price for each overall-quality rating.
# The aggregation is named by string because recent pandas warns when np.median is passed
# as a callable. The former `train.OverallQual.unique()` line was dropped: its result was
# neither stored nor displayed.
quality_pivot = train.pivot_table(index='OverallQual', values='SalePrice', aggfunc='median')
quality_pivot

In [None]:
import matplotlib.pyplot as plt
%matplotlib notebook

quality_pivot.plot(kind='bar',color='blue')
plt.xlabel('Overall Quality')
plt.ylabel('Median')
plt.xticks(rotation=0)
plt.show()

In [None]:
# GarageArea vs. SalePrice: scatter plot with a fitted regression line
%matplotlib notebook
sns.regplot(x='GarageArea',y='SalePrice',data=train)

In [None]:
# TotalBsmtSF vs. SalePrice: scatter plot with a fitted regression line
%matplotlib notebook
sns.regplot(x='TotalBsmtSF',y='SalePrice',data=train)

In [None]:
# Count missing values per column and show the 25 columns with the most (nothing is removed here)
null_counts = train.isnull().sum().sort_values(ascending=False)
nulls = null_counts.head(25).to_frame(name='Null Count')
nulls.index.name = 'Feature'
nulls

# 1) Prediction with Keras for features with correlation > 0.5

In [None]:
 pip install keras

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler # Used for scaling of data
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from keras import backend as K
from keras.wrappers.scikit_learn import KerasRegressor

# Target plus the features selected for model 1 from the correlation ranking
cols = ['SalePrice','OverallQual','GarageArea','TotalBsmtSF','FullBath','BsmtQual_Ex','YearBuilt','YearRemodAdd','KitchenQual_Ex']
df_train = train[cols]

# Every selected column except the target is a model input
feature_cols = cols[1:]
X = df_train[feature_cols]
# Y is just the 'SalePrice' column
y = df_train['SalePrice'].values

seed = 7
np.random.seed(seed)
# split into 67% for train and 33% for test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

# Always standard scale the data before using NN.
# The scaler is fitted on the training split only, so the held-out rows do not leak into
# the mean/std; the held-out split is then transformed with those same statistics.
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

In [None]:
def create_model():
    """Build and compile the fully connected regression network.

    The input width is read from the notebook-level ``X_train``. Three ReLU
    hidden layers (10, 30 and 40 units) feed a single-unit output layer with
    no activation. The model is compiled with the Adam optimizer, mean squared
    error as the loss, and mean absolute error as the tracked metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential([
        Dense(10, input_dim=X_train.shape[1], activation='relu'),
        Dense(30, activation='relu'),
        Dense(40, activation='relu'),
        Dense(1),
    ])
    network.compile(
        optimizer='adam',
        loss='mean_squared_error',
        metrics=[metrics.mae],
    )
    return network

In [None]:
# Instantiate the network and print its layer-by-layer summary
model = create_model()
model.summary()

In [None]:
# Train for 150 epochs in batches of 32, evaluating on the held-out split after each epoch;
# `history` keeps the per-epoch loss/metric values plotted in the next cells.
history = model.fit(X_train, y_train, validation_data=(X_test,y_test), epochs=150, batch_size=32)

In [None]:
%matplotlib notebook
# summarize history for accuracy
plt.plot(history.history['mean_absolute_error'])
plt.plot(history.history['val_mean_absolute_error'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()


In [None]:
# summarize history for loss
%matplotlib notebook
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Select, from the Kaggle test set, the same feature columns the model was trained on
cols = ['OverallQual','GarageArea','TotalBsmtSF','FullBath','BsmtQual_Ex','YearBuilt','YearRemodAdd','KitchenQual_Ex']
df_test = test[cols]

# Scale with the scaler that was already fitted on the training features. Fitting a fresh
# StandardScaler on the test set would centre/scale it with different statistics than the
# ones the network saw during training.
# NOTE: X_test is rebound here from the validation split to the Kaggle test features.
X_test = scale.transform(df_test)

In [None]:
# Run the trained model on the scaled test-set features
prediction = model.predict(X_test)

In [None]:
# Display the raw model output
prediction

In [None]:
# Assemble the submission: one row per test Id with its predicted sale price.
# The model has a single output unit, so its prediction is flattened to 1-D before
# being used as a column instead of relying on pandas to accept a 2-D array.
submission = pd.DataFrame({
    'Id': id_col,
    'SalePrice': np.ravel(prediction),
})

In [None]:
# Save the submission as CSV without the DataFrame index
submission.to_csv('submission_2.csv', index=False)

In [None]:
import kaggle
# Send submission_2.csv to the house-prices competition through the Kaggle API.
# NOTE(review): presumably this creates a new submission on every run and needs Kaggle API
# credentials to be configured — confirm before re-running the notebook end to end.
kaggle.api.competition_submit("submission_2.csv", "test_2", "house-prices-advanced-regression-techniques")

In [None]:
# Score entered by hand (presumably copied from the Kaggle leaderboard) — it is not computed by this notebook
score= 0.19288
print("Kaggle score:", score)