### Importing Libraries

In [99]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

%matplotlib inline

### Reading Dataset

In [100]:
train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

### Looking at the high missing value features

In [101]:
## Function to find those columns with more than 70% missing values
def missing_ratio(df):
    col_list = [] # col with missing value more than 70%
    for c in df.columns:
        if(df[c].isnull().sum() >= 0.7 * df.shape[0]):
            col_list.append(c)
            
    return col_list
    

In [102]:
missing_val = missing_ratio(train)
missing_val

In [103]:
## dropping the high missing value columns from dataframe
train.drop(missing_val, axis = 1, inplace = True)
test.drop(missing_val, axis  =1 , inplace = True)

### Handling rest of the Missing Values

In [104]:
## function to find num col and cat col
def num_col(df):
    return [c for c in train.columns if train[c].dtype != object]
def cat_col(df):
    return [c for c in train.columns if train[c].dtype == object]

In [105]:
## adding the saleprice column to maintain unity
test['SalePrice'] = -1

In [106]:
Unnecessary_col = ["Id" ]
# These are numerical data but LabelEncoded
Label_Encoded = ['MSSubClass','OverallQual','OverallCond'] ## Found form Description.txt

In [107]:
## These are the date time columns
Year_col = []
for c in train.columns:
    if "Year" in c or 'Yr' in c :
        Year_col.append(c)

In [108]:
Year_col

In [109]:
## converting the yearcols 
for c in Year_col:
    train[c] = train['YrSold'] - train[c] 
    test[c] = test['YrSold'] - test[c]
train.drop(['YrSold'], axis =1 , inplace = True)
test.drop(['YrSold'], axis =1 , inplace = True)

In [110]:
train.drop(Unnecessary_col, axis = 1, inplace = True)
test.drop(Unnecessary_col, axis = 1, inplace = True)

In [111]:
## Filling all the num missing values with mean of that column
for c in num_col(train): 
    if c not in Label_Encoded:
        val = train[c].mean()
        train[c] = train[c].fillna(val)
        test[c] = test[c].fillna(val)

In [112]:
## Converting all the Label_Encoded col into String
for c in Label_Encoded:
    train[c] = train[c].astype(str)

In [113]:
### Fill all the missing categorical value with the mode of that column
for c in cat_col(train):
    val = train[c].mode()[0]
    train[c] = train[c].fillna(val)
    test[c] = test[c].fillna(val)

In [114]:
sum(train.isnull().sum() != 0)

In [115]:
sum(test.isnull().sum() != 0)

### Feature Engineering 

In [116]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [117]:
num_feats = num_col(train)
x_train = train[num_feats]
x_train = x_train.drop(['SalePrice'], axis = 1)
y_train = train[['SalePrice']]


In [118]:
x_train.shape

In [119]:
features_model = SelectFromModel(Lasso(alpha = 0.005, random_state = 42))
features_model.fit(x_train, y_train)

In [120]:
arr = features_model.get_support()

In [121]:
selected_features = x_train.columns[arr]
print(x_train.shape[1])
print(len(selected_features))
print(np.sum(features_model.estimator_.coef_ == 0))

##### All the numerical col contributes

### Normalizatiion


In [122]:
num_feats = num_col(train)
sale_mean = train['SalePrice'].mean() 
sale_std = train['SalePrice'].std()
for c in num_feats:
    mean_val = train[c].mean() 
    std_val = train[c].std()
    train[c] = train[c].map(lambda x: (x-mean_val)/std_val)
    test[c] = test[c].map(lambda x: (x - mean_val)/std_val)
    


### Transform the cat feats into num feats

In [123]:
print(train.shape, test.shape)

#### Concating the test data with train data as test data has more categorical type than train data

In [124]:
total_data = pd.concat([train, test]).reset_index(drop = True)

In [125]:
total_data.shape

In [126]:
## Converting the categorical variables into numerical with One Hot Encoding
def change(df,cat_feats):
    output_df = df.copy()
    for c in cat_feats:
        dummies = pd.get_dummies(output_df[c], drop_first = True)
        output_df[dummies.columns] = dummies
        output_df.drop([c], axis = 1, inplace = True)  
    return output_df

In [127]:
cat_feats = cat_col(total_data)


In [128]:
Final_data = change(total_data, cat_feats)

In [129]:
total_data.shape

In [130]:
Final_data.shape

#### So, Now we have 222 columns and all of them are numerical

### Removing Duplicate Columns

In [131]:
Final_data = Final_data.loc[:, ~Final_data.columns.duplicated()]

In [132]:
Final_data.shape

#### No Duplicate cols found

### Train and test

In [133]:
## Spliting the data again
train = Final_data.iloc[:1460, :].reset_index(drop = True)
test = Final_data.iloc[1460:, :].reset_index(drop = True)
print(train.shape, test.shape)

In [134]:
sum(train.isnull().sum() != 0)

In [135]:
cat_col(train)

### create cross validation

In [136]:
from sklearn.model_selection import KFold

In [137]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
train['kfold'] = -1
for fold, (train_idx, val_idx) in enumerate(kf.split(train)):
    train.loc[val_idx, "kfold"] = fold


### Create Model

In [138]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import ExtraTreeRegressor
import xgboost as xgb
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

In [139]:
from sklearn.model_selection import RandomizedSearchCV

In [140]:
models = {
    "rf": RandomForestRegressor(n_estimators= 600, min_samples_split= 3, min_samples_leaf= 1),
    "ExtraTrees": ExtraTreeRegressor(),
    "xgb":  xgb.XGBRegressor(n_estimators= 1000, max_depth= 3, learning_rate= 0.1, colsample_bytree= 0.3, verbosity = 0),
    "gbr": GradientBoostingRegressor()
}

### HyperParameter Optimization

In [141]:
params = {
    "rf":{"n_estimators": np.arange(100, 700, 100),
    "min_samples_split": [2,3,4,5],
    "min_samples_leaf":[1,2,3],
    },
    
    "xgb":{'max_depth': [3,6,10],
               'learning_rate': [0.01, 0.05, 0.1],
               'n_estimators': [100, 500, 1000],
               'colsample_bytree': [0.3, 0.7]  
    },
}

In [142]:
x_train = train.drop(['SalePrice'], axis =1)
y_train = train['SalePrice']

In [143]:

clf = RandomizedSearchCV(models["rf"], params["rf"], scoring = "r2", cv = 5, n_jobs = -1, verbose = 0)
clf.fit(x_train, y_train)
print(model, "--", clf.best_score_, "---", clf.best_params_)
print("-"*15)

In [None]:
clf = RandomizedSearchCV(models["xgb"], params["xgb"], scoring = "r2", cv = 5, n_jobs = -1, verbose = 1)
clf.fit(x_train.values, y_train.values)
print(model, "--", clf.best_score_, "---", clf.best_params_)
print("-"*15)

### Training

In [None]:
def run(fold, model_name):
    
    train_data = train[train['kfold'] != fold]
    val_data = train[train['kfold'] == fold]
    
    model = models[model_name]
    x_train = train_data.drop(['kfold','SalePrice'], axis =1 )
    y_train = train_data['SalePrice']
    

    x_val = val_data.drop(['kfold', 'SalePrice'], axis =1 )
    y_val =  val_data['SalePrice']
    
    model.fit(x_train.values, y_train.values)
    
    pred = model.predict(x_val)
    mse = metrics.r2_score(y_val, pred)
    
    joblib.dump(model, '/kaggle/working/' + f"{model_name}_{fold}.pkl")
    
    print(f"fold:{fold}  model:{model_name} score:{mse}")

In [None]:
for fold in range(5):
    run(fold, "rf")

In [None]:
for fold in range(5):
    run(fold, "xgb")

In [None]:
test.drop(['SalePrice'], axis = 1, inplace = True)

In [None]:
test.shape

In [None]:

model1 = joblib.load("/kaggle/working/xgb_2.pkl")
model2 = joblib.load("/kaggle/working/RandomForestRegressor(min_samples_split=3, n_estimators=600)_2.pkl")
pred1 = model1.predict(test)
pred2 = model2.predict(test)
prediction = (pred1 + pred2)/2

In [None]:
prediction.shape

In [None]:
sub = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
sub['SalePrice'] =  ((prediction * sale_std) + sale_mean)


sub.head()

In [None]:
sub.to_csv('submission.csv', index = False)