In [None]:
import numpy as np
import pandas as pd

import seaborn as sn
import matplotlib.pyplot as plt
from IPython.display import display

from sklearn import preprocessing
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import svm



In [None]:
# Load the training set; the 'Id' column is dropped right away.
data = pd.read_csv("train.csv").drop('Id', axis = 1)
#display(data.head(n=20))

# Separate the prediction target from the predictor columns.
labels = data["SalePrice"]
raw_features = data.drop("SalePrice", axis = 1)
#display(raw_features.head(n=10))


In [None]:
#raw_features.dtypes

In [None]:
def rmsle(ypred, ytest) :
    """Return the root mean squared logarithmic error between two value sequences.

    Computes sqrt(mean((log1p(ypred) - log1p(ytest)) ** 2)). The metric is
    symmetric in its two arguments, so the order in which predictions and
    ground truth are passed does not change the result.

    Parameters
    ----------
    ypred, ytest : array-like of equal length
        Sequences of numbers (lists, NumPy arrays or pandas Series).

    Returns
    -------
    numpy.float64
        The RMSLE of the two sequences.

    Raises
    ------
    AssertionError
        If the two inputs do not have the same length.
    """
    assert len(ytest) == len(ypred)
    # Compare element by element. Without the conversion, two pandas Series
    # with different indices would be aligned by label during the subtraction
    # and yield NaN instead of an error value.
    predicted = np.asarray(ypred, dtype = float)
    actual = np.asarray(ytest, dtype = float)
    return np.sqrt(np.mean((np.log1p(predicted) - np.log1p(actual))**2))

In [None]:
def solve_nan_values(data):
    """Return a copy of ``data`` with missing values dropped or imputed.

    For every column that contains at least one missing value:

    * if more than ``int(len(data) * 0.75)`` entries are missing, the column
      is removed;
    * otherwise the missing entries are replaced with the column's most
      frequent non-missing value.

    The input frame is never modified; a new DataFrame is always returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with the surviving columns in their original order.
    """
    limit = int(len(data) * 0.75)

    # One vectorised pass gives the number of missing entries per column.
    na_counts = data.isna().sum()

    columns_to_drop = []
    fill_values = {}
    for column, no_of_na in na_counts.items():
        if no_of_na == 0:
            continue
        if no_of_na > limit:
            columns_to_drop.append(column)
        else:
            # value_counts() ignores NaN and sorts by frequency, so the first
            # index entry is the most common observed value.
            fill_values[column] = data[column].value_counts().index[0]

    # drop() returns a new frame, so the caller's DataFrame stays intact.
    cleaned = data.drop(columns = columns_to_drop)
    if fill_values:
        cleaned = cleaned.fillna(fill_values)

    return cleaned
        

In [None]:
# Drop mostly-empty columns and impute the remaining gaps.
raw_features = solve_nan_values(raw_features)
#raw_features.head(n = 10)

In [None]:
# One-hot encode every categorical (object-dtype) feature.
# Encoding the whole frame at once makes get_dummies prefix each indicator
# with its source column (e.g. "ExterQual_TA", "KitchenQual_TA"). Encoding the
# columns one by one without a prefix produced duplicate column names whenever
# two features shared a category label, which breaks per-name column selection
# in the later cells.
categorical_data = pd.get_dummies(raw_features.select_dtypes(include = ['object']))

#categorical_data.head(n = 20)

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
numeric_features = raw_features.select_dtypes(exclude = ['object'])
raw_features = pd.concat([numeric_features, categorical_data], axis = 1)

corr = raw_features.corr()
#sn.heatmap(corr)

# Absolute correlation matrix (reuses `corr` instead of recomputing it)
corr_matrix = corr.abs()

#print(corr_matrix)

# Select upper triangle of correlation matrix (k=1 excludes the diagonal).
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features whose correlation with an earlier feature is greater than 0.90
highly_correlated = (upper > 0.90).any(axis = 0)
to_high = upper.columns[highly_correlated.values].tolist()

# Drop features
raw_features = raw_features.drop(to_high, axis = 1)

#raw_features.head(n = 10)


In [None]:
#features = preprocessing.scale(raw_features)

from sklearn import decomposition

# Standardise every feature column. SalePrice was already split off into
# `labels`, so no trailing column has to be excluded here (slicing with
# [:, :-1] silently threw away the last feature).
x = raw_features.values
standard_scaler = preprocessing.StandardScaler()
features = standard_scaler.fit_transform(x)

# Project onto the first two principal components for visualisation only.
pca = decomposition.PCA(n_components = 2)
pca_data = pca.fit_transform(features)


#print("Shape of PCA = ", pca_data.shape)

# Append the sale price as a third column next to the two components.
pca_data = np.vstack((pca_data.T, labels)).T

#print("Shape of PCA = ", pca_data.shape)

pca_df = pd.DataFrame(data = pca_data, columns = ("First Principal", "Second Principal", "Label"))

# Round each price strictly between 0 and 1,000,000 up to the next multiple of
# 10,000 so the plot has a manageable number of hue groups. Exact multiples and
# values outside that range are left unchanged.
bucket_width = 10000
bucket_ceiling = 100 * bucket_width
prices = pca_df["Label"]
inside_buckets = (prices > 0) & (prices < bucket_ceiling)
pca_df["Label"] = np.where(inside_buckets, np.ceil(prices / bucket_width) * bucket_width, prices)

# `height` is the current name of FacetGrid's former `size` keyword.
sn.FacetGrid(pca_df, hue = "Label", height = 10).map(plt.scatter, "First Principal", "Second Principal").add_legend()
plt.show()

# Hold out 25% of the samples for the final evaluation of each model.
X_train, X_test, y_train, y_test = train_test_split(features, labels, train_size = 0.75, test_size = 0.25, random_state = 42)





In [None]:
# Report how many samples ended up in each split.
n_train_samples = X_train.shape[0]
n_test_samples = X_test.shape[0]
print("Training set has {} samples.".format(n_train_samples))
print("Testing set has {} samples.".format(n_test_samples))


def random_forest_param_selection(X, y):
    """Grid-search RandomForestClassifier hyper-parameters using RMSLE.

    Searches over ``n_estimators`` and ``max_depth`` with 5-fold
    cross-validation. Scores are negated RMSLE values (the scorer is built
    with ``greater_is_better=False``), so values closer to zero are better.

    Before the final fit, the whole grid search is itself cross-validated
    (nested cross-validation) and the mean score is printed as
    "Training Score".

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Training targets.

    Returns
    -------
    dict
        The best parameter combination found (``clf.best_params_``).
    """
    rfc = RandomForestClassifier(random_state=42)

    # The dict literal must open on the assignment line; a bare
    # `param_grid =` followed by a newline is a syntax error.
    param_grid = {
        'n_estimators': [750, 1000],
        'max_depth' :  [50, 100],
    }

    # The folds are not shuffled, so no random_state is passed: current
    # scikit-learn rejects random_state when shuffle is False, and older
    # releases ignored it, so the fold assignment is unchanged.
    folds = KFold(n_splits = 5)
    scorer = make_scorer(rmsle, greater_is_better=False)
    clf = GridSearchCV(rfc, param_grid, cv = folds, scoring = scorer)
    cross_score = cross_val_score(estimator = clf, X = X, y = y, cv = folds)
    meanScore = np.average(cross_score)
    print("Training Score: ", meanScore)

    clf.fit(X, y)
    print('Best score: {}'.format(clf.best_score_))
    print('Best parameters: {}'.format(clf.best_params_))

    return clf.best_params_


#dictionary = random_forest_param_selection(X_train, y_train)
#n_estimators = dictionary['n_estimators']
#max_depth = dictionary['max_depth']
#n_estimators = n_estimators, max_depth = max_depth

# Boost a random forest with hard-coded hyper-parameters. Both estimators are
# seeded so repeated runs give the same error.
clf1 = RandomForestClassifier(n_estimators = 1000, max_depth = 50, random_state = 42)
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` across scikit-learn releases.
ada = AdaBoostClassifier(clf1, n_estimators = 20, learning_rate = 0.5, random_state = 42)
ada.fit(X_train, y_train)
y_predict = ada.predict(X_test)
error = rmsle(y_predict, y_test)
print ("RandomForestClassifier Error:", error)
#RandomForestClassifier Error: 0.3507037397855725

# Presumably output pasted from an earlier run with the grid search enabled:
#Training set has 1095 samples.
#Testing set has 365 samples.
#Training Score:  -0.19904423237553598
#Best score: -0.1970983381825009
#Best parameters: {'max_depth': 50, 'n_estimators': 1000}
#RandomForestClassifier Error: 0.1881650096768023

#RandomForestClassifier Error: 0.1733914972370594

#0.1858308350826308
#0.1830331837122414
#0.23818856495254281

In [None]:
def knn_param_selection(X, y):
    """Grid-search KNeighborsClassifier hyper-parameters using RMSLE.

    Searches ``n_neighbors`` in 1..99 and both ``weights`` options with
    5-fold cross-validation. Scores are negated RMSLE values (the scorer is
    built with ``greater_is_better=False``), so values closer to zero are
    better.

    The printed "Training Score" is the mean cross-validated score of a
    KNeighborsClassifier with default settings, measured with the same
    scorer so it is comparable to "Best score".

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Training targets.

    Returns
    -------
    dict
        The best parameter combination found (``clf.best_params_``).
    """
    print("START")
    k_range = list(range(1, 100))
    weight_options = ["uniform", "distance"]

    param_grid = dict(n_neighbors = k_range, weights = weight_options)
    algorithm = KNeighborsClassifier()

    # The folds are not shuffled, so no random_state is passed: current
    # scikit-learn rejects random_state when shuffle is False, and older
    # releases ignored it, so the fold assignment is unchanged.
    folds = KFold(n_splits = 5)
    scorer = make_scorer(rmsle, greater_is_better = False)
    clf = GridSearchCV(algorithm, param_grid, cv = folds, scoring = scorer)
    # Score the baseline with the RMSLE scorer; without `scoring` the
    # classifier's default accuracy was reported instead.
    cross_score = cross_val_score(estimator = algorithm, X = X, y = y, cv = folds, scoring = scorer)
    meanScore = np.average(cross_score)
    print("Training Score: ", meanScore)

    # np.asarray accepts both Series and arrays and avoids Series.ravel().
    clf.fit(X, np.asarray(y).ravel())
    print('Best score: {}'.format(clf.best_score_))
    print('Best parameters: {}'.format(clf.best_params_))
    print("STOP")
    return clf.best_params_

# Tune KNN on the training split and unpack the winning settings.
dictionary = knn_param_selection(X_train, y_train)
k_neighbors, k_weights = dictionary['n_neighbors'], dictionary['weights']

# Refit with the selected settings and score on the held-out split.
clf1 = KNeighborsClassifier(weights = k_weights, n_neighbors = k_neighbors)
clf1.fit(X_train, y_train)
y_predict = clf1.predict(X_test)
error = rmsle(y_test, y_predict)
print("KNeighborsClassifier Error:", error)

#START
#Training Score:  0.0136986301369863
#Best score: -0.2502592884857419
#Best parameters: {'n_neighbors': 22, 'weights': 'distance'}
#STOP
#KNeighborsClassifier Error: 0.2628318814728741

In [None]:
def svc_param_selection(X, y):
    """Grid-search svm.SVC hyper-parameters using RMSLE.

    Searches over ``C``, ``gamma`` and ``kernel`` with 5-fold
    cross-validation. Scores are negated RMSLE values (the scorer is built
    with ``greater_is_better=False``), so values closer to zero are better.

    The printed "Training Score" is the mean cross-validated score of an SVC
    with default settings, measured with the same scorer so it is comparable
    to "Best score".

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Training targets.

    Returns
    -------
    dict
        The best parameter combination found (``clf.best_params_``).
    """
    print("START")
    Cs = [0.01, 0.001, 0.1, 1]
    gammas = [0.01, 0.1, 0.001, 1]
    kernels = ['rbf', 'linear', 'poly', 'sigmoid']

    param_grid = {'C': Cs, 'gamma' : gammas, 'kernel': kernels}
    algorithm = svm.SVC()

    # The folds are not shuffled, so no random_state is passed: current
    # scikit-learn rejects random_state when shuffle is False, and older
    # releases ignored it, so the fold assignment is unchanged.
    folds = KFold(n_splits = 5)
    scorer = make_scorer(rmsle, greater_is_better = False)
    clf = GridSearchCV(algorithm, param_grid, cv = folds, scoring = scorer)
    # Score the baseline with the RMSLE scorer; without `scoring` the
    # classifier's default accuracy was reported instead.
    cross_score = cross_val_score(estimator = algorithm, X = X, y = y, cv = folds, scoring = scorer)
    meanScore = np.average(cross_score)
    print("Training Score: ", meanScore)

    # np.asarray accepts both Series and arrays and avoids Series.ravel().
    clf.fit(X, np.asarray(y).ravel())
    print('Best score: {}'.format(clf.best_score_))
    print('Best parameters: {}'.format(clf.best_params_))
    print("STOP")
    return clf.best_params_
    

# Tune the SVC on the training split and echo each selected value.
dictionary = svc_param_selection(X_train, y_train)
for param_name in ('C', 'gamma', 'kernel'):
    print(dictionary[param_name])

# Refit with the selected settings and score on the held-out split.
clf = svm.SVC(
    kernel = dictionary['kernel'],
    gamma = dictionary['gamma'],
    C = dictionary['C'],
)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
error = rmsle(y_test, y_predict)
print("Support Vector Machine Error:", error)

#START
#Training Score:  0.01278538812785388
#Best score: -0.24738287881866586
#Best parameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'linear'}
#STOP

#Support Vector Machine Error: 0.2156940509120464

In [None]:
def mlp_param_selection(X, y):
    """Grid-search MLPClassifier hyper-parameters using RMSLE.

    Searches over activation, solver, learning-rate schedule, ``alpha`` and
    hidden layer size with 3-fold cross-validation. Scores are negated RMSLE
    values (the scorer is built with ``greater_is_better=False``), so values
    closer to zero are better.

    Before the final fit, the whole grid search is itself cross-validated
    (nested cross-validation) and the mean score is printed as
    "Training Score".

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Training targets.

    Returns
    -------
    dict
        The best parameter combination found (``clf.best_params_``).
    """
    print("START")
    algorithm = MLPClassifier(random_state = 42)
    param_grid = {
          'activation' : ['relu', 'identity', 'logistic', 'tanh'],
          'solver': ['lbfgs', 'sgd', 'adam'],
          'learning_rate' : ['adaptive', 'constant'],
          'alpha': 10.0 ** - np.arange(2, 4),  # i.e. 1e-2 and 1e-3
          'hidden_layer_sizes': [700, 1000, 1200],
          'max_iter': [1000],
     }

    # The folds are not shuffled, so no random_state is passed: current
    # scikit-learn rejects random_state when shuffle is False, and older
    # releases ignored it, so the fold assignment is unchanged.
    folds = KFold(n_splits=3)
    scorer = make_scorer(rmsle, greater_is_better=False)
    clf = GridSearchCV(algorithm, param_grid, cv = folds, scoring = scorer)
    cross_score = cross_val_score(estimator = clf, X = X, y = y, cv = folds)
    meanScore = np.average(cross_score)
    print("Training Score: ", meanScore)

    clf.fit(X, y)
    print('Best score: {}'.format(clf.best_score_))
    print('Best parameters: {}'.format(clf.best_params_))
    print("STOP")
    return clf.best_params_

# Tune the MLP on the training split and unpack the winning settings.
dictionary = mlp_param_selection(X_train, y_train)
activation, solver, learning_rate, alpha, hidden_layer_sizes, max_iter = (
    dictionary[name]
    for name in ('activation', 'solver', 'learning_rate', 'alpha', 'hidden_layer_sizes', 'max_iter')
)

# Refit with the selected settings and score on the held-out split.
clf1 = MLPClassifier(
    activation = activation,
    solver = solver,
    learning_rate = learning_rate,
    max_iter = max_iter,
    alpha = alpha,
    hidden_layer_sizes = hidden_layer_sizes,
    random_state = 42,
)
clf1.fit(X_train, y_train)
y_predict = clf1.predict(X_test)
error = rmsle(y_test, y_predict)
print("MLPClassifier Error:", error)