In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score,recall_score,precision_score, mean_absolute_error,mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.linear_model import LinearRegression

In [4]:
# Notebook-wide settings.
# Default font size for every matplotlib figure drawn below.
plt.rcParams["font.size"] = 6
# Seed passed to train_test_split and tf.random.set_seed in later cells.
SEED = 1234

In [5]:
# Input/output CSV locations, relative to the notebook's working directory.
# Forward slashes are used on purpose: they resolve on Windows as well as on
# Linux/macOS, whereas a backslash is an ordinary filename character on POSIX
# systems and made the original paths Windows-only.
data_file = "data/train.csv"
test_file = "data/test.csv"
sub_file = "data/sub.csv"

In [None]:
# Load the training CSV and preview its first rows and summary statistics.
housing_data = pd.read_csv(data_file)
display(housing_data.head(), housing_data.describe())

# Regression target.
y = housing_data["SalePrice"]

# Every column label in the file; the count printed below leaves two of them out.
features = housing_data.columns
num_features = len(features) - 2

print (f"Num features in dataset: {num_features}")
print(features)

# Columns handled as categorical; later cells one-hot encode exactly these.
cats = [
    "MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities",
    "LotConfig", "LandSlope", "Neighborhood", "Condition1", "Condition2",
    "BldgType", "HouseStyle", "OverallQual", "OverallCond", "RoofStyle",
    "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual",
    "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC", "CentralAir",
    "Electrical", "KitchenQual", "Functional", "FireplaceQu", "GarageType",
    "GarageFinish", "GarageQual", "GarageCond", "PavedDrive", "PoolQC",
    "Fence", "MiscFeature", "SaleType", "SaleCondition",
]

# Every remaining column, in file order (this includes the target column).
non_cats = [name for name in features if name not in cats]
print(non_cats)

In [None]:
# Scatter every column against the target on an 11 x 8 grid of axes,
# filling the grid row by row.
rows, cols = 11, 8
fig, ax = plt.subplots(rows, cols, figsize=(30, 32))
i, j = 0, 0  # current grid row / column
for f in features:
    print(f"Plotting feature {f}")

    x = housing_data[f]
    present = x.notna()  # plot only the rows where this column has a value
    panel = ax[i, j]
    panel.scatter(x[present], y[present])
    panel.set_title(f)

    # Advance to the next cell, wrapping to the next row at the end of a row.
    j += 1
    if j == cols:
        i, j = i + 1, 0
plt.show()



In [None]:
# Integer-encode every column with pd.factorize, then correlate all pairs.
encoded = housing_data.apply(lambda col: pd.factorize(col)[0])
corr = encoded.corr()
# Keep pairs with |r| > 0.5 and discard |r| == 1 (which removes the diagonal).
strength = corr.abs()
mask = strength.gt(0.5) & strength.ne(1)
corr.where(mask).stack().sort_values()


In [None]:
# Fit a 3-component PCA (despite the variable's name) on the
# integer-encoded frame and show how much variance each component explains.
factorized = housing_data.apply(lambda col: pd.factorize(col)[0])
pca_2 = PCA(n_components=3).fit(factorized)
pca_2.explained_variance_ratio_

In [10]:
# Project the integer-encoded frame onto the fitted principal components.
factorized_housing = housing_data.apply(lambda col: pd.factorize(col)[0])
x_trans = pca_2.transform(factorized_housing)


In [None]:
# 3-D scatter: LotArea (x) and LotFrontage (y) against the target (z).
ax = plt.axes(projection="3d")
lot_area = housing_data["LotArea"]
lot_frontage = housing_data["LotFrontage"]
ax.scatter(lot_area, lot_frontage, y)
plt.show()

In [None]:
## ONE-HOT ENCODING AND FEATURE/TARGET CORRELATION REPORT

display(housing_data.head())

# Expand every categorical column into indicator columns named "<column>_<value>".
onehot_data = pd.get_dummies(data=housing_data, prefix=cats, columns=cats)
# The target must not be part of the model inputs.
onehot_data = onehot_data.drop(columns=["SalePrice"])

# Report how strongly each non-categorical column correlates with the target.
#
# The previous version printed "Dropping Feature ..." and called
# `onehot_data.drop(f, axis=1)` without keeping the result, so no column was
# ever removed. The columns are deliberately still kept here: later cells drop
# some columns by name and would raise if those columns were already gone.
# Low-correlation columns are collected in `low_corr_features` instead so they
# can be dropped explicitly where wanted.
low_corr_features = []
for f in non_cats:
    if f == "SalePrice":
        # Correlating the target with itself carries no information.
        continue
    c = housing_data[f].corr(y)
    if abs(c) > 0.5:
        CORRPAR = "++HIGH++"
    elif abs(c) < 0.2:
        CORRPAR = "--LOW--"
        low_corr_features.append(f)
    else:
        # Also reached when the correlation is NaN.
        CORRPAR = "~~WEAK~~"
    print (f"Correlation between {f} and Sale price is {c} - {CORRPAR}")

print(f"Low-correlation features (kept, candidates to drop): {low_corr_features}")



    

In [13]:
# scikit-learn linear regression (ordinary least squares, not gradient descent)
def run_reg(X_train, y_train, X_cv, y_cv, plot=False):
    """Fit a LinearRegression model and report the halved MSE on both splits.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_cv, y_cv : validation features and targets.
    plot : bool, default False
        When true, scatter the first feature column against the targets and
        against the predictions (in red), once for each split.

    Returns
    -------
    (train_mse, cv_mse) : tuple
        Mean squared error divided by 2 for the training and validation split.
    """
    linear_model = LinearRegression()

    # Train the model
    linear_model.fit(X_train, y_train)

    yhat_train = linear_model.predict(X_train)
    yhat_cv = linear_model.predict(X_cv)
    train_mse = mean_squared_error(y_train, yhat_train) / 2
    cv_mse = mean_squared_error(y_cv, yhat_cv) / 2
    print(f"Training Error: {train_mse} Validation Error {cv_mse}")

    if plot:
        # np.asarray makes the positional column slice work for DataFrames as
        # well as for arrays (`frame[:, 0]` is not valid on a DataFrame).
        first_train = np.asarray(X_train)[:, 0]
        first_cv = np.asarray(X_cv)[:, 0]

        plt.scatter(first_train, y_train)
        plt.scatter(first_train, yhat_train, color="r")
        plt.show()
        plt.scatter(first_cv, y_cv)
        plt.scatter(first_cv, yhat_cv, color="r")
        plt.show()
    return train_mse, cv_mse

In [None]:
# Leave-one-feature-out search: after removing the `pre_drop_f` columns, drop
# each remaining original column in turn, refit the linear model and record
# the validation error, then report the column whose removal gave the lowest.
fd_cv_err_list = []
onehot_data = onehot_data.apply(lambda x: x.fillna(0) if x.dtype.kind in 'biufc' else x.fillna('.'))
print(f"Total data size: {onehot_data.shape[0]}")
print(onehot_data.columns)
pre_drop_f = ["2ndFlrSF","1stFlrSF","LowQualFinSF","GrLivArea","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath"] #Since Dropping these reduces error significantly
dropped_features = []

# Identical for every iteration, so computed once instead of inside the loop.
base_df = onehot_data.drop(pre_drop_f, axis=1)

for f in features:
    # Names that are not columns of the encoded frame (one-hot encoded,
    # pre-dropped, or the target) cannot be dropped individually.
    if f not in base_df.columns:
        continue
    print(f"Dropping feature {f}")
    df = base_df.drop(f, axis=1)
    dropped_features.append(f)

    X_train, X_cv, y_train, y_cv = train_test_split(df, y, train_size=0.70, random_state=SEED)

    y_train = np.asarray(y_train)
    X_train = np.asarray(X_train)
    y_cv = np.asarray(y_cv)
    X_cv = np.asarray(X_cv)

    scaler_linear = StandardScaler()
    # Compute the mean and standard deviation of the training set then transform it
    X_train_scaled = scaler_linear.fit_transform(X_train)
    X_cv_scaled = scaler_linear.transform(X_cv)
    print(f"Training data size: {X_train_scaled.shape}")
    print(f"Validtion data size: {X_cv_scaled.shape}")

    print(f"Gradient Descent - Linear Regression")
    print("="*50)
    # Fit on the scaled matrices: they were computed above but the unscaled
    # arrays were being passed, leaving the scaling step without any effect.
    mse_err, cv_err = run_reg(X_train_scaled, y_train, X_cv_scaled, y_cv, plot=False)
    fd_cv_err_list.append(cv_err)

if fd_cv_err_list:
    min_i = int(np.argmin(fd_cv_err_list))
    # Index with min_i; the previous code used `i`, a stale counter left over
    # from an unrelated earlier cell, so it named the wrong feature.
    print(f"Dropping feature {dropped_features[min_i]} results  min Validation error {fd_cv_err_list[min_i]}")
else:
    print("No candidate feature was evaluated; nothing to report")


In [None]:
# Second-order polynomial regression on the complete one-hot encoded frame.
print(f"Total data size: {onehot_data.shape[0]}")
print(onehot_data.columns)
onehot_data = onehot_data.apply(
    lambda col: col.fillna(0) if col.dtype.kind in 'biufc' else col.fillna('.')
)

## Nothing reduces error significantly here, so no feature is dropped in this cell.
df = onehot_data

# 70/30 split; each of the four parts is converted to a NumPy array.
X_train, X_cv, y_train, y_cv = (
    np.asarray(part)
    for part in train_test_split(df, y, train_size=0.70, random_state=SEED)
)

poly = PolynomialFeatures(degree=2)
scaler_poly = StandardScaler()

# Fit the polynomial expansion and the scaler on the training set only ...
X_train_mapped = poly.fit_transform(X_train)
X_train_mapped_scaled = scaler_poly.fit_transform(X_train_mapped)

# ... then apply both fitted transforms to the validation set.
X_cv_mapped = poly.transform(X_cv)
X_cv_mapped_scaled = scaler_poly.transform(X_cv_mapped)

print(f"Gradient Descent - Second order Poly Regression")
print("=" * 50)
mse_err, cv_err = run_reg(
    X_train_mapped_scaled, y_train, X_cv_mapped_scaled, y_cv, plot=False
)

In [None]:
# Sweep the polynomial degree and record train/validation error per degree.
# Initialize lists to save the errors and feature transforms
train_mses = []
cv_mses = []
models = []   # stays empty: run_reg does not return the model it fits
polys = []
scalers = []
X_train, X_cv, y_train, y_cv = train_test_split(df, y, train_size=0.70, random_state=SEED)

# Degrees actually evaluated. Only 1 and 2: degree 3 ran out of memory on the
# author's machine. The same range drives the loop and the x-axis of the plot,
# so the two can no longer disagree in length (the plot previously used
# range(1, 11) and therefore could not be drawn).
degrees = range(1, 3)
for degree in degrees:

    # Add polynomial features to the training set
    poly = PolynomialFeatures(degree, include_bias=False)
    X_train_mapped = poly.fit_transform(X_train)
    polys.append(poly)

    # Scale the training set, then apply the fitted transforms to the CV set
    scaler_poly = StandardScaler()
    X_train_mapped_scaled = scaler_poly.fit_transform(X_train_mapped)
    scalers.append(scaler_poly)
    X_cv_mapped = poly.transform(X_cv)
    X_cv_mapped_scaled = scaler_poly.transform(X_cv_mapped)

    # Create and train the model
    print(f"Gradient Descent - {degree} order Poly Regression")

    train_mse, cv_mse = run_reg(X_train_mapped_scaled, y_train, X_cv_mapped_scaled, y_cv, plot=True)
    train_mses.append(train_mse)
    cv_mses.append(cv_mse)

# Plot the results
plt.title("degree of polynomial vs. train and CV MSEs")
plt.scatter(list(degrees), train_mses, label="train")
plt.scatter(list(degrees), cv_mses, label="CV")
plt.xlabel("degree")
plt.ylabel("MSE / 2")
plt.legend()
plt.show()

In [17]:
def run_test(poly,scaler_poly,model):
    """Predict SalePrice for the test CSV and write the submission file.

    Reads `test_file`, one-hot encodes the `cats` columns, aligns the result
    to the columns of the training frame `onehot_data`, applies the fitted
    `poly` and `scaler_poly` transforms, predicts with `model`, and writes the
    "Id" and "SalePrice" columns to `sub_file`.

    Parameters
    ----------
    poly : fitted transformer exposing `transform` (applied first).
    scaler_poly : fitted transformer exposing `transform` (applied second).
    model : fitted estimator exposing `predict`.

    Returns
    -------
    (y_pred, test_data) : the raw predictions and the test frame with a
    "SalePrice" column appended.
    """
    test_data = pd.read_csv(test_file)
    display(test_data.head())
    display(test_data.describe())

    test_onehot_data = pd.get_dummies(data = test_data,
                        prefix = cats,
                        columns = cats)

    train_columns = onehot_data.columns
    drop_fs = [c for c in test_onehot_data.columns if c not in train_columns]
    missing_cols = [c for c in train_columns if c not in test_onehot_data.columns]
    print(f"Columns to drop = {drop_fs}")

    # Align to the training layout in a single step: columns unknown to the
    # training frame are removed, columns absent from the test file are added
    # filled with 0, and the training column order is enforced.
    test_onehot_data = test_onehot_data.reindex(columns=train_columns, fill_value=0)
    # Added columns that are non-numeric in the training frame get the same
    # "." placeholder that is used for missing non-numeric values.
    for c in missing_cols:
        if onehot_data[c].dtype.kind not in 'biufc':
            test_onehot_data[c] = '.'

    print(f"Train Data Columns: {onehot_data.columns}")
    print(f"Test Data Columns: {test_onehot_data.columns}")
    test_onehot_data = test_onehot_data.apply(lambda x: x.fillna(0) if x.dtype.kind in 'biufc' else x.fillna('.'))

    X_test_mapped = poly.transform(test_onehot_data)
    X_test_mapped_scaled = scaler_poly.transform(X_test_mapped)
    y_pred = model.predict(X_test_mapped_scaled)
    print(y_pred.shape)

    # Flatten so the new column is 1-D even if the model returns shape (n, 1).
    test_data.insert(len(test_data.columns),"SalePrice",np.ravel(y_pred))
    test_data.to_csv(sub_file,columns=["Id","SalePrice"],index=False)
    return (y_pred,test_data)





In [None]:
# Final degree-2 pipeline: polynomial expansion -> standardisation -> linear fit,
# all fitted on the current training split.
poly = PolynomialFeatures(degree=2, include_bias=False)
scaler_poly = StandardScaler()

X_train_mapped = poly.fit_transform(X_train)
X_train_mapped_scaled = scaler_poly.fit_transform(X_train_mapped)

model = LinearRegression()
# Train the model
model.fit(X_train_mapped_scaled, y_train)


In [None]:
# Score the test file with the fitted polynomial pipeline; also writes the submission CSV.
y_pred, t = run_test(poly=poly, scaler_poly=scaler_poly, model=model)


In [None]:
# Define the neural-network regressor and standardise its inputs.
tf.random.set_seed(SEED)  # applied to achieve consistent results
scaler_linear = StandardScaler()
# Compute the mean and standard deviation of the training set then transform it
X_train_scaled = scaler_linear.fit_transform(X_train)
X_cv_scaled = scaler_linear.transform(X_cv)
n = X_train.shape[1]  # number of input features

# Four ReLU hidden layers (256-128-64-32) and one linear output unit.
# No kernel regulariser is applied.
model = Sequential(
    [
        tf.keras.Input(shape=(n,)),
        Dense(256, activation="relu", name="layer1"),
        Dense(128, activation="relu", name="layer2"),
        Dense(64, activation="relu", name="layer3"),
        Dense(32, activation="relu", name="layer4"),
        Dense(1, name="layer5"),
    ]
)

# Show the initial weights of the first and the last layer.
W1, b1 = model.get_layer("layer1").get_weights()
W5, b5 = model.get_layer("layer5").get_weights()
print(f"W1{W1.shape}:\n", W1, f"\nb1{b1.shape}:", b1)
# These are layer5's weights; they were previously printed under the labels W2/b2.
print(f"W5{W5.shape}:\n", W5, f"\nb5{b5.shape}:", b5)

model.summary()

In [21]:
## Neural-network regression: learning-rate sweep
train_mses = []
cv_mses = []
models = []
learning_rates = [0.01, 0.001, 0.0001, 0.00001, 0.000001]
epochs = [300, 300, 3000, 30000, 300000]
# NOTE(review): X_train_scaled / X_cv_scaled come from the model-definition
# cell; this split is assumed to select the same rows (same data, same seed).
X_train, X_cv, y_train, y_cv = train_test_split(onehot_data, y, train_size=0.70, random_state=SEED)

# `model` only serves as the architecture template. Each learning rate trains
# its own freshly initialised copy; previously one model object was trained
# again and again and appended five times, so the runs were not independent
# and every entry of `models` referred to the same network.
base_model = model
for lr, ep in zip(learning_rates, epochs):
    print(f"X_train Shape {X_train.shape} Y_train Shape {y_train.shape}")
    print(f"Running with LR: {lr} and {ep} epochs")

    candidate = tf.keras.models.clone_model(base_model)
    candidate.compile(
        loss="mse",
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
    )
    candidate.fit(
        X_train_scaled, y_train,
        epochs=ep,
        verbose=1
    )
    models.append(candidate)

    yhat_train = candidate.predict(X_train_scaled)
    train_mse = mean_squared_error(y_train, yhat_train) / 2
    yhat_cv = candidate.predict(X_cv_scaled)
    cv_mse = mean_squared_error(y_cv, yhat_cv) / 2
    train_mses.append(train_mse)
    cv_mses.append(cv_mse)
    print(f"X_train Shape {X_train.shape} Yhat_train Shape {yhat_train.shape}, features: {n}")
    print(f"Training Error: {train_mse} Validation Error {cv_mse}, features: {n}")

    plt.scatter(X_train_scaled[:,0], y_train)
    plt.scatter(X_train_scaled[:,0], yhat_train, color="r")
    plt.show()

# Pick the run with the lowest validation error. The list `cv_mses` is searched
# (not the last scalar `cv_mse`) and `best_i` is used for every lookup; the
# previous code indexed with `i`, a stale counter from an unrelated cell.
best_i = int(np.argmin(cv_mses))
best_model = models[best_i]
# Later cells predict with `model`, so point it at the selected network.
model = best_model
print(f"Identified best LR as {learning_rates[best_i]} with error {cv_mses[best_i]}")

KeyboardInterrupt: 

In [None]:
# Predict the test file with the neural network and write the submission CSV.
test_data = pd.read_csv(test_file)
test_onehot_data = pd.get_dummies(data = test_data,
                    prefix = cats,
                    columns = cats)

train_columns = onehot_data.columns
drop_fs = [c for c in test_onehot_data.columns if c not in train_columns]
missing_cols = [c for c in train_columns if c not in test_onehot_data.columns]

# Align to the training layout in a single step: columns unknown to the
# training frame are removed, columns absent from the test file are added
# filled with 0, and the training column order is enforced.
test_onehot_data = test_onehot_data.reindex(columns=train_columns, fill_value=0)
# Added columns that are non-numeric in the training frame get the same "."
# placeholder that is used for missing non-numeric values.
for c in missing_cols:
    if onehot_data[c].dtype.kind not in 'biufc':
        test_onehot_data[c] = '.'
test_onehot_data = test_onehot_data.apply(lambda x: x.fillna(0) if x.dtype.kind in 'biufc' else x.fillna('.'))

X_test_scaled = scaler_linear.transform(test_onehot_data)
y_pred = model.predict(X_test_scaled)

# The network has a single output unit, so predict() yields shape (n, 1);
# flatten it so the inserted column is 1-D whatever the pandas version.
test_data.insert(len(test_data.columns),"SalePrice",np.ravel(y_pred))
test_data.to_csv(sub_file,columns=["Id","SalePrice"],index=False)



