In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
from scipy import stats
from sklearn.decomposition import PCA
import random

from mpl_toolkits.mplot3d import *
import ipywidgets as widgets
from ipywidgets import interact, interactive, Dropdown
import matplotlib.animation as animation
from IPython.display import HTML

In [2]:
# Preprocessing and import.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists; consider a path relative to the repository.
raw_df = pd.read_csv('/Users/albert/Documents/GitHub/housing_analysis/house-data/train.csv')

# Keep only the numeric columns, then drop every row that has a missing value.
# Building `df` in a single chain (instead of calling `dropna(inplace=True)` on
# a column subset of the raw frame) avoids the SettingWithCopyWarning that the
# in-place call can trigger, and leaves the raw frame available as `raw_df`.
df = raw_df.select_dtypes(include=np.number).dropna()

In [3]:
def pca_dec(data, n):
    """Fit a PCA with ``n`` components on ``data`` and project ``data`` onto it.

    Returns a ``(projected_data, fitted_pca)`` tuple, where ``projected_data``
    is the result of ``fit_transform`` and ``fitted_pca`` is the PCA object
    that produced it.
    """
    decomposer = PCA(n_components=n)
    return decomposer.fit_transform(data), decomposer

# Decomposing the train set.
# NOTE(review): `df` is passed to the PCA as-is, i.e. it still contains
# SalePrice and no scaling step is applied in this notebook -- confirm that
# this is intended.
N_PCA_COMPONENTS = 10
pca_train_results, pca_train = pca_dec(df, N_PCA_COMPONENTS)

# Creating a table with the explained variance ratio.
# The labels are derived from the fitted model, so they always match the
# number of components actually computed (previously 10 was hard-coded twice).
explained_ratios = pca_train.explained_variance_ratio_
names_pcas = [f"PCA Component {k}" for k in range(1, len(explained_ratios) + 1)]
scree = pd.DataFrame(
    list(zip(names_pcas, explained_ratios)),
    columns=["Component", "Explained Variance Ratio"],
)

# Loadings of every variable on the first principal component,
# sorted from the largest (signed) loading to the smallest.
df_new = pd.DataFrame({'PCA': pca_train.components_[0], 'Variable Names': list(df.columns)})
df_new = df_new.sort_values('PCA', ascending=False)

In [4]:
# Sorting the absolute values of the first principal component by magnitude.
# `.assign` builds a new frame, so `df_new` keeps its signed loadings.
# `pd.DataFrame(df_new)` does not copy the data, and depending on the pandas
# version a column assignment through it could overwrite `df_new` as well.
df2 = (
    df_new
    .assign(PCA=df_new['PCA'].abs())
    .sort_values('PCA', ascending=False)
)
# NOTE(review): this displays `df_new` (signed ordering), not `df2`.
df_new.head()

Unnamed: 0,PCA,Variable Names
37,0.999535,SalePrice
3,0.029625,LotArea
16,0.004449,GrLivArea
12,0.003306,TotalBsmtSF
13,0.00283,1stFlrSF


In [5]:
# Identifiers for the top 3 / 10 / 20 features, in the row order of `df_new`.
# SalePrice is the quantity being regressed on, so it is excluded by name
# rather than by assuming it always occupies the first row of `df_new`.
ranked_features = [name for name in df_new["Variable Names"].tolist() if name != "SalePrice"]
PCA_features = ranked_features[:3]
PCA_features_top10 = ranked_features[:10]
PCA_features_top20 = ranked_features[:20]

In [6]:
def mean_correlation(pc_info):
    """Average the value stored at index 1 of every entry in ``pc_info``.

    ``pc_info`` maps a feature name to a sequence whose second element is the
    correlation recorded for that feature.  An empty mapping raises
    ``ZeroDivisionError``.
    """
    correlations = (entry[1] for entry in pc_info.values())
    return sum(correlations) / len(pc_info)

In [7]:
def f(rank, attribute, view = True):
    """Fit polynomials of degree 1..rank of SalePrice on one column of ``df``.

    Parameters
    ----------
    rank : int
        Highest polynomial degree; every degree from 1 to ``rank`` is fitted.
    attribute : str
        Name of the ``df`` column used as the predictor.
    view : bool, default True
        When falsy, nothing is plotted and the fitted models are returned.
        When truthy, the scatter and the fitted curves are plotted.

    Returns
    -------
    dict or None
        With a falsy ``view``: ``{degree: [coefficients, predicted,
        correlation, attribute]}``, where ``correlation`` is computed between
        the predicted and the measured SalePrice (Pearson for degree 1,
        Spearman for higher degrees).  With a truthy ``view``: ``None``.
    """
    measured = df["SalePrice"]
    predictor = df[attribute]
    COLORS = ["red", "blue", "green", "orange", "yellow", "gray", "cyan", "purple"]

    model = {}
    for degree in range(1, rank + 1):
        data_reg = np.polyfit(predictor, measured, degree)
        predicted = np.polyval(data_reg, predictor.tolist())
        # Degree 1 is scored with Pearson, higher degrees with Spearman.
        correlate = stats.pearsonr if degree == 1 else stats.spearmanr
        model[degree] = [data_reg,
                         predicted,
                         correlate(predicted, measured)[0], attribute]

    # `not view` (rather than `view is False`) also honours falsy values that
    # are not the `False` singleton, e.g. a numpy boolean.
    if not view:
        return model

    # Leave a margin of a quarter of the mean beyond the largest value.
    x_upper = predictor.max() + predictor.mean() / 4
    y_upper = measured.max() + measured.mean() / 4
    xp = np.linspace(1, x_upper, 20)

    fig, ax = plt.subplots(figsize=(14, 9))
    ax.scatter(predictor, measured, label="degree : correlation", color="black")
    for degree in range(1, rank + 1):
        ax.plot(
            xp,
            np.polyval(model[degree][0], xp),
            label=f"{degree} : {round(model[degree][2], 5)}",
            # Cycle through the palette so rank > len(COLORS) does not raise IndexError.
            color=COLORS[(degree - 1) % len(COLORS)],
        )

    ax.set_ylim(bottom=0, top=y_upper)
    ax.set_xlim(0, x_upper)
    ax.set_title(f"SalePrice vs {attribute}")
    ax.set_xlabel(f"{attribute}")
    ax.set_ylabel("SalePrice")
    ax.grid()
    ax.legend()

In [8]:
# Random-feature baseline, 100 iterations:
# each iteration takes 3 distinct random features and performs linear regression,
# mean_correlation returns the mean correlation of these 3 random features,
# average_correlation is the average of this over all iterations.
# NOTE(review): the candidates are all columns of `df`, including SalePrice
# itself -- confirm whether the target should be part of the baseline.
N_ITERATIONS = 100
N_RANDOM_FEATURES = 3
candidate_features = list(df.columns)
# Dedicated, seeded generator: the baseline is reproducible and the global
# `random` state is left untouched.
baseline_rng = random.Random(42)

average_correlation = 0
for _ in range(N_ITERATIONS):
    # `sample` draws without replacement. Repeated `random.choice` calls could
    # pick the same feature twice, which collapsed into one dict key and
    # silently averaged over fewer than 3 features.
    ran_features = baseline_rng.sample(candidate_features, N_RANDOM_FEATURES)
    pc_info = {}
    for feature in ran_features:
        model = f(1, feature, view = False)
        # Entry layout: [polynomial degree, correlation];
        # model[1][2] is the correlation coefficient.
        pc_info[feature] = [1, model[1][2]]
    average_correlation += mean_correlation(pc_info)
print(average_correlation/N_ITERATIONS)

0.3309736152023354


In [9]:
# Degree-1 fit of SalePrice against each feature in PCA_features.
pc_info = {}
for feature in PCA_features:
    model = f(1, feature, view = False)
    # Entry layout: [polynomial degree, correlation];
    # model[1][2] is the correlation coefficient.
    pc_info[feature] = [1, model[1][2]]

print(mean_correlation(pc_info))
# The mean correlation for the PCA features is higher.

0.5402426699561254


In [10]:
# Degree-1 fit of SalePrice against each feature in PCA_features_top10.
pc_info = {}
for feature in PCA_features_top10:
    model = f(1, feature, view = False)
    # Entry layout: [polynomial degree, correlation];
    # model[1][2] is the correlation coefficient.
    pc_info[feature] = [1, model[1][2]]

print(mean_correlation(pc_info))
# The mean correlation for the PCA features is still higher.

0.4583848218401875


In [11]:
# Degree-1 fit of SalePrice against each feature in PCA_features_top20.
pc_info = {}
for feature in PCA_features_top20:
    model = f(1, feature, view = False)
    # Entry layout: [polynomial degree, correlation];
    # model[1][2] is the correlation coefficient.
    pc_info[feature] = [1, model[1][2]]

print(mean_correlation(pc_info))
# The mean correlation for the PCA features is still higher!

0.4200755628453985


In [12]:
# Interactive plotter: choose the maximum polynomial degree and the predictor.
attribute = Dropdown(options = list(df.columns))
@interact(rank = (1, 6), attribute = attribute)
def g(rank, attribute):
    """Plot SalePrice against ``attribute`` with polynomial fits of degree 1..rank.

    The fitting and plotting are delegated to ``f`` so that this cell and ``f``
    cannot drift apart; this function previously carried a verbatim copy of
    ``f``'s plotting path.
    """
    f(rank, attribute, view = True)

interactive(children=(IntSlider(value=3, description='rank', max=6, min=1), Dropdown(description='attribute', …