In [263]:
import pandas as pd
import numpy as np
import matplotlib as plt
import plotly.express as px
import altair as alt
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression

## 1.1 Summary Statistics

In [133]:
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
flower_df = pd.read_csv(url, header=None, names=["Sepal Length", "Sepal Width", "Petal Length", "Petal Width","Name"])

Print the first 5 elements of the dataframe.

In [134]:
flower_df.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [13]:
flower_df.dtypes

Sepal Length    float64
Sepal Width     float64
Petal Length    float64
Petal Width     float64
Name             object
dtype: object

There are 4 numeric features:
Sepal Length: float64
Sepal Width: float64
Peal Length: float64
Petal Width: float64

In [16]:
flower_df.describe()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


## 1.2 Data Visualization

Histograms # 1: Feature 1 Sepal Length

In [58]:
hist_f1 = px.histogram(flower_df.iloc[:,0], x='Sepal Length', title='Histogram of Sepal Length')
hist_f1.show()

Histograms # 2: Feature 2 Sepal Width

In [55]:
hist_f2 = px.histogram(flower_df.iloc[:,1], x='Sepal Width', title='Histogram of Sepal Width')
hist_f2.show()

Histograms # 3: Feature 3 Petal Length

In [54]:
hist_f3 = px.histogram(flower_df.iloc[:,2], x='Petal Length', title='Histogram of Petal Length')
hist_f3.show()


Histograms # 4: Feature 4 Petal Width

In [57]:
hist_f4 = px.histogram(flower_df.iloc[:,3], x='Petal Width', title='Histogram of Petal Width')
hist_f4.show()

Boxplot # 1: Feature 1 Sepal Length

In [65]:
box_f1 = px.box(flower_df.iloc[:,0], title='Sepal Length')
box_f1


Boxplot # 2: Feature 2 Sepal Width

In [66]:
box_f2 = px.box(flower_df.iloc[:,1], title='Sepal Width')
box_f2

Boxplot # 3: Feature 3 Petal Length

In [67]:
box_f3 = px.box(flower_df.iloc[:,2], title='Petal Length')
box_f3

Boxplot # 4: Feature 4 Petal Width

In [74]:
box_f4 = px.box(flower_df.iloc[:,3], title='Petal Width')
box_f4

Boxplot # 5: Combined Boxplot

In [94]:
box_combined = go.Figure()

for i in flower_df.columns:
    box_combined.add_trace(go.Box(y=flower_df[i].values, name=flower_df[i].name))
box_combined.update_layout(title="Combined Boxplot of All Features",
                          yaxis_title='measurements')
box_combined.show()


## 2.1 Imputation

In [200]:
pd.set_option("display.max_columns", 100)
url = "https://raw.githubusercontent.com/cs6220/cs6220.spring2019/master/data/AmesHousing.txt"
housing_df = pd.read_csv(url, sep="\t")

Task 1: How many features have missing values?
There are 27 features in the housing dataset contain missing values

In [201]:
housing_df.isna().any().sum()

27

Task 2: Fill each missing nominal feature value with the string “Missing”.

In [202]:
housing_df.select_dtypes(include=['object']).fillna('Missing')

Unnamed: 0,MS Zoning,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin Type 2,Heating,Heating QC,Central Air,Electrical,Kitchen Qual,Functional,Fireplace Qu,Garage Type,Garage Finish,Garage Qual,Garage Cond,Paved Drive,Pool QC,Fence,Misc Feature,Sale Type,Sale Condition
0,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,Hip,CompShg,BrkFace,Plywood,Stone,TA,TA,CBlock,TA,Gd,Gd,BLQ,Unf,GasA,Fa,Y,SBrkr,TA,Typ,Gd,Attchd,Fin,TA,TA,P,Missing,Missing,Missing,WD,Normal
1,RH,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,Gable,CompShg,VinylSd,VinylSd,,TA,TA,CBlock,TA,TA,No,Rec,LwQ,GasA,TA,Y,SBrkr,TA,Typ,Missing,Attchd,Unf,TA,TA,Y,Missing,MnPrv,Missing,WD,Normal
2,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,TA,TA,CBlock,TA,TA,No,ALQ,Unf,GasA,TA,Y,SBrkr,Gd,Typ,Missing,Attchd,Unf,TA,TA,Y,Missing,Missing,Gar2,WD,Normal
3,RL,Pave,Missing,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,Hip,CompShg,BrkFace,BrkFace,,Gd,TA,CBlock,TA,TA,No,ALQ,Unf,GasA,Ex,Y,SBrkr,Ex,Typ,TA,Attchd,Fin,TA,TA,Y,Missing,Missing,Missing,WD,Normal
4,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,,TA,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Gd,Y,SBrkr,TA,Typ,TA,Attchd,Fin,TA,TA,Y,Missing,MnPrv,Missing,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,RL,Pave,Missing,IR1,Lvl,AllPub,CulDSac,Gtl,Mitchel,Norm,Norm,1Fam,SLvl,Gable,CompShg,HdBoard,HdBoard,,TA,TA,CBlock,TA,TA,Av,GLQ,Unf,GasA,TA,Y,SBrkr,TA,Typ,Missing,Detchd,Unf,TA,TA,Y,Missing,GdPrv,Missing,WD,Normal
2926,RL,Pave,Missing,IR1,Low,AllPub,Inside,Mod,Mitchel,Norm,Norm,1Fam,1Story,Gable,CompShg,HdBoard,HdBoard,,TA,TA,CBlock,Gd,TA,Av,BLQ,ALQ,GasA,TA,Y,SBrkr,TA,Typ,Missing,Attchd,Unf,TA,TA,Y,Missing,MnPrv,Missing,WD,Normal
2927,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,Norm,1Fam,SFoyer,Gable,CompShg,HdBoard,Wd Shng,,TA,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,TA,Y,SBrkr,TA,Typ,Missing,Missing,Missing,Missing,Missing,Y,Missing,MnPrv,Shed,WD,Normal
2928,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Mod,Mitchel,Norm,Norm,1Fam,1Story,Gable,CompShg,HdBoard,HdBoard,,TA,TA,CBlock,Gd,TA,Av,ALQ,LwQ,GasA,Gd,Y,SBrkr,TA,Typ,TA,Attchd,RFn,TA,TA,Y,Missing,Missing,Missing,WD,Normal


Task 3: Interpolate each missing numeric feature value using linear interpolation.

In [207]:
housing_df._get_numeric_data().interpolate() #Method is linear by default

Unnamed: 0,Order,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,TotRms AbvGrd,Fireplaces,Garage Yr Blt,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
0,1,526301100,20,141.0,31770,6,5,1960,1960,112.0,639.0,0.0,441.0,1080.0,1656,0,0,1656,1.0,0.0,1,0,3,1,7,2,1960.0,2.0,528.0,210,62,0,0,0,0,0,5,2010,215000
1,2,526350040,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,882.0,896,0,0,896,0.0,0.0,1,0,2,1,5,0,1961.0,1.0,730.0,140,0,0,0,120,0,0,6,2010,105000
2,3,526351010,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,0,0,1329,0.0,0.0,1,1,3,1,6,0,1958.0,1.0,312.0,393,36,0,0,0,0,12500,6,2010,172000
3,4,526353030,20,93.0,11160,7,5,1968,1968,0.0,1065.0,0.0,1045.0,2110.0,2110,0,0,2110,1.0,0.0,2,1,3,1,8,2,1968.0,2.0,522.0,0,0,0,0,0,0,0,4,2010,244000
4,5,527105010,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,928.0,928,701,0,1629,0.0,0.0,2,1,3,1,6,1,1997.0,2.0,482.0,212,34,0,0,0,0,0,3,2010,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2926,923275080,80,37.0,7937,6,6,1984,1984,0.0,819.0,0.0,184.0,1003.0,1003,0,0,1003,1.0,0.0,1,0,3,1,6,0,1984.0,2.0,588.0,120,0,0,0,0,0,0,3,2006,142500
2926,2927,923276100,20,49.5,8885,5,5,1983,1983,0.0,301.0,324.0,239.0,864.0,902,0,0,902,1.0,0.0,1,0,2,1,5,0,1983.0,2.0,484.0,164,0,0,0,0,0,0,6,2006,131000
2927,2928,923400125,85,62.0,10441,5,5,1992,1992,0.0,337.0,0.0,575.0,912.0,970,0,0,970,0.0,1.0,1,0,3,1,6,0,1979.0,0.0,0.0,80,32,0,0,0,0,700,7,2006,132000
2928,2929,924100070,20,77.0,10010,5,5,1974,1975,0.0,1071.0,123.0,195.0,1389.0,1389,0,0,1389,1.0,0.0,1,0,2,1,6,1,1975.0,2.0,418.0,240,38,0,0,0,0,0,4,2006,170000


## 2.2 Standardization

Task 1: Standardize the imputed feature data so that the values of each numeric feature
are standard normally distributed (i.e., each feature is Gaussian with zero mean and unit
variance). Note that scikit-learn may be used to perform feature standardization
(see sklearn.preprocessing.scale).

In [208]:
from sklearn.preprocessing import StandardScaler

In [209]:
housing_df._get_numeric_data().head()

Unnamed: 0,Order,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,TotRms AbvGrd,Fireplaces,Garage Yr Blt,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
0,1,526301100,20,141.0,31770,6,5,1960,1960,112.0,639.0,0.0,441.0,1080.0,1656,0,0,1656,1.0,0.0,1,0,3,1,7,2,1960.0,2.0,528.0,210,62,0,0,0,0,0,5,2010,215000
1,2,526350040,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,882.0,896,0,0,896,0.0,0.0,1,0,2,1,5,0,1961.0,1.0,730.0,140,0,0,0,120,0,0,6,2010,105000
2,3,526351010,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,0,0,1329,0.0,0.0,1,1,3,1,6,0,1958.0,1.0,312.0,393,36,0,0,0,0,12500,6,2010,172000
3,4,526353030,20,93.0,11160,7,5,1968,1968,0.0,1065.0,0.0,1045.0,2110.0,2110,0,0,2110,1.0,0.0,2,1,3,1,8,2,1968.0,2.0,522.0,0,0,0,0,0,0,0,4,2010,244000
4,5,527105010,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,928.0,928,701,0,1629,0.0,0.0,2,1,3,1,6,1,1997.0,2.0,482.0,212,34,0,0,0,0,0,3,2010,189900


In [238]:

before_scaling_df = housing_df._get_numeric_data()
#Drop columns that are irrelevant numeric features and the class Sale Price as the dependent variable in numeric datatype
before_scaling_df = before_scaling_df.drop(['Order', 'PID', 'MS SubClass'], axis=1).dropna()
scaler = StandardScaler()
scaler.fit(before_scaling_df)
after_scaling_array = scaler.transform(before_scaling_df)
after_scaling_df = pd.DataFrame(after_scaling_array, columns=before_scaling_df.columns)

Task 2: Visualize the results using box plots.



How do the plots differ from box plots made before feature standardization? \
Answer: \
The main differences are shown in the distribution of graphs and unstandardized mean. Before scaling, the plot could \
only demonstrate Lot Area and Misc Val as the most obvious features as opposed to the after standardization graph  with most features visible, it transforms a highly skewed graph into a Gaussian distribution. The standardization also changes the means into zero mean and unit variance.

Which feature has the outlier furthest from the mean before and after standardization?\
Answer: \
Before: Lot Area has the farthest outlier among other features. SalePrice is a class with the farthest outlier \
After: Misc Val (Dollar value of miscellaneous feature)



In [239]:
#Boxplot before scaling
box_before = go.Figure()
for i in before_scaling_df.columns:
    box_before.add_trace(go.Box(y=before_scaling_df[i].values, name=before_scaling_df[i].name))
box_before.update_layout(title="Boxplot of Numeric Features Before Scaling",
                          yaxis_title='measurements')
box_before.show()

In [217]:
#Boxplot After scaling
box_after = go.Figure()
for i in after_scaling_df.columns:
    box_after.add_trace(go.Box(y=after_scaling_df[i].values, name=after_scaling_df[i].name))
box_after.update_layout(title="Boxplot of Numeric Features After Scaling",
                          yaxis_title='measurements')
box_after.show()

## 2.3 Feature Selection

In [240]:
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest

Task 1: To get an idea of their relative importance, estimate the mutual information be-
tween the numeric features and the class column, ‘SalePrice’. Note that scikit-learn
may be used to estimate mutual information (see sklearn.feature selection).

In [218]:
after_scaling_df

Unnamed: 0,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,TotRms AbvGrd,Fireplaces,Garage Yr Blt,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
0,3.072506,2.744381,-0.067254,-0.506718,-0.375537,-1.163488,0.056417,0.431097,-0.293973,-0.269134,0.064433,1.267003,-0.783185,-0.101005,0.309265,1.083694,-0.249311,-1.024793,-0.755203,0.176094,-0.207291,0.354167,2.162180,-0.710413,0.306647,0.256641,0.920121,0.214409,-0.358838,-0.103134,-0.285354,-0.063031,-0.089422,-0.448057,1.678499,0.428229
1,0.461265,0.187097,-0.776079,0.393091,-0.342468,-1.115542,-0.568996,0.055696,0.557395,-0.658284,-0.385015,-0.672643,-0.783185,-0.101005,-1.194427,-0.822046,-0.249311,-1.024793,-0.755203,-1.032234,-0.207291,-0.917535,-0.925143,-0.671234,-1.008387,1.196133,0.366061,-0.704493,-0.358838,-0.103134,1.854530,-0.063031,-0.089422,-0.079602,1.678499,-0.948957
2,0.504073,0.522814,-0.067254,0.393091,-0.441674,-1.259380,0.034081,1.054570,-0.293973,-0.348784,0.629649,0.432445,-0.783185,-0.101005,-0.337718,-0.822046,-0.249311,-1.024793,1.234675,0.176094,-0.207291,-0.281684,-0.925143,-0.788771,-1.008387,-0.747965,2.368594,-0.170937,-0.358838,-0.103134,-0.285354,-0.063031,21.985725,-0.079602,1.678499,-0.110125
3,1.017759,0.128458,0.641571,-0.506718,-0.110988,-0.779919,-0.568996,1.366306,-0.293973,1.105408,2.402474,2.425687,-0.783185,-0.101005,1.207523,1.083694,-0.249311,0.784028,1.234675,0.176094,-0.207291,0.990018,2.162180,-0.396980,0.306647,0.228735,-0.742060,-0.704493,-0.358838,-0.103134,-0.285354,-0.063031,-0.089422,-0.816513,1.678499,0.791305
4,0.204422,0.467348,-0.776079,-0.506718,0.848000,0.658466,-0.568996,0.764787,-0.293973,-0.960956,-0.280598,-0.590974,0.853432,-0.101005,0.255844,-0.822046,-0.249311,0.784028,1.234675,0.176094,-0.207291,-0.281684,0.618518,0.739214,0.306647,0.042697,0.935952,-0.200579,-0.358838,-0.103134,-0.285354,-0.063031,-0.089422,-1.184969,1.678499,0.113980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,-1.379445,-0.280621,-0.067254,0.393091,0.418109,-0.012780,-0.568996,0.826256,-0.293973,-0.853997,-0.110352,-0.399561,-0.783185,-0.101005,-0.982723,1.083694,-0.249311,-1.024793,-0.755203,0.176094,-0.207291,-0.281684,-0.925143,0.229886,0.306647,0.535698,0.207758,-0.704493,-0.358838,-0.103134,-0.285354,-0.063031,-0.089422,-1.184969,-1.360118,-0.479462
2926,,-0.160296,-0.776079,-0.506718,0.385040,-0.060726,-0.568996,-0.310923,1.621605,-0.728832,-0.425874,-0.657330,-0.783185,-0.101005,-1.182556,1.083694,-0.249311,-1.024793,-0.755203,-1.032234,-0.207291,-0.917535,-0.925143,0.190707,0.306647,0.051999,0.556024,-0.704493,-0.358838,-0.103134,-0.285354,-0.063031,-0.089422,-0.079602,-1.360118,-0.623440
2927,-0.309265,0.037199,-0.776079,-0.506718,0.682657,0.370789,-0.568996,-0.231891,-0.293973,0.035814,-0.316917,-0.483783,-0.783185,-0.101005,-1.048015,-0.822046,3.828799,-1.024793,-0.755203,0.176094,-0.207291,-0.281684,-0.925143,,-2.323422,-2.199061,-0.108848,-0.230221,-0.358838,-0.103134,-0.285354,-0.063031,1.146786,0.288854,-1.360118,-0.610920
2928,0.332844,-0.017506,-0.776079,-0.506718,0.087423,-0.444296,-0.568996,1.379478,0.433237,-0.828964,0.765845,0.585575,-0.783185,-0.101005,-0.219006,1.083694,-0.249311,-1.024793,-0.755203,-1.032234,-0.207291,-0.281684,0.618518,-0.122726,0.306647,-0.254964,1.157576,-0.141295,-0.358838,-0.103134,-0.285354,-0.063031,-0.089422,-0.816513,-1.360118,-0.135165


In [242]:
X = after_scaling_df.iloc[:,:-1]
y = before_scaling_df.iloc[:, -1]
mutual_information = mutual_info_regression(X, y)
mutual_information

array([0.25198244, 0.17619043, 0.60260425, 0.13718395, 0.43458858,
       0.31506384, 0.19082996, 0.20496058, 0.01391504, 0.13752168,
       0.41229992, 0.38691597, 0.21545399, 0.01069907, 0.50897406,
       0.03356029, 0.01053562, 0.29623006, 0.03039677, 0.05724079,
       0.01329677, 0.20327649, 0.16440776, 0.3285628 , 0.37591292,
       0.40280877, 0.11590142, 0.1875917 , 0.05193011, 0.        ,
       0.00527383, 0.        , 0.00483971, 0.01135581, 0.0062628 ])

Task 2: What are the top 5 numeric features ranked by mutual information? Note that
features with a higher estimated mutual information are considered more informative.

The top 5 features selected are: 
1. Overall Qual
2. Year Built
3. Total Bsmt SF
4. Gr Liv Area
5. Garage Area

In [245]:
#Select the top 5 features
selector = SelectKBest(mutual_info_regression, k=5)
selector.fit(X,y)

idx_selected = selector.get_support(indices=True)

t5_features_df = X.iloc[:, idx_selected]

print(t5_features_df.columns)



Index(['Overall Qual', 'Year Built', 'Total Bsmt SF', 'Gr Liv Area',
       'Garage Area'],
      dtype='object')


Task 3: How do you expect the values for the top-ranked feature to affect the sales price
(i.e., would you expect the sales price to increase when its values go up or down)? Why?


Answer: These five features have positive correlations with the SalePrice that indicates if its values go up the sales price will also go up.

In [266]:
for i in idx_selected:
    print(mutual_information[i])
    

0.6026042492641923
0.43458858308096104
0.41229991865131765
0.5089740581044637
0.402808766469235
