In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn import datasets
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Case 5: Imputation

Andrew Larsen  
Matthew Chinchilla  
Rikel Djoko  

## Baseline Model Development

In [4]:
# Baseline Lin Reg Model Here
california = datasets.fetch_california_housing()
cal = pd.DataFrame(california.data)
cal.columns = california.feature_names
cal['MedInc'] = california.target

In [5]:
cal.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,4.526,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,3.585,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,3.521,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,3.413,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.422,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [6]:
#check for NA
cal.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [7]:
#define variable and  split data
cols = ['HouseAge','AveRooms','AveBedrms','Population','AveOccup','Latitude','Longitude']
X = cal[cols]
y = cal["MedInc"]
#split the data
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.4, random_state = 0)

In [21]:
# Get the training and testing row indices As global variable to use for all cases
train_X_index = train_X.index.values.astype(int)
test_X_index = test_X.index.values.astype(int)
train_y_index = train_y.index.values.astype(int)
test_y_index = test_y.index.values.astype(int)

In [8]:
#train model using linear regression
lin_model = LinearRegression()
lin_model.fit(train_X, train_y)
coef_df = pd.DataFrame(lin_model.coef_, cols, columns = ["coef"])
coef_df

Unnamed: 0,coef
HouseAge,0.005633
AveRooms,0.363187
AveBedrms,-1.348481
Population,-1.1e-05
AveOccup,-0.003401
Latitude,-0.742928
Longitude,-0.738761


In [9]:
##predict
y_predict = lin_model.predict(test_X)

In [10]:
##perfromance evaluation using   R  square, MSE, MAE, RMSE
# r_square = r2_score(test_y, y_predict)

orig_mae = mean_absolute_error(test_y, y_predict)
orig_mse = mean_squared_error(test_y, y_predict)
# orig_rmse_val = rmse(test_y, y_predict)
orig_r2 = r2_score(test_y, y_predict)
print("MAE: %.3f"%orig_mae)
print("MSE:  %.3f"%orig_mse)
# print("RMSE:  %.3f"%orig_rmse_val)
print("R2:  %.3f"%orig_r2)

MAE: 0.684
MSE:  0.825
R2:  0.382


In [12]:
perf_frame = pd.DataFrame({'data':'original',
                   'imputation':'none',
                   'mae': orig_mae, 
                   'mse': orig_mse, 
                   'R2':orig_r2}, index=[0])
perf_frame

Unnamed: 0,data,imputation,mae,mse,R2
0,original,none,0.684403,0.824646,0.381605


In [95]:
type(cal)

pandas.core.frame.DataFrame

In [22]:
def LR_pipeline(df, data_description, imputation_type):
    
    #1 set the test and trainning set X and Y
    train_X = df.iloc[train_X_index, 1:]
    train_y = df["MedInc"].iloc[train_y_index]
    test_X = df.iloc[test_X_index, 1:].dropna()
    test_y = df["MedInc"].iloc[test_y_index]
    #2  create and fit the model using LR
    lin_model = LinearRegression()
    lin_model.fit(train_X, train_y)
    coef_df = pd.DataFrame(lin_model.coef_, cols, columns = ["coef"])
    print(coef_df)
    #3  predict using 
    y_predict = lin_model.predict(test_X)
    # 4 evaluation and performance
    orig_mae = mean_absolute_error(test_y, y_predict)
    orig_mse = mean_squared_error(test_y, y_predict)
    perf_frame = pd.DataFrame({'data':data_description,
                   'imputation_type':imputation_type,
                   'mae': orig_mae, 
                   'mse': orig_mse, 
                   'R2':orig_r2}, index=[0])
    return perf_frame
    

In [24]:
# CONFIRM IF BASELINE WORK WITH THE LR_PIPELINE
LR_pipeline(cal,"original", "None")

                coef
HouseAge    0.005633
AveRooms    0.363187
AveBedrms  -1.348481
Population -0.000011
AveOccup   -0.003401
Latitude   -0.742928
Longitude  -0.738761


Unnamed: 0,data,imputation_type,mae,mse,R2
0,original,,0.684403,0.824646,0.381605


## Missing Completely at Random

### mcar .01

In [26]:
california = datasets.fetch_california_housing()
cal = pd.DataFrame(california.data)
cal.columns = california.feature_names
cal['MedInc'] = california.target
cal.head()
mcardf = cal
nidx = round(.01 * len(cal))
dropidxs = np.random.choice(len(cal), nidx, replace=False)
mcardf.loc[dropidxs, 'AveRooms'] = np.nan
print(mcardf.isna().sum() / len(cal))

MedInc        0.000000
HouseAge      0.000000
AveRooms      0.009981
AveBedrms     0.000000
Population    0.000000
AveOccup      0.000000
Latitude      0.000000
Longitude     0.000000
dtype: float64


In [27]:
mcardf['AveRooms'] = mcardf['AveRooms'].fillna(mcardf['AveRooms'].mean())
print(mcardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [29]:
# Create Lin Reg Model Here
LR_pipeline(mcardf,"1% imputed", "MCAR")

                coef
HouseAge    0.005422
AveRooms    0.360783
AveBedrms  -1.335048
Population -0.000012
AveOccup   -0.002407
Latitude   -0.741454
Longitude  -0.737433


Unnamed: 0,data,imputation_type,mae,mse,R2
0,1% imputed,MCAR,0.686008,0.827269,0.381605


### mcar .05

In [33]:
california = datasets.fetch_california_housing()
cal = pd.DataFrame(california.data)
cal.columns = california.feature_names
cal['MedInc'] = california.target
cal.head()
mcardf = cal
nidx = round(.05 * len(cal))
dropidxs = np.random.choice(len(cal), nidx, replace=False)
mcardf.loc[dropidxs, 'AveRooms'] = np.nan
print(mcardf.isna().sum() / len(cal))

MedInc        0.00
HouseAge      0.00
AveRooms      0.05
AveBedrms     0.00
Population    0.00
AveOccup      0.00
Latitude      0.00
Longitude     0.00
dtype: float64


In [34]:
mcardf['AveRooms'] = mcardf['AveRooms'].fillna(mcardf['AveRooms'].mean())
print(mcardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [35]:
# Create Lin Reg Model Here
LR_pipeline(mcardf,"5% imputed", "MCAR")

                coef
HouseAge    0.004317
AveRooms    0.308641
AveBedrms  -1.012983
Population -0.000017
AveOccup   -0.002351
Latitude   -0.741529
Longitude  -0.740662


Unnamed: 0,data,imputation_type,mae,mse,R2
0,5% imputed,MCAR,0.6945,0.85094,0.381605


### mcar .1

In [36]:
california = datasets.fetch_california_housing()
cal = pd.DataFrame(california.data)
cal.columns = california.feature_names
cal['MedInc'] = california.target
cal.head()
mcardf = cal
nidx = round(.1 * len(cal))
dropidxs = np.random.choice(len(cal), nidx, replace=False)
mcardf.loc[dropidxs, 'AveRooms'] = np.nan
print(mcardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.1
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [37]:
mcardf['AveRooms'] = mcardf['AveRooms'].fillna(mcardf['AveRooms'].mean())
print(mcardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [38]:
# Create Lin Reg Model Here
LR_pipeline(mcardf,"10% imputed", "MCAR")

                coef
HouseAge    0.003570
AveRooms    0.294629
AveBedrms  -0.961205
Population -0.000020
AveOccup   -0.003105
Latitude   -0.736278
Longitude  -0.738292


Unnamed: 0,data,imputation_type,mae,mse,R2
0,10% imputed,MCAR,0.70119,0.890913,0.381605


### mcar .2

In [39]:
california = datasets.fetch_california_housing()
cal = pd.DataFrame(california.data)
cal.columns = california.feature_names
cal['MedInc'] = california.target
cal.head()
mcardf = cal
nidx = round(.2 * len(cal))
dropidxs = np.random.choice(len(cal), nidx, replace=False)
mcardf.loc[dropidxs, 'AveRooms'] = np.nan
print(mcardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.2
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [40]:
mcardf['AveRooms'] = mcardf['AveRooms'].fillna(mcardf['AveRooms'].mean())
print(mcardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [41]:
# Create Lin Reg Model Here
LR_pipeline(mcardf,"20% imputed", "MCAR")

                coef
HouseAge    0.002520
AveRooms    0.281625
AveBedrms  -0.912744
Population -0.000028
AveOccup   -0.002972
Latitude   -0.728545
Longitude  -0.730794


Unnamed: 0,data,imputation_type,mae,mse,R2
0,20% imputed,MCAR,0.714607,0.908823,0.381605


### mcar .33

In [42]:
california = datasets.fetch_california_housing()
cal = pd.DataFrame(california.data)
cal.columns = california.feature_names
cal['MedInc'] = california.target
cal.head()
mcardf = cal
nidx = round(.33 * len(cal))
dropidxs = np.random.choice(len(cal), nidx, replace=False)
mcardf.loc[dropidxs, 'AveRooms'] = np.nan
print(mcardf.isna().sum() / len(cal))

MedInc        0.00000
HouseAge      0.00000
AveRooms      0.32999
AveBedrms     0.00000
Population    0.00000
AveOccup      0.00000
Latitude      0.00000
Longitude     0.00000
dtype: float64


In [43]:
mcardf['AveRooms'] = mcardf['AveRooms'].fillna(mcardf['AveRooms'].mean())
print(mcardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [44]:
# Create Lin Reg Model Here
LR_pipeline(mcardf,"33% imputed", "MCAR")

                coef
HouseAge    0.001383
AveRooms    0.202255
AveBedrms  -0.187475
Population -0.000029
AveOccup   -0.002703
Latitude   -0.752870
Longitude  -0.761748


Unnamed: 0,data,imputation_type,mae,mse,R2
0,33% imputed,MCAR,0.737726,0.974346,0.381605


### mcar .5

In [45]:
california = datasets.fetch_california_housing()
cal = pd.DataFrame(california.data)
cal.columns = california.feature_names
cal['MedInc'] = california.target
cal.head()
mcardf = cal
nidx = round(.5 * len(cal))
dropidxs = np.random.choice(len(cal), nidx, replace=False)
mcardf.loc[dropidxs, 'AveRooms'] = np.nan
print(mcardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.5
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [46]:
mcardf['AveRooms'] = mcardf['AveRooms'].fillna(mcardf['AveRooms'].mean())
print(mcardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [47]:
# Create Lin Reg Model Here
LR_pipeline(mcardf,"50% imputed", "MCAR")

                coef
HouseAge    0.000524
AveRooms    0.192611
AveBedrms  -0.095191
Population -0.000032
AveOccup   -0.002137
Latitude   -0.744264
Longitude  -0.752982


Unnamed: 0,data,imputation_type,mae,mse,R2
0,50% imputed,MCAR,0.748826,1.000843,0.381605


## Missing at Random

### MAR .1

In [48]:
california = datasets.fetch_california_housing()
cal = pd.DataFrame(california.data)
cal.columns = california.feature_names
cal['MedInc'] = california.target
cal.head()
mardf = cal
idxs = mardf[mardf['AveOccup'] > mardf['AveOccup'].quantile(.5)]
dropidxs = idxs.sample(frac = .2, replace = False)
mardf.loc[dropidxs.index, 'AveRooms'] = np.nan
print(mardf.isna().sum() / len(mardf))

MedInc        0.0
HouseAge      0.0
AveRooms      0.1
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [49]:
mardf['AveRooms'] = mardf['AveRooms'].fillna(mardf['AveRooms'].mean())
print(mardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [53]:
# Create Lin Reg Model Here
LR_pipeline(mardf,"10% imputed", "MAR")

                coef
HouseAge    0.002166
AveRooms    0.321427
AveBedrms  -1.157609
Population -0.000024
AveOccup   -0.003035
Latitude   -0.741311
Longitude  -0.745588


Unnamed: 0,data,imputation_type,mae,mse,R2
0,10% imputed,MAR,0.715899,0.892083,0.381605


### MAR .2

In [54]:
california = datasets.fetch_california_housing()
cal = pd.DataFrame(california.data)
cal.columns = california.feature_names
cal['MedInc'] = california.target
cal.head()
mardf = cal
idxs = mardf[mardf['AveOccup'] > mardf['AveOccup'].quantile(.5)]
dropidxs = idxs.sample(frac = .4, replace = False)
mardf.loc[dropidxs.index, 'AveRooms'] = np.nan
print(mardf.isna().sum() / len(mardf))

MedInc        0.0
HouseAge      0.0
AveRooms      0.2
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [55]:
mardf['AveRooms'] = mardf['AveRooms'].fillna(mardf['AveRooms'].mean())
print(mardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [56]:
# Create Lin Reg Model Here
LR_pipeline(mardf,"20% imputed", "MAR")

                coef
HouseAge    0.002037
AveRooms    0.309088
AveBedrms  -1.099007
Population -0.000027
AveOccup   -0.003050
Latitude   -0.738880
Longitude  -0.741873


Unnamed: 0,data,imputation_type,mae,mse,R2
0,20% imputed,MAR,0.719138,0.891978,0.381605


### MAR .3

In [66]:
california = datasets.fetch_california_housing()
cal = pd.DataFrame(california.data)
cal.columns = california.feature_names
cal['MedInc'] = california.target
cal.head()
mardf = cal
idxs = mardf[mardf['AveOccup'] > mardf['AveOccup'].quantile(.5)]
dropidxs = idxs.sample(frac = .6, replace = False)
mardf.loc[dropidxs.index, 'AveRooms'] = np.nan
print(mardf.isna().sum() / len(mardf))

MedInc        0.0
HouseAge      0.0
AveRooms      0.3
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [67]:
mardf['AveRooms'] = mardf['AveRooms'].fillna(mardf['AveRooms'].mean())
print(mardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [68]:
# Create Lin Reg Model Here
LR_pipeline(mardf,"30% imputed", "MAR")

                coef
HouseAge    0.000745
AveRooms    0.286899
AveBedrms  -1.010825
Population -0.000029
AveOccup   -0.002379
Latitude   -0.740167
Longitude  -0.748000


Unnamed: 0,data,imputation_type,mae,mse,R2
0,30% imputed,MAR,0.732378,0.921009,0.381605


## Missing "Not at" Random

### MNAR .25

In [69]:
california = datasets.fetch_california_housing()
cal = pd.DataFrame(california.data)
cal.columns = california.feature_names
cal['MedInc'] = california.target
cal.head()
mnardf = cal
idxs = (mnardf['AveRooms'] > mnardf['AveRooms'].quantile(.75))
mnardf.loc[idxs, 'AveRooms'] = np.nan
print(mnardf.isna().sum() / len(mnardf))

MedInc        0.000000
HouseAge      0.000000
AveRooms      0.249952
AveBedrms     0.000000
Population    0.000000
AveOccup      0.000000
Latitude      0.000000
Longitude     0.000000
dtype: float64


In [70]:
mnardf['AveRooms'] = mnardf['AveRooms'].fillna(mnardf['AveRooms'].mean())
print(mnardf.isna().sum() / len(cal))

MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64


In [71]:
# Create Lin Reg Model Here
LR_pipeline(mardf,"25% imputed", "MNAR")

                coef
HouseAge    0.000745
AveRooms    0.286899
AveBedrms  -1.010825
Population -0.000029
AveOccup   -0.002379
Latitude   -0.740167
Longitude  -0.748000


Unnamed: 0,data,imputation_type,mae,mse,R2
0,25% imputed,MNAR,0.732378,0.921009,0.381605


## Conclusion

In [None]:
##put together all teh DF