# Missing Values and Imputations

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Print NumPy floats with three decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)

In [None]:
# Read the diabetes data set and list each column's dtype and non-null count.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pregnancies  768 non-null    int64  
 1   glucose      768 non-null    int64  
 2   diastolic    768 non-null    int64  
 3   triceps      768 non-null    int64  
 4   insulin      768 non-null    int64  
 5   bmi          768 non-null    float64
 6   dpf          768 non-null    float64
 7   age          768 non-null    int64  
 8   diabetes     768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Insulin, triceps and BMI values cannot be zero, so zeros in these columns are actually missing values. Let us replace those 0s with NaN for a correct analysis.

In [None]:
# Zeros in these measurement columns stand for missing data, so turn them into NaN.
# Assign the result back through `df[...]`: calling `replace(..., inplace=True)` on
# `df.<column>` acts on an intermediate Series and, with pandas Copy-on-Write,
# no longer updates `df` itself.
zero_as_missing = ["insulin", "triceps", "bmi"]
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pregnancies  768 non-null    int64  
 1   glucose      768 non-null    int64  
 2   diastolic    768 non-null    int64  
 3   triceps      541 non-null    float64
 4   insulin      394 non-null    float64
 5   bmi          757 non-null    float64
 6   dpf          768 non-null    float64
 7   age          768 non-null    int64  
 8   diabetes     768 non-null    int64  
dtypes: float64(4), int64(5)
memory usage: 54.1 KB


# Dropping NA - Easy way but not recommended.

In [None]:
# Keep only the rows that have no missing value in any column.
df1 = df.dropna(axis=0, how="any")
df1.shape

(393, 9)

# Imputing Missing Data (here by mean)

In [None]:
# Separate the target column from the predictor columns.
y = df[['diabetes']]
X = df.drop(columns='diabetes')

X.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age
0,6,148,72,35.0,,33.6,0.627,50
1,1,85,66,29.0,,26.6,0.351,31
2,8,183,64,,,23.3,0.672,32
3,1,89,66,23.0,94.0,28.1,0.167,21
4,0,137,40,35.0,168.0,43.1,2.288,33


In [None]:
from sklearn.impute import SimpleImputer

# Fill every NaN with the mean of its column; use strategy='median' for the median instead.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
X = imp.fit_transform(X)

print(X[:5])

# Single-column alternative (it has to be repeated for every column):
# X["triceps"] = df["triceps"].fillna(df["triceps"].mean())

# Categorical columns can be filled with the mode or with a placeholder such as "Unknown".

[[  6.    148.     72.     35.    155.548  33.6     0.627  50.   ]
 [  1.     85.     66.     29.    155.548  26.6     0.351  31.   ]
 [  8.    183.     64.     29.153 155.548  23.3     0.672  32.   ]
 [  1.     89.     66.     23.     94.     28.1     0.167  21.   ]
 [  0.    137.     40.     35.    168.     43.1     2.288  33.   ]]


In [None]:
# Wrap the imputed array in a DataFrame again, labelled with every column name of `df` except the last.
X = pd.DataFrame(data=X, columns=df.columns[:-1])
X.head(n=5)

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31.0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0


# Imputing and Building Basic Logistic Regression Model with Pipeline 



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Start again from `df`, which still holds NaN for the missing measurements.
# The `X` left over from the previous cells was already mean-imputed on the full
# data set, which would make the imputation step below a no-op and let test rows
# influence the fill values.
X = df.drop(columns='diabetes')
y = df['diabetes']  # 1-D target; a single-column frame triggers a DataConversionWarning on fit

imp = SimpleImputer(missing_values=np.nan, strategy='mean')

logreg = LogisticRegression()

steps = [('imputation', imp), ('logistic_regression', logreg)]

pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Fit, predict and score on the same container type (DataFrames) so that the
# feature names seen at fit time match those seen at predict time.
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
print("Accuracy Score:", pipeline.score(X_test, y_test))

Accuracy Score: 0.7402597402597403


Missing values can be handled in a number of ways other than imputing the mean or median. To fill them in well, model the relationship between each variable that has missing values and the other variables, and use that relationship to estimate the missing entries.

# Categorical Variables

In [None]:
# Read the Auto data set and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="Auto.csv")
df.head(n=5)

Unnamed: 0,mpg,displ,hp,weight,accel,origin,size
0,18.0,250.0,88,3139,14.5,US,15.0
1,9.0,304.0,193,4732,18.5,US,20.0
2,36.1,91.0,60,1800,16.4,Asia,10.0
3,18.5,250.0,98,3525,19.0,US,15.0
4,34.3,97.0,78,2188,15.8,Europe,10.0


# Creating Dummy Variables for Categorical Column-Origin: One-Hot Encoding

In [None]:
# One-hot encode the categorical column(s); numeric columns pass through unchanged.
df_origin = pd.get_dummies(data=df)
print(df_origin)

      mpg  displ   hp  weight  ...  size  origin_Asia  origin_Europe  origin_US
0    18.0  250.0   88    3139  ...  15.0            0              0          1
1     9.0  304.0  193    4732  ...  20.0            0              0          1
2    36.1   91.0   60    1800  ...  10.0            1              0          0
3    18.5  250.0   98    3525  ...  15.0            0              0          1
4    34.3   97.0   78    2188  ...  10.0            0              1          0
..    ...    ...  ...     ...  ...   ...          ...            ...        ...
387  18.0  250.0   88    3021  ...  15.0            0              0          1
388  27.0  151.0   90    2950  ...  10.0            0              0          1
389  29.5   98.0   68    2135  ...  10.0            1              0          0
390  17.5  250.0  110    3520  ...  15.0            0              0          1
391  25.1  140.0   88    2720  ...  10.0            0              0          1

[392 rows x 9 columns]


In [None]:
# Drop one dummy level: it is implied whenever the remaining origin dummies are all 0.
df_origin = df_origin.drop(columns='origin_Asia')
df_origin.head(n=5)

Unnamed: 0,mpg,displ,hp,weight,accel,size,origin_Europe,origin_US
0,18.0,250.0,88,3139,14.5,15.0,0,1
1,9.0,304.0,193,4732,18.5,20.0,0,1
2,36.1,91.0,60,1800,16.4,10.0,0,0
3,18.5,250.0,98,3525,19.0,15.0,0,1
4,34.3,97.0,78,2188,15.8,10.0,1,0


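One can also use label (integer) encoding in place of one-hot encoding, but one-hot encoding is recommended for nominal categories such as `origin`, because integer labels impose an artificial ordering on the categories.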

In [None]:
# Building a Basic Ridge Regression model.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X = df_origin.drop('mpg', axis=1)
y = df_origin[['mpg']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# `Ridge(normalize=True)` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so the scaling now happens in an explicit pipeline step.
# The old option divided each feature by its L2 norm, whereas StandardScaler divides
# by the standard deviation; multiplying alpha by the number of training samples
# compensates for that difference (per the scikit-learn deprecation guidance).
ridge = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=0.5 * len(X_train)),
).fit(X_train, y_train)

ridge.score(X_test, y_test)

0.7190645190217895

# Standardizing Data

In [None]:
# Read the white-wine data set and show summary statistics for every column.
df = pd.read_csv(filepath_or_buffer="white-wine.csv")
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [None]:
# Feature matrix: every column except the target, as a NumPy array.
X = df.drop('quality', axis=1).values
# 1-D target vector. Selecting with a list (`df[['quality']]`) would give an (n, 1)
# column vector, which makes scikit-learn classifiers emit a DataConversionWarning on fit.
y = df['quality'].values

In [None]:
from sklearn.preprocessing import scale

# Standardise column-wise (axis=0), i.e. each feature separately.
x_scaled = scale(X, axis=0)

In [None]:
# Overall mean and standard deviation across all entries, before and after scaling.
for matrix in (X, x_scaled):
    print(np.mean(matrix), np.std(matrix))

18.432687072460002 41.54494764094571
2.7314972981668206e-15 0.9999999999999999


# Standardizing and Building Basic KNN Model with Pipeline

In [None]:
# Model with Scaled Input

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Chain standardisation and KNN so that the scaler is fitted on the training split only.
steps = [
    ('scalar', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
pipeline = Pipeline(steps)

# `fit` returns the fitted pipeline itself, so both names refer to the same object.
knn_scaled = pipeline.fit(X_train, y_train)

y_pred = knn_scaled.predict(X_test)

accuracy_score(y_test, y_pred)


0.5496598639455782

In [None]:
# Model without Scaled Input

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Same split parameters as the scaled model, so the two scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# `fit` returns the estimator, so construction and training can be chained.
knn = KNeighborsClassifier().fit(X_train, y_train)

y_pred = knn.predict(X_test)

accuracy_score(y_test, y_pred)

0.47551020408163264

So the accuracy score with standardized input is better than the score with non-standardized input.<br>
<b>When should one use Standardization?</b>
<ul type="disc">
  <li>Standardization assumes that your data has a Gaussian distribution. This does not strictly have to be true, but the technique is more effective if your attribute distribution is Gaussian.</li>
  <li>Also when the variables of the dataset are of different scale and you want to bring them on a common scale for better understanding their impact on the target variable.</li>
  <li>Standardization is better to use when the algorithm or ML model makes assumption that the dataset is Gaussian. Algorithms like Linear Regression, Logistic Regression, LDA, etc make such assumption. </li>
</ul>