# Preprocessing and pipelining

## Preprocessing data

**Encodding dumy variables**

In [9]:
import numpy as np

In [21]:
import pandas as pd

df = pd.read_csv('auto.csv')
df_origin = pd.get_dummies(df)
print(df_origin.head())

    mpg  displ   hp  weight  accel  size  origin_Asia  origin_Europe  \
0  18.0  250.0   88    3139   14.5  15.0            0              0   
1   9.0  304.0  193    4732   18.5  20.0            0              0   
2  36.1   91.0   60    1800   16.4  10.0            1              0   
3  18.5  250.0   98    3525   19.0  15.0            0              0   
4  34.3   97.0   78    2188   15.8  10.0            0              1   

   origin_US  
0          1  
1          1  
2          0  
3          1  
4          0  


In [22]:
df_origin =df_origin.drop('origin_Asia',axis=1)
df_origin.head()

Unnamed: 0,mpg,displ,hp,weight,accel,size,origin_Europe,origin_US
0,18.0,250.0,88,3139,14.5,15.0,0,1
1,9.0,304.0,193,4732,18.5,20.0,0,1
2,36.1,91.0,60,1800,16.4,10.0,0,0
3,18.5,250.0,98,3525,19.0,15.0,0,1
4,34.3,97.0,78,2188,15.8,10.0,1,0


**Linear regression with dummy variables**

Assuming we are predicting miles per gallon

In [23]:
df_origin.shape

(392, 8)

In [24]:
X=df_origin.drop('mpg',axis=1).values.reshape(-1,7)
y=df_origin['mpg'].values.reshape(-1,1)

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.3, random_state=42)
ridge = Ridge(alpha=0.5, normalize=True).fit(X_train,
y_train)
ridge.score(X_test, y_test)

0.7190645190217895

## Handling missing data

In [26]:
df= pd.read_csv('diabetes.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pregnancies  768 non-null    int64  
 1   glucose      768 non-null    int64  
 2   diastolic    768 non-null    int64  
 3   triceps      768 non-null    int64  
 4   insulin      768 non-null    int64  
 5   bmi          768 non-null    float64
 6   dpf          768 non-null    float64
 7   age          768 non-null    int64  
 8   diabetes     768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


sometimes misssing data can appear as zeros in a dataset.

In [27]:
df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


you can see first 3 lines of insulin is zero

### Dropping missing data

First make the zeros nan

In [28]:
df.insulin.replace(0,np.nan, inplace=True)
df.triceps.replace(0,np.nan,inplace=True)
df.bmi.replace(0,np.nan,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pregnancies  768 non-null    int64  
 1   glucose      768 non-null    int64  
 2   diastolic    768 non-null    int64  
 3   triceps      541 non-null    float64
 4   insulin      394 non-null    float64
 5   bmi          757 non-null    float64
 6   dpf          768 non-null    float64
 7   age          768 non-null    int64  
 8   diabetes     768 non-null    int64  
dtypes: float64(4), int64(5)
memory usage: 54.1 KB


As seen, the missing values shows

In [12]:
#df=df.dropna()
#df.shape # (393,9)

(393, 9)

We have lost have of our data, this is a bad idea.

### Imputing missing data

Usng diabetes dataset

In [29]:
df.columns

Index(['pregnancies', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi',
       'dpf', 'age', 'diabetes'],
      dtype='object')

In [31]:
df['diabetes'].unique()

array([1, 0], dtype=int64)

In [36]:
X=df.drop('diabetes',axis=1).values.reshape(-1,8)
y=df['diabetes'].values.reshape(-1,)

In [16]:
from sklearn.impute import  SimpleImputer

imp = SimpleImputer(missing_values=np.nan,strategy='mean')
imp.fit(X)
X=imp.transform(X)

### imputing within a pipeline

Imputing missing data and forming a model

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import  SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

logreg = LogisticRegression(max_iter=1000)
steps = [('imputation',imp),('logistic_regression', logreg)]

pipeline = Pipeline(steps)
X_train, X_test, y_train,y_test = train_test_split(X, y,
test_size=0.3, random_state=42)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
pipeline.score(X_test, y_test)


0.7229437229437229

## Centering and scaling

In [44]:
# winequality-red.csv # white-wine.csv
df=pd.read_csv('white-wine.csv')

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [46]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [51]:
df.nunique()

fixed acidity            68
volatile acidity        125
citric acid              87
residual sugar          310
chlorides               160
free sulfur dioxide     132
total sulfur dioxide    251
density                 890
pH                      103
sulphates                79
alcohol                 103
quality                   7
dtype: int64

We are predicting quality

In [47]:
from sklearn.preprocessing import scale
X_scaled = scale(df)

In [50]:
np.mean(df),np.std(df)

(fixed acidity             6.854788
 volatile acidity          0.278241
 citric acid               0.334192
 residual sugar            6.391415
 chlorides                 0.045772
 free sulfur dioxide      35.308085
 total sulfur dioxide    138.360657
 density                   0.994027
 pH                        3.188267
 sulphates                 0.489847
 alcohol                  10.514267
 quality                   5.877909
 dtype: float64,
 fixed acidity            0.843782
 volatile acidity         0.100784
 citric acid              0.121007
 residual sugar           5.071540
 chlorides                0.021846
 free sulfur dioxide     17.005401
 total sulfur dioxide    42.493726
 density                  0.002991
 pH                       0.150985
 sulphates                0.114114
 alcohol                  1.230495
 quality                  0.885548
 dtype: float64)

In [49]:
np.mean(X_scaled), np.std(X_scaled)

(2.5192255462271553e-15, 1.0)

In [52]:
df.shape

(4898, 12)

In [55]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

X= df.drop('quality',axis=1).values.reshape(-1,11)
y=df['quality'].values.reshape(-1,)

In [56]:
from sklearn.preprocessing import StandardScaler

steps = [('scaler', StandardScaler()),
('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=21)
knn_scaled = pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.5642857142857143

In [59]:
knn_unscaled = KNeighborsClassifier().fit(X_train, y_train)
knn_unscaled.score(X_test, y_test)

0.47959183673469385

As seen above, unscaled is lower  than scaled

### CV and scaling in a pipeline

In [61]:
steps = [('scaler', StandardScaler()),
('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)
parameters = {knn__n_neighbors: np.arange(1, 50)}
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=21)
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)


NameError: name 'knn__n_neighbors' is not defined

In [None]:
print(cv.best_params_)

In [None]:
print(cv.score(X_test, y_test))

In [None]:
print(classification_report(y_test, y_pred))