
# Feature engineering
#### Outlier Analysis

In [63]:
import pandas as pd

# Load the interim diabetes dataset from disk and preview its first five rows.
total_data = pd.read_csv(filepath_or_buffer = "../data/interim/diabetes_data.csv")
total_data.head(n = 5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,33.6,0.627,50,1
1,1,85,66,26.6,0.351,31,0
2,8,183,64,23.3,0.672,32,1
3,1,89,66,28.1,0.167,21,0
4,0,137,40,43.1,2.288,33,1


As we saw in the previous step, the variables Glucose, BloodPressure and BMI seem to have zero-valued outliers. Let's analyse them.

In [64]:
# For each suspicious column, print the shape of the subset of rows whose
# value is exactly zero (the first element of the tuple is the row count).
for column in ("Glucose", "BloodPressure", "BMI"):
    zero_rows = total_data.loc[total_data[column] == 0]
    print(zero_rows.shape)

(5, 7)
(35, 7)
(11, 7)


Overall, there don't seem to be too many of them, so we can probably remove those rows without affecting the predictions too much.

In [65]:
# Keep only the rows in which Glucose, BloodPressure and BMI are all non-zero.
# A single combined mask selects the same rows as filtering column by column.
nonzero_mask = (total_data[["Glucose", "BloodPressure", "BMI"]] != 0).all(axis = 1)
total_data = total_data[nonzero_mask]
total_data.shape

(724, 7)

Let's take a general look at the dataset's summary statistics:

In [66]:
# Summary statistics of the cleaned dataset. Only the last expression of a cell
# is rendered, so the summary is the sole statement here.
total_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome
count,724.0,724.0,724.0,724.0,724.0,724.0,724.0
mean,3.866022,121.882597,72.400552,32.467127,0.474765,33.350829,0.343923
std,3.362803,30.75003,12.37987,6.888941,0.332315,11.765393,0.475344
min,0.0,44.0,24.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,27.5,0.245,24.0,0.0
50%,3.0,117.0,72.0,32.4,0.379,29.0,0.0
75%,6.0,142.0,80.0,36.6,0.6275,41.0,1.0
max,17.0,199.0,122.0,67.1,2.42,81.0,1.0


Looks solid overall, so let's proceed with the next step.

#### Feature scaling
Since we are not going to train a linear model, we don't need to scale the features.

### Feature Selection
Let's split the dataset into training and test sets and then select the most relevant features, fitting the selector on the training set only.

In [67]:
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.model_selection import train_test_split

# Separate the predictors from the target, then hold out 20% of the rows as a
# test set (fixed seed so the split is reproducible).
X = total_data.drop(columns = "Outcome")
y = total_data["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Score the predictors with the chi-squared test. The selector is fitted on the
# training split only and then applied to both splits.
# NOTE(review): k = 6 appears to equal the number of predictor columns, so this
# step keeps every feature -- confirm whether a smaller k was intended.
selection_model = SelectKBest(chi2, k = 6)
selection_model.fit(X_train, y_train)
ix = selection_model.get_support()

# transform() output is wrapped in a new DataFrame (fresh default index),
# labelled with the names of the columns the selector kept.
selected_columns = X_train.columns.values[ix]
X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns = selected_columns)
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns = selected_columns)

X_train_sel.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age
0,0.0,137.0,84.0,27.3,0.231,59.0
1,3.0,78.0,50.0,31.0,0.248,26.0
2,1.0,82.0,64.0,21.2,0.415,23.0
3,13.0,104.0,72.0,31.2,0.465,38.0
4,1.0,97.0,70.0,18.2,0.147,21.0


In [68]:
# Re-attach the target by position. The selected-feature frames were rebuilt
# with a fresh default index while y_train / y_test keep the original row
# labels, so the values are passed as plain arrays to avoid label alignment.
X_train_sel["Outcome"] = y_train.to_numpy()
X_test_sel["Outcome"] = y_test.to_numpy()

# Persist both splits without writing the index as a column.
for frame, destination in (
    (X_train_sel, "../data/processed/clean_train.csv"),
    (X_test_sel, "../data/processed/clean_test.csv"),
):
    frame.to_csv(destination, index = False)