In [1]:
# Load the Pima Indians diabetes data from the data folder into the DataFrame `data`.
# The file is read with header=None, so its first line is treated as data, not labels.
import pandas as pd

data = pd.read_csv("../../data/pima-indians-diabetes.txt", header=None)

# Attach descriptive column names; the last column ("class") is the target label.
data.columns = [
    "pregnant",
    "plasma_glucose",
    "blood_pressure",
    "triceps",
    "insulin",
    "bmi",
    "diabetes_pedigree",
    "age",
    "class",
]


In [None]:
# Pairwise correlations between all columns (features and the class label),
# rendered as an annotated heatmap so each cell shows its coefficient.
import seaborn as sns

corr_matrix = data.corr()
sns.heatmap(data=corr_matrix, annot=True)

In [None]:
# Scatter plot with "plasma_glucose" on the x-axis and "blood_pressure" on the y-axis
sns.pairplot(
    data=data,
    x_vars=["plasma_glucose"],
    y_vars=["blood_pressure"],
)

In [5]:
# Separate the target ("class") from the predictors, then hold out 30% of the
# rows as a test set. The fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

y = data["class"]
X = data.drop(columns="class")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Show how many training rows fall into each class (0 and 1)
class_counts = y_train.value_counts()
print(class_counts)

In [7]:
# Oversample class 1 in the training data so that both classes are equally represented.
#
# Resampling X_train / y_train as a whole only bootstraps the full training set and
# leaves the class ratio essentially unchanged. Instead, only the class-1 rows are
# drawn with replacement, up to the number of class-0 rows, and then combined with
# the untouched class-0 rows. The test set is never resampled.
from sklearn.utils import resample

is_positive = y_train == 1
X_positive, y_positive = X_train[is_positive], y_train[is_positive]
X_negative, y_negative = X_train[~is_positive], y_train[~is_positive]

# Draw class-1 rows with replacement until they match the class-0 count.
# X and y are resampled together so features and labels stay aligned.
X_positive_upsampled, y_positive_upsampled = resample(
    X_positive,
    y_positive,
    replace=True,
    n_samples=len(y_negative),
    random_state=123,
)

# Balanced training set: all original class-0 rows plus the upsampled class-1 rows.
X_resampled = pd.concat([X_negative, X_positive_upsampled])
y_resampled = pd.concat([y_negative, y_positive_upsampled])

In [None]:
# Train an XGBoost classifier (scikit-learn style API) on the resampled training data:
# 20 boosting rounds (n_estimators), learning rate 0.1, trees of depth at most 3.
from xgboost import XGBClassifier

model = XGBClassifier(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=20,
    verbosity=0,  # quiet training; replaces the deprecated `silent=True` flag
    objective='binary:logistic',
)
model.fit(X_resampled, y_resampled)

In [None]:
# Print each feature name next to the importance the fitted model assigned to it
for feature_name, importance in zip(X.columns, model.feature_importances_):
    print(feature_name, importance)

In [None]:
# Evaluate the fitted model on the held-out test set:
# accuracy via the estimator's own score(), and recall from its predictions.
from sklearn.metrics import recall_score

accuracy = model.score(X_test, y_test)
recall = recall_score(y_test, model.predict(X_test))

# One value per line: accuracy first, then recall
print(accuracy, recall, sep="\n")