### Importing Libraries for data exploration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

### Importing dataset

In [3]:
data = pd.read_csv('COVID19MEXICO2021.csv')

### Data exploration

In [4]:
data.head()

Unnamed: 0,FECHA_ACTUALIZACION,ID_REGISTRO,ORIGEN,SECTOR,ENTIDAD_UM,SEXO,ENTIDAD_NAC,ENTIDAD_RES,MUNICIPIO_RES,TIPO_PACIENTE,...,OTRO_CASO,TOMA_MUESTRA_LAB,RESULTADO_LAB,TOMA_MUESTRA_ANTIGENO,RESULTADO_ANTIGENO,CLASIFICACION_FINAL,MIGRANTE,PAIS_NACIONALIDAD,PAIS_ORIGEN,UCI
0,2022-08-09,z53cb3,2,12,9,2,9,9,2,1,...,1,2,97,1,2,7,99,México,97,97
1,2022-08-09,zze974,1,6,24,1,24,24,35,1,...,1,1,2,2,97,7,99,México,97,97
2,2022-08-09,zz7202,1,12,16,2,16,16,112,1,...,1,1,2,2,97,7,99,México,97,97
3,2022-08-09,z405fd,1,12,9,2,9,9,11,1,...,1,2,97,1,2,7,99,México,97,97
4,2022-08-09,z26b82,2,12,9,1,9,9,7,1,...,2,2,97,1,2,7,99,México,97,97


### Amount of rows and columns

In [5]:
print(data.shape)
print(f'The data set has {data.shape[0]} rows and {data.shape[1]} columns')

(8830345, 40)
The data set has 8830345 rows and 40 columns


### Checking for null values and duplication

In [6]:
# Checking for null vales
data.isnull().sum().sum()

0

> The data set does not have null values

In [7]:
# Checking for data duplication
data.duplicated().sum()

0

> No duplicates in the data set

### Basic Stats for data exploration

In [8]:
data.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
FECHA_ACTUALIZACION,8830345.0,1.0,2022-08-09,8830345.0,,,,,,,
ID_REGISTRO,8830345.0,8830345.0,z53cb3,1.0,,,,,,,
ORIGEN,8830345.0,,,,1.794278,0.404228,1.0,2.0,2.0,2.0,2.0
SECTOR,8830345.0,,,,8.813864,3.815127,1.0,4.0,12.0,12.0,99.0
ENTIDAD_UM,8830345.0,,,,14.031895,7.621102,1.0,9.0,10.0,19.0,32.0
SEXO,8830345.0,,,,1.469519,0.49907,1.0,1.0,1.0,2.0,2.0
ENTIDAD_NAC,8830345.0,,,,15.392812,10.58106,1.0,9.0,13.0,20.0,99.0
ENTIDAD_RES,8830345.0,,,,14.330934,7.550073,1.0,9.0,12.0,19.0,32.0
MUNICIPIO_RES,8830345.0,,,,37.082698,93.162449,1.0,7.0,14.0,38.0,999.0
TIPO_PACIENTE,8830345.0,,,,1.064117,0.244961,1.0,1.0,1.0,1.0,2.0


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8830345 entries, 0 to 8830344
Data columns (total 40 columns):
 #   Column                 Dtype 
---  ------                 ----- 
 0   FECHA_ACTUALIZACION    object
 1   ID_REGISTRO            object
 2   ORIGEN                 int64 
 3   SECTOR                 int64 
 4   ENTIDAD_UM             int64 
 5   SEXO                   int64 
 6   ENTIDAD_NAC            int64 
 7   ENTIDAD_RES            int64 
 8   MUNICIPIO_RES          int64 
 9   TIPO_PACIENTE          int64 
 10  FECHA_INGRESO          object
 11  FECHA_SINTOMAS         object
 12  FECHA_DEF              object
 13  INTUBADO               int64 
 14  NEUMONIA               int64 
 15  EDAD                   int64 
 16  NACIONALIDAD           int64 
 17  EMBARAZO               int64 
 18  HABLA_LENGUA_INDIG     int64 
 19  INDIGENA               int64 
 20  DIABETES               int64 
 21  EPOC                   int64 
 22  ASMA                   int64 
 23  INMUSUP

### Cheking for the count of unique values for every column

In [10]:
for col in data.describe(include='all').columns:
    print(data[col].value_counts())
    print()

FECHA_ACTUALIZACION
2022-08-09    8830345
Name: count, dtype: int64

ID_REGISTRO
z53cb3     1
bc4a73     1
bf0837     1
8b7aca     1
78fa41     1
          ..
89a3d3     1
86a72a     1
8993f7     1
622f05     1
m0bfb28    1
Name: count, Length: 8830345, dtype: int64

ORIGEN
2    7013750
1    1816595
Name: count, dtype: int64

SECTOR
12    5013213
4     3121009
9      359486
6      156188
3       73658
5       42312
8       29391
10      18561
11       7740
7        3693
13       2932
1        1114
2         926
99         86
14         36
Name: count, dtype: int64

ENTIDAD_UM
9     3463344
15     541089
19     355807
11     351549
27     345109
14     288576
24     261492
17     239606
16     193728
30     190640
2      187362
5      186062
21     177616
26     171738
28     167848
22     153962
3      146602
31     144373
25     133827
12     131038
23     120656
20     111852
8      110761
7       92455
1       91095
13      87532
10      78828
18      66570
6       64581
32      637

## Understanding variables

<img src="./img/tables.1-EDIT.jpg" style="height: 500px;, align: center;"/>

<img src="./img/tables.2-EDIT.jpg" style="height: 500px;, align: center;"/>

> For more info about the data set check https://serendipia.digital/covid19-mx/guia-para-entender-los-datos-de-covid-19-de-la-secretaria-de-salud-de-mexico/ <br>
> https://datos.gob.mx/busca/dataset/informacion-referente-a-casos-covid-19-en-mexico

### Can we predict the of a patient death by COVID-19 based on a patient's comorbidities?
- We will use only patients with a **positive** COVID-19 result. 
- Nevertheless, the dataset has other variables not explained in the document. Ex. **RESULT** became 2 new variables **RESULTADO_LAB** and **RESULTADO_ANTIGENO**.
- We will use the rows where the patient has a positive result for COVID-19. This values were added in a new column named **'CLASIFICACION_FINAL'** where rows with values from **1 - 3** refer to the  patiens that were positive for COVID

### Columns will be drop due that not add information to the problem we are investigating

In [11]:
df = data.drop(columns=['FECHA_ACTUALIZACION', 'ID_REGISTRO', 'ORIGEN','SECTOR', 'ENTIDAD_UM',
                        'ENTIDAD_NAC', 'ENTIDAD_RES', 'MUNICIPIO_RES', 'TIPO_PACIENTE',
                        'FECHA_INGRESO', 'FECHA_SINTOMAS', 'OTRA_COM', 'OTRO_CASO',
                        'HABLA_LENGUA_INDIG', 'INDIGENA', 'NACIONALIDAD', 'MIGRANTE',
                        'TOMA_MUESTRA_LAB', 'RESULTADO_LAB','TOMA_MUESTRA_ANTIGENO',
                        'RESULTADO_ANTIGENO', 'PAIS_NACIONALIDAD', 'PAIS_ORIGEN', 'INTUBADO', 'UCI'])

In [12]:
df.head()

Unnamed: 0,SEXO,FECHA_DEF,NEUMONIA,EDAD,EMBARAZO,DIABETES,EPOC,ASMA,INMUSUPR,HIPERTENSION,CARDIOVASCULAR,OBESIDAD,RENAL_CRONICA,TABAQUISMO,CLASIFICACION_FINAL
0,2,9999-99-99,2,26,97,2,2,2,2,2,2,2,2,2,7
1,1,9999-99-99,99,34,2,2,2,2,2,2,2,2,2,2,7
2,2,9999-99-99,2,41,97,2,2,2,2,2,2,2,2,1,7
3,2,9999-99-99,2,25,97,2,2,2,2,2,2,2,2,2,7
4,1,9999-99-99,2,20,2,2,2,2,2,2,2,2,2,2,7


### Selecting columns where the patient had positive result for COVID-19 

<img src="./img/columns_post_f.png" style="height: 500px;, align: center;"/>

In [13]:
df_covid_post = df[df.CLASIFICACION_FINAL < 4]

In [14]:
df_covid_post = df_covid_post.drop(columns=['tabaquismo', 'embarazo', 'asma'])

KeyError: "['tabaquismo', 'embarazo', 'asma'] not found in axis"

In [None]:
df_covid_post = df_covid_post.reset_index(drop=True)

In [None]:
df_covid_post.head()

In [None]:
df_covid_post.shape

In [None]:
df_covid_post.to_csv('covid_data_mex.csv', index=False)

### Data Cleaning

> We will change the value of numerical columns that refer to a category.<br>
> If the columns have values of 97, 98, 99, we will replace that value with NO.

In [None]:
df_covid_post['SEXO'] = df_covid_post.SEXO.apply(lambda x: 'female' if x == 1 else 'male')

In [None]:
cols = ['NEUMONIA', 'EMBARAZO',
       'DIABETES', 'EPOC', 'ASMA', 'INMUSUPR', 'HIPERTENSION',
       'CARDIOVASCULAR', 'OBESIDAD', 'RENAL_CRONICA', 'TABAQUISMO']

In [None]:
for col in cols:
    df_covid_post[col] = df_covid_post[col].apply(lambda x: 'yes' if x == 1 else 'no')

In [None]:
df_covid_post.head()

## Feature Engineering

### Making a new column name 'decease' to record the patientes that died.

In [None]:
df_covid_post['decease'] = df_covid_post.FECHA_DEF.apply(lambda x: 0 if x == '9999-99-99' else 1)

In [None]:
df_covid_post.drop(columns=['FECHA_DEF', 'CLASIFICACION_FINAL'], inplace=True)

In [None]:
df_covid_post.head()

In [None]:
df_covid_post.columns = df_covid_post.columns.str.lower()

In [None]:
df_covid_post.head(3)

## EDA

In [None]:
# Making list with teh categorical variables
cat_cols = ['sexo','neumonia','embarazo', 'diabetes', 'epoc',
       'asma', 'inmusupr', 'hipertension', 'cardiovascular', 'obesidad',
       'renal_cronica', 'tabaquismo']

### Univariate stats

### Numerical variables

In [None]:
df_covid_post.decease.value_counts()

In [None]:
# Set numerical variables to a format to only 6 values after the decimal point
pd.set_option('display.float_format', '{:.6f}'.format)

In [None]:
df_covid_post.edad.describe()

> *... El rango de edad fue de 0 a 113 años...* <br>
> The age range was from 0 to 113. Meaning that any row when the age is greater than 113 will be dropped

In [None]:
df_covid_post = df_covid_post.drop(df_covid_post[df_covid_post.edad > 113].index)

In [None]:
df_covid_post.edad.describe()

In [None]:
df_covid_post.edad.hist();

### Categorical variables

In [None]:
df_covid_post.describe(exclude='number')

In [None]:
fig, ax = plt.subplots(nrows = 4, ncols = 3, figsize =(8,12))
fig.tight_layout(pad=2.0)
i = 0
for row in range(4):
    for col in range(3):
        g = sns.countplot(x=cat_cols[i], data=df_covid_post, ax = ax[row,col], palette="tab10");
        g.set_xticklabels(g.get_xticklabels(), rotation=30)
        i += 1
plt.show()

### Bivariate Charts

#### Age vs decease

In [None]:
sns.boxenplot(data=df_covid_post, x='decease', y='edad', hue='sexo');
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);

### Categorical variables vs decease

In [None]:
fig, ax = plt.subplots(nrows = 4, ncols = 3, figsize =(8,12))
fig.tight_layout(pad=2.0)
i = 0
for row in range(4):
    for col in range(3):
        g = sns.countplot(x=cat_cols[i], data=df_covid_post, ax = ax[row,col], palette="tab10", hue='decease');
        g.set_xticklabels(g.get_xticklabels(), rotation=30)
        i += 1
plt.show()

### Decease risk ratio by group

In [None]:
global_decease = df_covid_post.decease.mean()
global_decease

In [None]:
from IPython.display import display

In [None]:
for c in cat_cols:
    print(c)
    df_group = df_covid_post.groupby(c).decease.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_decease
    df_group['risk'] = df_group['mean'] / global_decease
    display(df_group)
    print()
    print()

### Feature importance: Mutual information

In [None]:
from sklearn.metrics import mutual_info_score

In [None]:
# Function to get mutal information for all columns
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_covid_post.decease)

In [None]:
mi = df_covid_post[cat_cols].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

### Hypothesis tesiting for categorical variables, Chi square

In [None]:
from scipy.stats import chi2_contingency

In [None]:
# Function to run the test in all categorical variables
def chi2_analisys(df, dep_var, columns):
    """
    Arguments are a dataframe and the name of the Dependent variable as a string
    :param dataframe:
    :param dep_var:
    :return:
    """
    data_to_add = []
    for col in columns:
        temp_crosstab = pd.crosstab(df[dep_var], df[col])
        temp_result = chi2_contingency(temp_crosstab)
        if col != dep_var:
            if temp_result[1] < 0.05:
                # print(f'Dependent Var: {dep_var} ---- Independent Var: {col} ----- p-value={round(temp_result[1], 10)} -> We will reject the NULL hypothesis.')
                data_to_add.append([dep_var, col, round(temp_result[1], 10), 'Reject'])
            else:
                # print(f'Dependent Var: {dep_var} ---- Independent Var: {col} ----- p-value={round(temp_result[1], 10)} -> We will not reject the NULL hypothesis.')
                data_to_add.append([dep_var, col, round(temp_result[1], 10), 'Accept'])
    new_df = pd.DataFrame(data_to_add, columns=['Dependent', 'Independent', 'p-value', 'NULL Hypothesis'])
    return new_df.sort_values(by='NULL Hypothesis').reset_index(drop=True)

In [None]:
result = chi2_analisys(df_covid_post, 'decease', cat_cols)
result

### Prepare dataset for modeling

In [None]:
# Using OrdinalEncoder to encode binary categorical data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Making the encoder
encoder = OrdinalEncoder()

In [None]:
df_covid_post[cat_cols] = encoder.fit_transform(df_covid_post[cat_cols])

In [None]:
scaler = MinMaxScaler()

In [None]:
df_covid_post.edad = scaler.fit_transform(df_covid_post[['edad']])
df_covid_post.head()

### Dividing the data set in X and y to check for multicoliniarity

In [None]:
X = df_covid_post.iloc[:, :-1]
y = df_covid_post.iloc[:, -1]

### Checking for multicoliniarity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Making a DataFrame to save the values
vif_df = pd.DataFrame()

vif_df['features'] = X.columns

In [None]:
vif_df['VIF'] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))]

vif_df

> No multicoliniarity in the datasete

### Data prep for modeling

In [None]:
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=123)

In [None]:
X_train.shape, y_train.shape

In [None]:
X_val.shape, y_val.shape

In [None]:
X_test.shape, y_test.shape

## Libraries for all the model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, recall_score

### Logistic Regression

In [None]:
reg = LogisticRegression(solver='lbfgs', max_iter=10000, random_state=123)

In [None]:
reg.fit(X_train, y_train)

In [None]:
accuracy_score(y_val, reg.predict(X_val))

In [None]:
roc_auc_score(y_val, reg.predict_proba(X_val)[:,1])

In [None]:
reg_conf_matrix = confusion_matrix(y_val, reg.predict(X_val))
reg_conf_matrix

In [None]:
# Define class labels for binary classification
class_labels = ['No', 'Yes']

# Create a heatmap of the confusion matrix
plt.figure(figsize=(4, 3))
sns.heatmap(reg_conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

### Decision Tree Classifier

In [None]:
# Create the parameter for tunning the Tree Classifier
params = {
    'max_depth': [1, 2, 3, 5, 10, 15, 20, None],
    'min_samples_leaf': [1, 5, 10, 20, 50, 100, 200, 500],
    'criterion': ["gini", "entropy"]
}

In [None]:
dt = DecisionTreeClassifier(random_state=123)

In [None]:
grid_search = GridSearchCV(estimator=dt, param_grid=params, cv=5, n_jobs=-1, verbose=1, scoring = "f1")

In [None]:
%%time
grid_search.fit(X_train, y_train)

#### Best parameters

In [None]:
grid_search.best_estimator_

#### Final Tree

In [None]:
dt = DecisionTreeClassifier(random_state=123)

In [None]:
dt.fit(X_train, y_train)

In [None]:
accuracy_score(y_val, dt.predict(X_val))

In [None]:
roc_auc_score(y_val, dt.predict_proba(X_val)[:,1])

In [None]:
dt_conf_matrix = confusion_matrix(y_val, dt.predict(X_val))

In [None]:
# Define class labels for binary classification
class_labels = ['No', 'Yes']

# Create a heatmap of the confusion matrix
plt.figure(figsize=(4, 3))
sns.heatmap(dt_conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

### Create and fit the ComplementNB model

In [None]:
cnb = ComplementNB()

cnb.fit(X_train, y_train)

In [None]:
# Evaluate the model
accuracy = cnb.score(X_val, y_val)
print("Accuracy:", accuracy)

In [None]:
roc_auc_score(y_val, cnb.predict_proba(X_val)[:, 1])

In [None]:
recall_score(y_val, cnb.predict(X_val))

In [None]:
cnb_conf_matrix = confusion_matrix(y_val, cnb.predict(X_val))

# Define class labels for binary classification
class_labels = ['No', 'Yes']

# Create a heatmap of the confusion matrix
plt.figure(figsize=(4, 3))
sns.heatmap(cnb_conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
tn, fp, fn, tp  = confusion_matrix(y_val, cnb.predict(X_val)).ravel()

In [None]:
tn, fp, fn, tp 

### Random Forest Classifier

#### Finding the best number of estimators for the Random Forest

In [None]:
scores = []

for n in range(10, 101, 5):
    rfc = RandomForestClassifier(n_estimators=n, random_state=123)
    rfc.fit(X_train, y_train)

    y_pred = rfc.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    
    scores.append((n, auc))

In [None]:
df_scores = pd.DataFrame(scores, columns=['n_estimators', 'auc'])

In [None]:
plt.plot(df_scores.n_estimators, df_scores.auc);

### Finding the optimal max_depth for the Random Forest

In [None]:
scores_m = []

for d in [5, 10, 15]:
    for n in range(20, 70, 2):
        rf = RandomForestClassifier(n_estimators=n,
                                    max_depth=d,
                                    random_state=1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)

        scores_m.append((d, n, auc))

In [None]:
columns = ['max_depth', 'n_estimators', 'auc']
df_scores_new = pd.DataFrame(scores_m, columns=columns)

In [None]:
for d in [5, 10, 15]:
    df_subset = df_scores_new[df_scores_new.max_depth == d]
    
    plt.plot(df_subset.n_estimators, df_subset.auc,
             label='max_depth=%d' % d)

plt.legend()
plt.show()

In [None]:
df_scores_new.sort_values('auc', ascending=False).head()

### Finding the optimal min_sample_leaf for the Random Forest

In [None]:
scores_min_s = []

for s in [1, 3, 5, 10, 50]:
    for n in range(20, 70, 2):
        rf = RandomForestClassifier(n_estimators=n,
                                    max_depth=10,
                                    min_samples_leaf=s,
                                    random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)

        scores_min_s.append((s, n, auc))

In [None]:
columns = ['min_samples_leaf', 'n_estimators', 'auc']
df_scores_min = pd.DataFrame(scores_min_s, columns=columns)

In [None]:
colors = ['black', 'blue', 'orange', 'red', 'grey']
values = [1, 3, 5, 10, 50]

for s, col in zip(values, colors):
    df_subset = df_scores_min[df_scores_min.min_samples_leaf == s]
    
    plt.plot(df_subset.n_estimators, df_subset.auc,
             color=col,
             label='min_samples_leaf=%d' % s)

plt.legend()
plt.show()

In [None]:
df_scores_min.sort_values('auc', ascending=False).head()

In [None]:
params_rf = {
    'n_estimators': [20,30,40,50,60,70],
    'min_samples_leaf': [1,5,10,50],
    'max_depth': [5,10,15]
}

In [None]:
rfc_cv = RandomForestClassifier(random_state=123)

In [None]:
grid_search_rf = GridSearchCV(estimator=rfc_cv, param_grid=params_rf, cv=5, n_jobs=-1, verbose=1, scoring = "roc_auc")

In [None]:
grid_search_rf.fit(X_train, y_train)

In [None]:
print(grid_search_rf.best_estimator_)
print(grid_search_rf.best_score_)

### Building final Random Forest Model

In [None]:
rfc_final = RandomForestClassifier(n_estimators=60, 
                                   max_depth=15, 
                                   min_samples_leaf=50, 
                                   random_state=123, n_jobs=-1)

In [None]:
rfc_final.fit(X_train, y_train)

In [None]:
y_pred = rfc_final.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)

In [None]:
auc

In [None]:
rfc_final.score(X_val, y_val)

In [None]:
rfc_conf_matrix = confusion_matrix(y_val, rfc_final.predict(X_val))

# Define class labels for binary classification
class_labels = ['No', 'Yes']

# Create a heatmap of the confusion matrix
plt.figure(figsize=(4, 3))
sns.heatmap(rfc_conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
feature_names = X_val.columns 

In [None]:
feature_importances = rfc_final.feature_importances_

In [None]:
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)

In [None]:
feature_importance_df.sort_values('Importance', ascending=False)

### XGBOOST Classifier  (last model)

In [None]:
xgb_model = xgb.XGBClassifier()

#### Making a GridSearch to tune and get the best model

In [None]:
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 6, None],
    'learning_rate': [0.1, 0.3],
    'subsample': [0.8],
    'seed': [1],
    'objective': ['binary:logistic', 'binary:logitraw'],
    'nthread': [8],
}

In [None]:
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=4, scoring='f1')

In [None]:
# Perform the grid search to find the best model
grid_search_xgb.fit(X_train, y_train)

In [None]:
best_params = grid_search_xgb.best_params_
best_params

In [None]:
# Get the best hyperparameters and model
best_model = grid_search_xgb.best_estimator_

In [None]:
# Make predictions on the test set using the best model
y_pred_xgb = best_model.predict_proba(X_val)[:,1]

In [None]:
auc = roc_auc_score(y_val, y_pred_xgb)
auc

In [None]:
best_model.score(X_val, y_val)

In [None]:
confusion_matrix(y_val, best_model.predict(X_val))

In [None]:
xgb_conf_matrix = confusion_matrix(y_val, best_model.predict(X_val))

# Define class labels for binary classification
class_labels = ['No', 'Yes']

# Create a heatmap of the confusion matrix
plt.figure(figsize=(4, 3))
sns.heatmap(xgb_conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
tn, fp, fn, tp = confusion_matrix(y_val, best_model.predict(X_val)).ravel()

In [None]:
tn, fp, fn, tp

In [None]:
from sklearn.metrics import precision_score

In [None]:
precision_score(y_val, best_model.predict(X_val))