In [33]:
import pandas as pd
import numpy as np
import math as m
import random as rand
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
from sklearn import linear_model as lm, metrics, ensemble as ens
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
import random
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")


# Clinical Data updated_ Alex

In [34]:
#DEFINING A FUNCTION TO UPDATE COLUMN NAMES LATER
def lower_no_space(word): 
    
    word = re.sub(' ', '_', word) 
    
    word = re.sub(r'\'', '', word) 
    
    word = re.sub(r'\(', '', word)
    
    word = re.sub(r'\)', '', word)
    
    word = re.sub('\?', '', word)
    
    word = re.sub('/', '_', word)
    
    word = word.lower()
    
    return word

In [35]:
#READ IN Updated CLINICAL DATA FOR LATER USE (CONVERTED TO .csv IN GOOGLE SHEETS)
df_clin_updated = pd.read_csv("Homebase_new_updated.csv", header = 1)

In [36]:
#RENAMING COLUMNS
df_clin_updated = df_clin_updated.rename(mapper = lower_no_space, axis = 1) 
df_clin_updated.rename(columns={'subject_sample_id':'sample_id'}, inplace=True)

In [37]:
#CHECK CLINICAL DATA BASICS
df_clin_updated.head()

Unnamed: 0,patient_id,data_access_group,survey_identifier,survey_timestamp,sample_id,date_of_birth,current_age_at_time_of_data_entry_-_can_answer_instead_of_dob,country_of_origin,country_of_residence,gender,...,age_at_date_of_relapse_disease_progression_4_can_answer_instead_of_date_of_relapse_disease_progression_4,date_of_relapse_disease_progression.4,age_at_date_of_relapse_disease_progression_5_can_answer_instead_of_date_of_relapse_disease_progression_5,disease_status_at_time_of_sampling,disease_status_at_time_of_data_entry,date_of_death,age_at_death_can_answer_instead_of_date_of_death,death_related_to_disease,cause_of_death,complete
0,795-1,MD Anderson,,,spz-20,1933-03-08,,,United States of America,Female,...,,,,Progressive disease,Deceased,2013-11-06,,Yes,,Complete
1,795-2,MD Anderson,,,spz-3,1930-09-04,,,United States of America,Female,...,,"12-16-2013, 05-19-2014, 09-16-2014, 09-09-2015",,Progressive disease,Deceased,2015-10-26,,Yes,,Complete
2,795-3,MD Anderson,,,spz-4,1926-12-27,,,United States of America,Male,...,,,,Stable disease,Deceased,2014-09-14,,Yes,,Complete
3,795-4,MD Anderson,,,spz-8,1945-08-17,,,United States of America,Male,...,,,,Progressive disease,Deceased,2018-01-05,,No,,Complete
4,795-5,MD Anderson,,,spz-10,1925-04-28,,,United States of America,Male,...,,,,Progressive disease,Deceased,2017-09-25,,No,,Complete


In [38]:
#Compute the age at initial diagnosis from date of birth and date_of_initial_diagnosis
df_clin_updated['date_of_birth'] = pd.to_datetime(df_clin_updated['date_of_birth'])
df_clin_updated['date_of_initial_diagnosis'] = pd.to_datetime(df_clin_updated['date_of_initial_diagnosis'])
df_clin_updated["age_at_initial_diagnosis"] = (pd.DatetimeIndex(df_clin_updated['date_of_initial_diagnosis']).year 
                        - pd.DatetimeIndex(df_clin_updated['date_of_birth']).year)

In [39]:
#Due to the abnormal in date of birth from the Stanford data, 
#Remove the age at initial diagonosis for data from Stanford & the one that has negative age 
df_clin_updated["age_at_initial_diagnosis"] = np.where(df_clin_updated['data_access_group'] == 'Stanford', np.nan, df_clin_updated["age_at_initial_diagnosis"])
df_clin_updated["age_at_initial_diagnosis"] = np.where(df_clin_updated["age_at_initial_diagnosis"] < 0, np.nan, df_clin_updated["age_at_initial_diagnosis"])


In [40]:
#Change the data type: date_of_birth, n, m 
df_clin_updated = df_clin_updated.astype({'t':'object', 'b':'object'})


In [41]:
#TONS OF DATA, PULL WHAT WE WANT
df_clin_updated_lean = df_clin_updated.drop(columns = [x for x in df_clin_updated.columns if x not in ['gender', 'race', \
                                       'country_of_residence', 'sample_id', 'ethnicity',\
                                        'age_at_initial_diagnosis',\
                                        'lymph_node_biopsy_performed','predominant_lesion_type_at_diagnosis',\
                                        'family_history_of_leukemia_lymphoma', 't', 'n', 'm', 'b',\
                                        'has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical',\
                                        'cd4+:cd8+_ratio', 'total_lymphocyte_count', 'absolute_cd4+_count_per_ul',\
                                        '%cd4+cd26-', '%cd4+cd7-', 'tcr_clonality', 'tumor_cell_cd30+',\
                                        'large_cell_transformation', 'ldh_u_l', 'wbc_103_μl', 'rbc_106_μl',\
                                        'hematocrit_%', 'mcv_fl', 'mchc_g_dl', 'rdw_%', 'platelet_count_103_μl',\
                                        'segmented_neutrophil,_absolute_103_μl', 'lymphocyte,_absolute_103_μl',\
                                        'monocytes,_absolute_103_μl', 'eosinophils,_absolute_103_μl',\
                                        'basophils,_absolute_103_μl', 'segmented_neutrophils_%', 'lymphocytes_%',\
                                        'monocytes_%', 'eosinophils_%', 'basophils_%']])

# 'predominant_lesion_type_at_diagnosis', 
# 't', 'n', 'm', 'b', 
# 'eosinophils,_absolute_103_μl'


#wbc_103mul
#rbc_106ml
#nuetrophil_number

In [42]:
# TURN YES/NO & POSITIVE/NEGATIVE TO DUMMIES
df_clin_updated_lean['lymph_node_biopsy_performed'] = \
df_clin_updated_lean['lymph_node_biopsy_performed'].apply(lambda x: 1 if x == 'Yes' else 0)

df_clin_updated_lean['family_history_of_leukemia_lymphoma'] = \
df_clin_updated_lean['family_history_of_leukemia_lymphoma'].apply(lambda x: 1 if x == 'Yes' else 0)

df_clin_updated_lean['tumor_cell_cd30+'] = \
df_clin_updated_lean['tumor_cell_cd30+'].apply(lambda x: 1 if x == 'Yes' else 0)

df_clin_updated_lean['large_cell_transformation'] = \
df_clin_updated_lean['large_cell_transformation'].apply(lambda x: 1 if x == 'Yes' else 0)

df_clin_updated_lean['tcr_clonality'] = \
df_clin_updated_lean['tcr_clonality'].apply(lambda x: 1 if x == 'Positive' else 0)

df_clin_updated_lean['has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical'] = \
df_clin_updated_lean['has_the_patient_ever_been_exposed_at_work_or_in_the_service_to_a_toxic_chemical'].apply(lambda x: 1 if x == 'Yes' else 0)

### df_lean: Preprocessed Genetic Data

In [43]:
# Read in the Preprocessed Genetic Data
df_lean = pd.read_csv ('stats_by_sample.csv')

In [44]:
#CHECK CLINICAL DATA BASICS
df_clin_updated.head()

Unnamed: 0,patient_id,data_access_group,survey_identifier,survey_timestamp,sample_id,date_of_birth,current_age_at_time_of_data_entry_-_can_answer_instead_of_dob,country_of_origin,country_of_residence,gender,...,date_of_relapse_disease_progression.4,age_at_date_of_relapse_disease_progression_5_can_answer_instead_of_date_of_relapse_disease_progression_5,disease_status_at_time_of_sampling,disease_status_at_time_of_data_entry,date_of_death,age_at_death_can_answer_instead_of_date_of_death,death_related_to_disease,cause_of_death,complete,age_at_initial_diagnosis
0,795-1,MD Anderson,,,spz-20,1933-03-08,,,United States of America,Female,...,,,Progressive disease,Deceased,2013-11-06,,Yes,,Complete,73.0
1,795-2,MD Anderson,,,spz-3,1930-09-04,,,United States of America,Female,...,"12-16-2013, 05-19-2014, 09-16-2014, 09-09-2015",,Progressive disease,Deceased,2015-10-26,,Yes,,Complete,81.0
2,795-3,MD Anderson,,,spz-4,1926-12-27,,,United States of America,Male,...,,,Stable disease,Deceased,2014-09-14,,Yes,,Complete,86.0
3,795-4,MD Anderson,,,spz-8,1945-08-17,,,United States of America,Male,...,,,Progressive disease,Deceased,2018-01-05,,No,,Complete,66.0
4,795-5,MD Anderson,,,spz-10,1925-04-28,,,United States of America,Male,...,,,Progressive disease,Deceased,2017-09-25,,No,,Complete,87.0


In [45]:
#TRANSFORM SAMPLE ID TOJOIN TO CLINICAL DATA
df_lean['sample_id'] = df_lean['sample_id'].apply(lambda x: re.sub('_', '-', x[:5]) if 'WES' in x else\
                                                  (x[:-10] if 'CTCL' in x else \
                                                  (x[:-13] if 'almeida' in x else\
                                                  ((x[-2:]+x[:-2])[:-15] if 'ungewickell' in x else\
                                                  ('-'.join([ele.lstrip('0').lower() for ele in x[:-10].split('-')]) if 'SPZ' in x else x)))))

# Merge (Updated)

In [46]:
#MERGE tbe updated CLINICAL, GENETIC DATA
df_all_updated = pd.merge(df_lean, df_clin_updated_lean, on='sample_id', how='left')

In [47]:
#IMPUTATION; "UNKNOWN" FOR CATEGORICAL, MEAN FILL-IN FOR CONTINUOUS
for col in df_clin_updated_lean.columns:
    if col in ['race', 'gender', 'country_of_residence', 'ethnicity', 'predominant_lesion_type_at_diagnosis', 't', 
              'n', 'm', 'b']:
        df_all_updated[col] = df_all_updated[col].fillna('unknown')
    elif col != 'sample_id':
        df_all_updated[col] = df_all_updated[col].fillna(np.mean(df_all_updated[col]))

In [48]:
#GET DUMMIES FOR CATEGORICALS
df_all_updated = pd.get_dummies(df_all_updated, columns = ['race', 'gender', 'country_of_residence', 'ethnicity', 
                                                          'predominant_lesion_type_at_diagnosis', 't', 'n', 'm', 'b'])


In [49]:
df_all_updated['sample_id'].unique()

array(['SS11', 'SS15', 'MF16', 'SS1', 'SS4', 'MF52', 'SS54', 'MF5', 'MF7',
       'MF8', 'MF9', 'CTCL10', 'CTCL11', 'CTCL12', 'CTCL13', 'CTCL15',
       'CTCL16', 'CTCL17', 'CTCL18', 'CTCL19', 'CTCL1', 'CTCL20',
       'CTCL21', 'CTCL22', 'CTCL23', 'CTCL29', 'CTCL2', 'CTCL30',
       'CTCL31', 'CTCL32', 'CTCL34', 'CTCL35', 'CTCL36', 'CTCL38',
       'CTCL39', 'CTCL3', 'CTCL40', 'CTCL4', 'CTCL5', 'CTCL6', 'CTCL7',
       'CTCL8', 'CTCL9', 'CTCL_NU11__a', 'CTCL_NU18__a', 'CTCL_NU19__a',
       'CTCL_NU20__a', 'CTCL_NU2__a', 'CTCL_NU4__a', 'CTCL_NU7__a',
       'CTCL_NU9__a', 'Patient_1__mcgirt__MF', 'Patient_2__mcgirt__MF',
       'Patient_3__mcgirt__MF', 'Patient_4__mcgirt__MF',
       'Patient_5__mcgirt__MF', 'spz-1', 'spz-2', 'spz-3', 'spz-4',
       'spz-5', 'spz-6', 'spz-7', 'spz-8', 'spz-9', 'spz-10', 'spz-11',
       'spz-12', 'spz-13', 'spz-14', 'spz-15', 'spz-16', 'spz-17',
       'spz-18', 'spz-19', 'spz-20', 'spz-21', 'spz-22', 'spz-23',
       'spz-24', 'spz-25', 'spz-26', 's

In [87]:
df_lean.shape

(139, 22753)

# Defining Features and Labels - For updated data

## Yale

In [50]:
X_train_y = df_all_updated[df_all_updated['sample_id'].str.contains("CTCL")== False]
X_test_y = df_all_updated[df_all_updated['sample_id'].str.contains("CTCL")== True]

In [51]:
#DEFINE STANDARDSCALER FOR LATER USE
std_scl = StandardScaler()

In [52]:
# Define (Scaled/Normalized) Features and Labels
X_new_y = X_train_y.drop(columns = [x for x in df_all_updated.columns if x == 'outcome' or x == 'sample_id'])
X_new_scaled_y = std_scl.fit_transform(X_new_y)
X_new_norm_y = normalize(X_new_y)

y_new_y = X_train_y.drop(columns = [x for x in df_all_updated.columns if x != 'outcome'])

In [53]:
X_test_yale = X_test_y.drop(columns = [x for x in df_all_updated.columns if x == 'outcome' or x == 'sample_id'])
X_test_scaled_y = std_scl.transform(X_test_yale)
X_test_norm_y = normalize(X_test_yale)

y_test_yale = X_test_y.drop(columns = [x for x in df_all_updated.columns if x != 'outcome'])

In [54]:
X_train_y['outcome'].value_counts(normalize = True)

1    0.8
0    0.2
Name: outcome, dtype: float64

In [55]:
X_test_y['outcome'].value_counts(normalize = True)

1    0.8
0    0.2
Name: outcome, dtype: float64

## IMM - Barcelona

In [56]:
X_train_b = df_all_updated[df_all_updated['sample_id'].str.contains("prasad")== False]
X_test_b = df_all_updated[df_all_updated['sample_id'].str.contains("prasad")== True]

# Define (Scaled/Normalized) Features and Labels
X_new_b = X_train_b.drop(columns = [x for x in df_all_updated.columns if x == 'outcome' or x == 'sample_id'])
X_new_scaled_b = std_scl.fit_transform(X_new_b)
X_new_norm_b = normalize(X_new_b)
y_new_b = X_train_b.drop(columns = [x for x in df_all_updated.columns if x != 'outcome'])


X_test_bar = X_test_b.drop(columns = [x for x in df_all_updated.columns if x == 'outcome' or x == 'sample_id'])
X_test_scaled_bar = std_scl.transform(X_test_bar)
X_test_norm_bar = normalize(X_test_bar)
y_test_bar = X_test_b.drop(columns = [x for x in df_all_updated.columns if x != 'outcome'])

In [57]:
X_train_b['outcome'].value_counts(normalize = True)

1    0.78125
0    0.21875
Name: outcome, dtype: float64

In [58]:
X_test_b['outcome'].value_counts(normalize = True)

1    1.0
Name: outcome, dtype: float64

## Kings College

In [80]:
X_train_k = df_all_updated[df_all_updated['sample_id'].str.contains("WES")== False]
X_test_k = df_all_updated[df_all_updated['sample_id'].str.contains("WES")== True]

# Define (Scaled/Normalized) Features and Labels
X_new_k = X_train_k.drop(columns = [x for x in df_all_updated.columns if x == 'outcome' or x == 'sample_id'])
X_new_scaled_k = std_scl.fit_transform(X_new_k)
X_new_norm_k = normalize(X_new_k)
y_new_k = X_train_k.drop(columns = [x for x in df_all_updated.columns if x != 'outcome'])


X_test_kings = X_test_k.drop(columns = [x for x in df_all_updated.columns if x == 'outcome' or x == 'sample_id'])
X_test_scaled_kings = std_scl.transform(X_test_kings)
X_test_norm_kings = normalize(X_test_kings)
y_test_kings = X_test_k.drop(columns = [x for x in df_all_updated.columns if x != 'outcome'])

In [88]:
X_train_k['outcome'].value_counts(normalize = True)

1    0.784615
0    0.215385
Name: outcome, dtype: float64

In [89]:
X_test_k['outcome'].value_counts(normalize = True)

1    1.0
Name: outcome, dtype: float64

In [83]:
print(len(y_test_kings))
print(len(y_test_bar))
print(len(y_test_yale))

10
12
40


# Evaluate

In [63]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

### AdaBoost

#### Yale

In [64]:
#AdaBoost n_estimator = 400
from sklearn.ensemble import AdaBoostClassifier

model_ada = AdaBoostClassifier(n_estimators=400, learning_rate = 1, algorithm = "SAMME.R")

model_ada.fit(X_new_y, y_new_y)
y_pred = model_ada.predict(X_test_yale)

ada_acc_scores = accuracy_score(y_test_yale, y_pred)
ada_prec_scores = precision_score(y_test_yale, y_pred)

print("AdaBoost without PCA")
print('Accuracy: ', ada_acc_scores)
print('Precision: ', ada_prec_scores)


AdaBoost without PCA
Accuracy:  0.4
Precision:  1.0


In [65]:
y_pred

array([0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [66]:
np.ravel(y_test_yale)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [67]:
print(len(y_test_yale))

40


#### Barcelona

In [68]:
#AdaBoost n_estimator = 400
from sklearn.ensemble import AdaBoostClassifier

model_ada = AdaBoostClassifier(n_estimators=400, learning_rate = 1, algorithm = "SAMME.R")

model_ada.fit(X_new_b, y_new_b)
y_pred = model_ada.predict(X_test_bar)

ada_acc_scores = accuracy_score(y_test_bar, y_pred)
ada_prec_scores = precision_score(y_test_bar, y_pred)

print("AdaBoost without PCA")
print('Accuracy: ', ada_acc_scores)
print('Precision: ', ada_prec_scores)

AdaBoost without PCA
Accuracy:  1.0
Precision:  1.0


#### Kings College

In [69]:
#AdaBoost n_estimator = 400
from sklearn.ensemble import AdaBoostClassifier

model_ada = AdaBoostClassifier(n_estimators=400, learning_rate = 1, algorithm = "SAMME.R")

model_ada.fit(X_new_k, y_new_k)
y_pred = model_ada.predict(X_test_kings)

ada_acc_scores = accuracy_score(y_test_kings, y_pred)
ada_prec_scores = precision_score(y_test_kings, y_pred)

print("AdaBoost without PCA")
print('Accuracy: ', ada_acc_scores)
print('Precision: ', ada_prec_scores)

AdaBoost without PCA
Accuracy:  1.0
Precision:  1.0


### Ramdom Forest

#### Yale

In [70]:
steps = [('pca', PCA(n_components=12)), ('m', ens.RandomForestClassifier(n_estimators = 500, criterion = "entropy"))]
model_rf = Pipeline(steps = steps)

model_rf.fit(X_new_y, y_new_y)
y_pred = model_rf.predict(X_test_yale)

rf_acc_scores = accuracy_score(y_test_yale, y_pred)
rf_prec_scores = precision_score(y_test_yale, y_pred)

print("Random Forest")
print('n_components: 12, accuracy: ', rf_acc_scores)
print('n_components: 12, precision: ', rf_prec_scores)

Random Forest
n_components: 12, accuracy:  0.575
n_components: 12, precision:  1.0


In [71]:
y_pred

array([0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [72]:
np.ravel(y_test_yale)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

#### Barcelona

In [73]:
# RF
steps = [('pca', PCA(n_components=12)), ('m', ens.RandomForestClassifier(n_estimators = 500, criterion = "entropy"))]
model_rf = Pipeline(steps = steps)

model_rf.fit(X_new_b, y_new_b)
y_pred = model_rf.predict(X_test_bar)

rf_acc_scores = accuracy_score(y_test_bar, y_pred)
rf_prec_scores = precision_score(y_test_bar, y_pred)

print("Random Forest")
print('n_components: 12, accuracy: ', rf_acc_scores)
print('n_components: 12, precision: ', rf_prec_scores)

Random Forest
n_components: 12, accuracy:  0.8333333333333334
n_components: 12, precision:  1.0


#### Kings College

In [74]:
# RF
steps = [('pca', PCA(n_components=12)), ('m', ens.RandomForestClassifier(n_estimators = 500, criterion = "entropy"))]
model_rf = Pipeline(steps = steps)

model_rf.fit(X_new_k, y_new_k)
y_pred = model_rf.predict(X_test_kings)

rf_acc_scores = accuracy_score(y_test_kings, y_pred)
rf_prec_scores = precision_score(y_test_kings, y_pred)

print("Random Forest")
print('n_components: 12, accuracy: ', rf_acc_scores)
print('n_components: 12, precision: ', rf_prec_scores)

Random Forest
n_components: 12, accuracy:  1.0
n_components: 12, precision:  1.0


### XGBoost

#### Yale

In [75]:
#XGBoost
from numpy import loadtxt
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

model_xg = XGBClassifier(eta = 0.1, max_depth = 4, scale_pos_weight = 0.25, 
                         eval_metric = "error", use_label_encoder = False)

model_xg.fit(X_new_y, y_new_y)
y_pred = model_xg.predict(X_test_yale)

xg_acc_scores = accuracy_score(y_test_yale, y_pred)
xg_prec_scores = precision_score(y_test_yale, y_pred)

print("XGBoost without PCA")
print('Accuracy: ', xg_acc_scores)
print('Precision: ', xg_prec_scores)

XGBoost without PCA
Accuracy:  0.45
Precision:  1.0


In [76]:
y_pred

array([0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [77]:
np.ravel(y_test_yale)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0])

#### Barcelona

In [78]:
warnings.filterwarnings("ignore")

model_xg = XGBClassifier(eta = 0.1, max_depth = 4, scale_pos_weight = 0.25, 
                         eval_metric = "error", use_label_encoder = False)

model_xg.fit(X_new_b, y_new_b)
y_pred = model_xg.predict(X_test_bar)

xg_acc_scores = accuracy_score(y_test_bar, y_pred)
xg_prec_scores = precision_score(y_test_bar, y_pred)

print("XGBoost without PCA")
print('Accuracy: ', xg_acc_scores)
print('Precision: ', xg_prec_scores)

XGBoost without PCA
Accuracy:  1.0
Precision:  1.0


#### Kings College

In [79]:
warnings.filterwarnings("ignore")

model_xg = XGBClassifier(eta = 0.1, max_depth = 4, scale_pos_weight = 0.25, 
                         eval_metric = "error", use_label_encoder = False)

model_xg.fit(X_new_k, y_new_k)
y_pred = model_xg.predict(X_test_kings)

xg_acc_scores = accuracy_score(y_test_kings, y_pred)
xg_prec_scores = precision_score(y_test_kings, y_pred)

print("XGBoost without PCA")
print('Accuracy: ', xg_acc_scores)
print('Precision: ', xg_prec_scores)

XGBoost without PCA
Accuracy:  1.0
Precision:  1.0
