# Import and Read Data

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
import sys 
import os
import time
from copy import deepcopy
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn import model_selection
from sklearn.metrics.pairwise import normalize
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.inspection import plot_partial_dependence
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive so the CSV paths under that directory
# used by the following cells resolve (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load USAll.csv. Every column is read as `object`, so coded values such as
# '1' or '2' stay as the strings found in the file.
original_data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/NewFairML/data/USAll.csv',
    dtype=object,
)

In [None]:
# Load the ICD-10-selected records, again with every column read as `object`.
original_overdose_data_with_original_index = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/NewFairML/data/USAllCnty_SelectedByICD10.csv',
    dtype=object,
)
# Rename only the first column to 'Original_index'; all other headers are kept.
original_overdose_data_with_original_index.columns = [
    'Original_index',
    *original_overdose_data_with_original_index.columns[1:],
]
# Same rows without the 'Original_index' column.
original_overdose_data = original_overdose_data_with_original_index.drop(columns='Original_index')

In [None]:
# Keep the rows of `original_data` whose row label is NOT listed in the
# overdose file's 'Original_index' column.
#
# 'Original_index' was read with dtype=object, so it holds strings, while
# `original_data.index` is the default integer index. Comparing the two
# directly never matches, so the filter would silently keep every row
# (overdose records included). Convert the ids to numbers before matching.
# NOTE(review): assumes 'Original_index' stores row positions of USAll.csv — confirm.
overdose_row_labels = pd.to_numeric(original_overdose_data_with_original_index['Original_index'])
# .copy() yields an independent frame, so later column assignments on it do
# not raise SettingWithCopyWarning.
original_no_overdose_data = original_data[~original_data.index.isin(overdose_row_labels)].copy()

In [None]:
# Names of all columns whose dtype is not float. Every column was read with
# dtype=object, so at this point the list contains every column.
category_features = [
    column_name
    for column_name, column_dtype in original_data.dtypes.items()
    if column_dtype != float
]
category_features

['General_Record_Type',
 'General_Resident_status',
 'General_Place_of_death_and_decedents_status',
 'General_Day_of_week_of_death',
 'General_Data_year',
 'General_Manner_of_death',
 'Occurrence_State(FIPS)',
 'Occurrence_County(FIPS)',
 'Occurrence_County_Population_size',
 'Residence_State(FIPS)',
 'Residence_County(FIPS)',
 'Residence_Met_or_Nonmet_county',
 'The_Decedent_Date_of_Death',
 'The_Decedent_Sex',
 'The_Decedent_Age',
 'The_Decedent_Marital_status',
 'The_Decedent_State_or_country_of_birth',
 'The_Decedent_Education',
 'Underlying_Cause_ICD-10_code',
 'The_Decedent_Race_Recode_5',
 'The_Decedent_Hispanic_Origin/Race_Recode']

In [None]:
# Number of records per data year, most frequent year first.
records_per_year = original_data["General_Data_year"].value_counts()
records_per_year

2019    2833007
2018    2817121
2017    2789825
2016    2718635
2015    2686900
2014    2600327
2013    2570146
2012    2516220
2011    2487774
2010    2439904
Name: General_Data_year, dtype: int64

In [None]:
# Total number of rows, measured as the length of one column.
original_data["General_Data_year"].shape[0]

26459859

# Make the Values Interpretable

In [None]:
# Distribution of the race recode before the recoding done in the following cells.
race_recode_counts = original_data['The_Decedent_Race_Recode_5'].value_counts()
race_recode_counts

1    22542289
2     3096371
4      641850
3      179349
Name: The_Decedent_Race_Recode_5, dtype: int64

In [None]:
# Extend the 5-category race recode to a "5+1" scheme on original_data:
# rows whose Hispanic-origin/race recode is '1'..'5' get race code '6';
# every other row keeps its existing code.
# NOTE(review): presumably codes '1'..'5' mark Hispanic origin — confirm against the data dictionary.
original_data['The_Decedent_Race_Recode_5'] = original_data['The_Decedent_Race_Recode_5'].mask(
    original_data['The_Decedent_Hispanic_Origin/Race_Recode'].isin(['1', '2', '3', '4', '5']),
    '6',
)

In [None]:
# Apply the "5+1" race recode to original_no_overdose_data:
# rows whose Hispanic-origin/race recode is '1'..'5' get race code '6';
# every other row keeps its existing code.
# NOTE(review): presumably codes '1'..'5' mark Hispanic origin — confirm against the data dictionary.
#
# This frame was produced by boolean filtering of original_data, so writing a
# column into it in place raises SettingWithCopyWarning. Build the updated
# frame with .assign() and rebind the name instead.
original_no_overdose_data = original_no_overdose_data.assign(
    The_Decedent_Race_Recode_5=np.where(
        original_no_overdose_data['The_Decedent_Hispanic_Origin/Race_Recode'].isin(['1', '2', '3', '4', '5']),
        '6',
        original_no_overdose_data['The_Decedent_Race_Recode_5'],
    )
)

In [None]:
# Apply the "5+1" race recode to original_overdose_data_with_original_index:
# rows whose Hispanic-origin/race recode is '1'..'5' get race code '6';
# every other row keeps its existing code.
# NOTE(review): presumably codes '1'..'5' mark Hispanic origin — confirm against the data dictionary.
original_overdose_data_with_original_index['The_Decedent_Race_Recode_5'] = (
    original_overdose_data_with_original_index['The_Decedent_Race_Recode_5'].mask(
        original_overdose_data_with_original_index['The_Decedent_Hispanic_Origin/Race_Recode'].isin(
            ['1', '2', '3', '4', '5']
        ),
        '6',
    )
)


In [None]:
# Apply the "5+1" race recode to original_overdose_data:
# rows whose Hispanic-origin/race recode is '1'..'5' get race code '6';
# every other row keeps its existing code.
# NOTE(review): presumably codes '1'..'5' mark Hispanic origin — confirm against the data dictionary.
original_overdose_data['The_Decedent_Race_Recode_5'] = original_overdose_data['The_Decedent_Race_Recode_5'].mask(
    original_overdose_data['The_Decedent_Hispanic_Origin/Race_Recode'].isin(['1', '2', '3', '4', '5']),
    '6',
)

In [None]:
# Add 'The_Decedent_Race' as a duplicate of the (recoded) race column.
original_data = original_data.assign(
    The_Decedent_Race=lambda frame: frame['The_Decedent_Race_Recode_5']
)

In [None]:
def _add_race_column(frame):
    """Return a copy of ``frame`` with 'The_Decedent_Race' duplicated from 'The_Decedent_Race_Recode_5'."""
    return frame.assign(The_Decedent_Race=frame['The_Decedent_Race_Recode_5'])


# Give the three derived frames the same 'The_Decedent_Race' column.
original_no_overdose_data = _add_race_column(original_no_overdose_data)
original_overdose_data_with_original_index = _add_race_column(original_overdose_data_with_original_index)
original_overdose_data = _add_race_column(original_overdose_data)

In [None]:
# Item e. Age (70-82 — presumably the field's position in the source record layout; confirm).
# Distinct values present in the age column.
age_codes = original_data['The_Decedent_Age'].unique()
age_codes

array(['1068', '1012', '1075', '1061', '1046', '1066', '1080', '1096',
       '1079', '1083', '1062', '1065', '1048', '1072', '1047', '1090',
       '1064', '1098', '1088', '1091', '1094', '1076', '1060', '1073',
       '1078', '1043', '1093', '1089', '1025', '1067', '1092', '1050',
       '1087', '1085', '1041', '1082', '1071', '1057', '1036', '1084',
       '1058', '1086', '1069', '1051', '1099', '1059', '1081', '1042',
       '1023', '1026', '1049', '1019', '1052', '1054', '1039', '1056',
       '1028', '1014', '1035', '1021', '1074', '1040', '1027', '1038',
       '1095', '1070', '1063', '1044', '1101', '1055', '1053', '1029',
       '1033', '1045', '1034', '1020', '1037', '1077', '1100', '1030',
       '1031', '1024', '1017', '1032', '1016', '1022', '1097', '1018',
       '1015', '1013', '1103', '1104', '1109', '1106', '1102', '1105',
       '1108', '1110', '1107', '1999', '1111', '1112', '1114', '1115',
       '1113', '1116', '1117', '1126', '1118', '1119'], dtype=object)

In [None]:
# Item i. Education (61-64 — presumably the field's position in the source record layout; confirm).
# Distinct values present in the education column.
education_codes = original_data['The_Decedent_Education'].unique()
education_codes

array(['2', '1', '4', '9', '3', '8', '5', '6', '7'], dtype=object)

In [None]:
# Distinct values present in 'Occurrence_County_Population_size'.
population_size_codes = original_data['Occurrence_County_Population_size'].unique()
population_size_codes

array(['2', '4', '5', '9', '6', '1', '3', '0'], dtype=object)

In [None]:
# Recompute the non-float column list now that 'The_Decedent_Race' exists.
# All columns are still `object`, so every column is listed.
category_features = [
    column_name
    for column_name, column_dtype in original_data.dtypes.items()
    if column_dtype != float
]
category_features

['General_Record_Type',
 'General_Resident_status',
 'General_Place_of_death_and_decedents_status',
 'General_Day_of_week_of_death',
 'General_Data_year',
 'General_Manner_of_death',
 'Occurrence_State(FIPS)',
 'Occurrence_County(FIPS)',
 'Occurrence_County_Population_size',
 'Residence_State(FIPS)',
 'Residence_County(FIPS)',
 'Residence_Met_or_Nonmet_county',
 'The_Decedent_Date_of_Death',
 'The_Decedent_Sex',
 'The_Decedent_Age',
 'The_Decedent_Marital_status',
 'The_Decedent_State_or_country_of_birth',
 'The_Decedent_Education',
 'Underlying_Cause_ICD-10_code',
 'The_Decedent_Race_Recode_5',
 'The_Decedent_Hispanic_Origin/Race_Recode',
 'The_Decedent_Race']

In [None]:
# Distinct race codes after the recode; '6' should now be among them.
recoded_race_codes = original_data['The_Decedent_Race_Recode_5'].unique()
recoded_race_codes

array(['3', '1', '4', '2', '6'], dtype=object)

In [None]:
# Persist the recoded full table. index=False leaves the row index out of the file.
# NOTE(review): written under 'My Drive' while the inputs were read from 'MyDrive' —
# confirm both resolve to the same Drive root.
original_data.to_csv(
    path_or_buf='/content/drive/My Drive/original_data.csv',
    index=False,
)

In [None]:
# Persist the non-overdose subset. index=False means the row labels carried
# over from original_data are not written to the file.
original_no_overdose_data.to_csv(
    path_or_buf='/content/drive/My Drive/original_no_overdose_data.csv',
    index=False,
)

In [None]:
# Persist the overdose subset. 'Original_index' is an ordinary column here, so
# it is still written even though index=False omits the row index.
original_overdose_data_with_original_index.to_csv(
    path_or_buf='/content/drive/My Drive/original_overdose_data_with_original_index.csv',
    index=False,
)

In [None]:
# Persist the overdose subset without the 'Original_index' column; the row
# index is omitted as well (index=False).
original_overdose_data.to_csv(
    path_or_buf='/content/drive/My Drive/original_overdose_data.csv',
    index=False,
)