In [1]:
import pandas as pd
import numpy as np
from prince import FAMD
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

In [2]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [3]:
# Read in DataFrame
df = pd.read_csv('NYSDOH_BRFSS_SurveyData_2020.csv', encoding = 'cp1252')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# View first 5 rows of the data

df.head(5)

Unnamed: 0,_STATE,IDATE,IMONTH,IDAY,IYEAR,SEQNO,_PSU,COLGSEX,LANDSEX,RESPSLCT,...,PPS_40,PPS_43,PPS_44,PPS_45,PPS_46,PPS_48,PPS_52,SEXNOCON_OTH,NOVEGFRU_OTH,CHILDREN
0,New York,1152020,1,15,2020,2020002528,2020002528,Not asked or Missing,Not asked or Missing,Male,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
1,New York,1302020,1,30,2020,2020002529,2020002529,Not asked or Missing,Not asked or Missing,Male,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
2,New York,1152020,1,15,2020,2020002530,2020002530,Not asked or Missing,Female,Not asked or Missing,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
3,New York,2032020,2,3,2020,2020004509,2020004509,Not asked or Missing,Not asked or Missing,Female,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing
4,New York,2152020,2,15,2020,2020002531,2020002531,Not asked or Missing,Not asked or Missing,Female,...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,Data do not meet the criteria for statistical ...,,,Not asked or Missing


In [5]:
# Use df_clean for cleaning
df_clean = df.copy()

# Repalce 'Not asked or Missing' and 'Data do not meet the criteria for statistical reliability, 
# data quality or confidentiality (data are suppressed)' with NA
for col in df.columns:
    df_clean[col].replace({'Not asked or Missing' : np.nan}, inplace = True)
    df_clean[col].replace({'Data do not meet the criteria for statistical reliability, data quality or confidentiality (data are suppressed)' : np.nan}, inplace = True)
    
# Drop columns with over 80% missing values
df_clean.dropna(axis = 1, thresh = len(df_clean) * .50, inplace = True)

In [6]:
# View first 5 rows of DataFrame after cleaning out missing values
df_clean.head(5)

Unnamed: 0,_STATE,IDATE,IMONTH,IDAY,IYEAR,SEQNO,_PSU,CELLSEX,SEXVAR,GENHLTH,...,WTRSOURCE,STRSMEAL,FRUITVEG,MJUSE30,MJNYSMMP,HEPCTEST,HEPCTOLD,_WT2SPLITS,REGION,DSRIPREG
0,New York,1152020,1,15,2020,2020002528,2020002528,,Male,Very good,...,Public Water Supply,Always,,,No,No,No,5130.843243,NYS exclusive of NYC,Long Island
1,New York,1302020,1,30,2020,2020002529,2020002529,,Male,Very good,...,Public Water Supply,Never,,,No,No,No,941.550458,NYS exclusive of NYC,Long Island
2,New York,1152020,1,15,2020,2020002530,2020002530,,Female,Good,...,Public Water Supply,Never,,,No,No,No,3497.366203,NYS exclusive of NYC,Long Island
3,New York,2032020,2,3,2020,2020004509,2020004509,,Female,Good,...,Public Water Supply,Never,Neighborhood,,No,,,1187.70903,NYS exclusive of NYC,Long Island
4,New York,2152020,2,15,2020,2020002531,2020002531,,Male,Fair,...,Don't Know/Not Sure,Rarely,,,No,No,No,13364.387863,NYS exclusive of NYC,Long Island


In [7]:
# Select relevant columns related to heart disease by utilizing resources detailing factors of heart disease
# shorturl.at/oqwF5 - Behavioral risk factors of coronary artery disease: A paired matched case control study
# shorturl.at/cpAXZ - Strategies to prevent heart disease
# shorturl.at/gpwAR - Top five habits that harm the heart
# shorturl.at/mtJUZ - 9 Common Habits That Are Bad for Your Heart

list(df_clean.columns)

['_STATE',
 'IDATE',
 'IMONTH',
 'IDAY',
 'IYEAR',
 'SEQNO',
 '_PSU',
 'CELLSEX',
 'SEXVAR',
 'GENHLTH',
 'PHYSHLTH',
 'MENTHLTH',
 'HLTHPLN1',
 'PERSDOC2',
 'MEDCOST',
 'CHECKUP1',
 'EXERANY2',
 'SLEPTIM1',
 'CVDINFR4',
 'CVDCRHD4',
 'CVDSTRK3',
 'ASTHMA3',
 'CHCSCNCR',
 'CHCOCNCR',
 'CHCCOPD2',
 'HAVARTH4',
 'ADDEPEV3',
 'CHCKDNY2',
 'DIABETE4',
 'LASTDEN4',
 'RMVTETH4',
 'AGE',
 'HISPANC3',
 'MRACE1',
 'MARITAL',
 'EDUCA',
 'RENTHOM1',
 'VETERAN3',
 'EMPLOY1',
 'INCOME2',
 'WEIGHT2',
 'HEIGHT3',
 'DEAF',
 'BLIND',
 'DECIDE',
 'DIFFWALK',
 'DIFFDRES',
 'DIFFALON',
 'SMOKE100',
 'USENOW3',
 'ALCDAY5',
 'AVEDRNK3',
 'DRNK3GE5',
 'MAXDRNKS',
 'FLUSHOT7',
 'SHINGLE2',
 'PNEUVAC4',
 'FALL12MN',
 'FALLINJ4',
 'SEATBELT',
 'DRNKDRI2',
 'COLNSCPY',
 'SIGMSCPY',
 'BLDSTOL1',
 'STOOLDNA',
 'VIRCOLON',
 'HIVTST7',
 'HIVRISK5',
 'PDIABTST',
 'PREDIAB1',
 'ECIGARET',
 'CNCRAGE',
 'CNCRTYP1',
 'BIRTHSEX',
 'SOFEMALE',
 'ADHISPA',
 'QSTVER',
 'QSTLANG',
 '_URBNRRL',
 '_METSTAT',
 '_URBSTAT',
 'MSCO

In [8]:
# Characteristics
# 1. SEXVAR - Sex - (Male or Female)
# 2. _IMPAGE - Age - (Age 65 or older, Age 55 - 64, Age 45 - 54, Age 35 - 44, Age 25 - 34, Age 18 - 24)
# 3. _IMPRACE - Race - (White, Non-Hispanic, Hispanic, Black, Non-Hispanic, Other race, Non-Hispanic, Asian, Non-Hispanic,
# American Indian/Alaskan Native, Non-Hispanic)
# 4. VETERAN3 - Former veteran status - (Yes, No, Refused, Don't know/Not sure)
# 5. WTKG3 - Weight in KG - (Continous value)
# 6. _IMPMRTL - Marital status - (Married, Never Married, Divorced, Widowed, A member of an unmarried couple, 
# Separated)
# 7. _RFBMI5 - Overweight or Obese - (Yes, No, Don’t know/Refused/Missing)


# Health 
# 8. HLTHPLN1 - Has Healthcare Coverage - (Yes, No, Don't know/Not sure, Refused)
# 9. ADDEPEV3 - Diagnosed with depression - (Yes, No, Don't know/Not sure, Refused)
# 10. DIABETE4 - Diagnosed with diabetes - (Yes, Yes, but female told only during pregnancy, 
# No, pre-diabetes or borderline diabetes, No, Don't know/Not sure, Refused)
# 11. RMVTETH4 - Number of teeth removed - All, 6 or more, but not all, 1 to 5, None, Don't know/Not sure, Refused
# 12. _PHYS14D - Number of days physical health not well - (Zero days when physical health not good,     
# 1-13 days when physical health not good, 14+ days when physical health not good, Don’t know/Refused/Missing)                 
# 13. _MENT14D - Number of days mental health not well - Zero days when mental health not good
# 1-13 days when mental health not good, 14+ days when mental health not good, Don’t know/Refused/Missing    
# 14. _TOTINDA - Physical activity - (Had physical activity or exercise, No physical activity or exercise in last 30 days,     
# Don’t know/Refused/Missing)       
# 15. PDIABTST - User has gotten a test for high blood sugar in past 3 years - (Yes, No, Don't know/Not sure, Refused)
# 16. PREDIAB1 - Diagnosed as prediabetic - Yes, Yes, during pregnancy, Don't know/Not Sure, Refused, No
# 17. _RFHLTH - General health - (Good or Better Health, Fair or Poor Health, Don’t know/Not Sure Or Refused/Missing)
# 18. BPHIGH4 - (Told they have high blood pressure - Yes, Told borderline high or pre-hypertensive, 
# Yes, but female told only during pregnancy, Don't Know/Not Sure Refused, No) 

# Lifestyle
# 19. CHECKUP1 - Length since last checkup - (Within past year (anytime less than 12 months ago), 
# Within past 2 years (1 year but less than 2 years ago), Within past 5 years (2 years but less than 5 years ago), 
# 5 or more years ago, Don’t know/Not sure, Never, Refused)
# 20. LASTDEN4 - Last visited dentist - (Within past year (anytime less than 12 months ago), 
# Within past 2 years (1 year but less than 2 years ago), Within past 5 years (2 years but less than 5 years ago), 
# 5 or more years ago, Don’t know/Not sure, Never, Refused)
# 21. FLUSHOT7 - Whether someone has taken the flu shot - (Yes, No, Don't know/Not sure, Refused)
# 22. _RFSEAT3 - Seatbeat wearing status - (Always Wear Seat Belt, Don’t Always Wear Seat Belt
# Don’t know/Not Sure Or Refused/Missing)

# Socioeconomic status
# 23. _IMPEDUC - Education - (College 4 years or more (College graduate), 
# College 1 year to 3 years (Some college or technical school), Grade 12 or GED (High school graduate), 
# Grades 9 through 11 (Some high school), Grades 1 through 8 (Elementary), Never attended school or only kindergarten)
# Grades 9 through 11 (Some high school), Grades 1 through 8 (Elementary), Never attended school or only kindergarten)
# 24. EMPLOY1 - 
# 25. _INCOMG - Income level - ($50,000 or more, Don’t know/Not sure/Missing, $15,000 to less than $25,000,   
# $35,000 to less than $50,000, $25,000 to less than $35,000, Less than $15,000)
# 26. _METSTAT - Whether they live in a metropolitan - (1, 2)

# Tobacco, Alcohol
# 27. USENOW3 - Use of smokeless tobacco - (Not at all, Some days, Every day, Refused, Don’t know/Not Sure) 
# 28. ECIGARET - E-ciggarette usage - (Yes, No, Don't know/Not sure, Refused)
# 29. _SMOKER3 - Smoking status - (Current smoker - now smokes every day, Current smoker - now smokes some days,
# Former smoker, Never smoked, Don’t know/Refused/Missing
# 30. _RFBING5 - Binge drinking status - (Yes, No, Don’t know/Refused/Missing)                

# Columns to keep - Response variable
# 31. CVDINFR4 - Ever diagnosed with heart attack - (Yes, No, Don't know/Not sure, Refused)
# 32. CVDCRHD4 - Ever diagnosed with angina/ coronary heart disease - (Yes, No, Don't know/Not sure, Refused)

# For now we will predict heart disease
df_clean_columns = df_clean[['SEXVAR', '_IMPAGE', '_IMPRACE', 'VETERAN3', 'WTKG3', '_IMPMRTL', '_RFBMI5', 
                             'HLTHPLN1', 'ADDEPEV3', 'DIABETE4', 'RMVTETH4', '_PHYS14D', '_MENT14D', '_TOTINDA',
                             'PDIABTST', 'PREDIAB1', '_RFHLTH', 'BPHIGH4', 'CHECKUP1', 'LASTDEN4', 'FLUSHOT7', 
                             '_RFSEAT3', '_IMPEDUC', 'EMPLOY1', '_INCOMG', '_METSTAT', 'USENOW3', 'ECIGARET',
                             '_SMOKER3', '_RFBING5', 'CVDCRHD4']]

In [9]:
# View non-null count 
df_clean_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14769 entries, 0 to 14768
Data columns (total 31 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEXVAR    14769 non-null  object 
 1   _IMPAGE   14769 non-null  object 
 2   _IMPRACE  14769 non-null  object 
 3   VETERAN3  14702 non-null  object 
 4   WTKG3     12872 non-null  float64
 5   _IMPMRTL  14769 non-null  object 
 6   _RFBMI5   14769 non-null  object 
 7   HLTHPLN1  14769 non-null  object 
 8   ADDEPEV3  14769 non-null  object 
 9   DIABETE4  14769 non-null  object 
 10  RMVTETH4  14769 non-null  object 
 11  _PHYS14D  14769 non-null  object 
 12  _MENT14D  14769 non-null  object 
 13  _TOTINDA  14769 non-null  object 
 14  PDIABTST  8367 non-null   object 
 15  PREDIAB1  8367 non-null   object 
 16  _RFHLTH   14769 non-null  object 
 17  BPHIGH4   9454 non-null   object 
 18  CHECKUP1  14769 non-null  object 
 19  LASTDEN4  14769 non-null  object 
 20  FLUSHOT7  13586 non-null  ob

In [10]:
"""
# Encode categorical values 
cols = list(df_clean_columns.columns)
for col in cols:
    if str(df_clean_columns[col].dtype) == 'object':
        df_clean_columns[col] = df_clean_columns[col].astype('category').cat.codes
"""    

"\n# Encode categorical values \ncols = list(df_clean_columns.columns)\nfor col in cols:\n    if str(df_clean_columns[col].dtype) == 'object':\n        df_clean_columns[col] = df_clean_columns[col].astype('category').cat.codes\n"

In [11]:
from sklearn.model_selection import train_test_split 

# Drop all missing values
df_cleaned = df_clean_columns.dropna(axis = 0).reset_index(drop = True)

# Drop all rows that are Don't know/Not sure or Refused for column we are predicting
df_cleaned = df_cleaned.loc[(df_cleaned['CVDCRHD4'] == 'No') | (df_cleaned['CVDCRHD4'] == 'Yes')]

# Split into X and y
X = df_cleaned.loc[:, df_cleaned.columns != 'CVDCRHD4']
y = df_cleaned['CVDCRHD4']

# Split the data into training and test data set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3,random_state = 42)

In [12]:
famd = FAMD(n_components = 70, n_iter = 3, random_state = 42)
X_train_transformed = famd.fit_transform(X_train)
X_train_transformed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
3827,0.797258,-0.972836,-1.221411,0.606785,0.479787,0.077012,-0.460336,-0.222777,0.129687,-0.271141,...,-0.413219,-0.892093,0.603104,0.105377,0.785151,-0.025847,-1.018354,0.744971,0.426398,0.211848
4187,0.143767,-0.589692,-0.198415,0.688371,-0.053632,0.648238,-0.065917,0.088343,0.528163,0.019436,...,-0.391478,-0.0875,0.124432,0.275594,-0.830494,0.436545,-0.332526,0.602973,0.932549,0.199336
47,-1.601752,0.351065,-0.031086,-0.590164,-0.719433,0.068745,-0.068341,0.091693,-0.270623,0.305822,...,-0.10944,-0.001036,-0.143817,-0.008443,-0.041642,0.112156,-0.152616,0.065756,0.036209,-0.123274
3234,-1.215466,0.072693,-0.176921,-0.220023,-0.362676,0.194385,-0.180646,0.39282,0.179858,-0.627741,...,-0.454379,0.087092,-0.017005,0.851194,-0.133447,0.316281,0.950585,-0.300211,-0.076235,-0.009274
1355,-0.630776,-0.085252,0.548242,-0.652094,0.256872,-0.307713,0.229843,0.183318,-0.049219,-0.089106,...,0.183627,0.141862,0.166942,-0.129888,0.037878,0.113344,0.189862,0.15984,0.080136,-0.167398


In [13]:
RANDOM_SEED = 694

#Run dummy classifier to find the baseline performance
dummy_clf = DummyClassifier(strategy= 'most_frequent').fit(famd.fit_transform(X_train),y_train)
y_pred = dummy_clf.predict(famd.fit_transform(X_test))

In [14]:
#helper function
def get_performance_scores(y_pred, y_true):
    f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    return [f1, accuracy, precision, recall]

def print_performance_scores(scores):
    print("Accuracy Score = " + str(scores[1]))
    print("Precision Score = " + str(scores[2]))
    print("Recall Score = " + str(scores[3]))
    print("F1 Score = " + str(scores[0]))

In [15]:
#Find the performance of the model
dummy_f1 = f1_score(y_test, y_pred, average='macro')
dummy_accuracy = accuracy_score(y_test, y_pred)
dummy_precision = precision_score(y_test, y_pred, average='macro')
dummy_recall = recall_score(y_test, y_pred, average='macro')
print("Accuracy Score = " + str(dummy_accuracy))
print("Precision Score = " + str(dummy_precision))
print("Recall Score = " + str(dummy_recall))
print("F1 Score = " + str(dummy_f1))

Accuracy Score = 0.9543707973102786
Precision Score = 0.4771853986551393
Recall Score = 0.5
F1 Score = 0.4883263701155075


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
clf_lr = LogisticRegression(random_state = RANDOM_SEED).fit(X_train_transformed, y_train)

In [17]:
X_test_transformed = famd.fit_transform(X_test)
train_preds = clf_lr.predict(X_test_transformed)

In [18]:
train_preds

array(['No', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)

In [19]:
lr_f1 = f1_score(y_test, train_preds, average='macro')
lr_accuracy = accuracy_score(y_test, train_preds)
lr_precision = precision_score(y_test, train_preds, average='macro')
lr_recall = recall_score(y_test, train_preds, average='macro')
print("Accuracy Score = " + str(lr_accuracy))
print("Precision Score = " + str(lr_precision))
print("Recall Score = " + str(lr_recall))
print("F1 Score = " + str(lr_f1))

Accuracy Score = 0.9529298751200769
Precision Score = 0.6967502164410753
Recall Score = 0.5493603157364977
F1 Score = 0.5726350360682658


In [20]:
#Do grid search for hyperparameter tuning
clf = LogisticRegression(random_state = RANDOM_SEED, solver='liblinear')
grid_values = {'penalty': ['l1', 'l2'],'C':[0.001,.009,0.01,.09,1,5,10,25]}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'f1_macro')
my_list = list(range(100))
for x in tqdm(my_list):
    grid_clf_acc.fit(X_train_transformed, y_train)

y_pred_acc = grid_clf_acc.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred_acc, y_test))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x in tqdm(my_list):


  0%|          | 0/100 [00:00<?, ?it/s]

Accuracy Score = 0.94716618635927
Precision Score = 0.6153777896091219
Recall Score = 0.5413291658941012
F1 Score = 0.556685146184341


In [21]:
from sklearn.metrics import confusion_matrix,roc_curve, roc_auc_score

# Best scores

w = {'No' : 0.04, 'Yes' : 0.96}
clf_lr = LogisticRegression(penalty = 'l2', random_state = RANDOM_SEED, class_weight = w, solver = 'liblinear', C = 0.75).fit(X_train_transformed, y_train)
X_train_transformed = famd.fit_transform(X_train)
X_test_transformed = famd.fit_transform(X_test)
train_preds = (clf_lr.predict_proba(X_test_transformed)[:,1] >= 0.85).astype(int)
train_preds = pd.DataFrame(train_preds, columns = ['val'])
train_preds = train_preds['val'].replace(to_replace = [0, 1], value = ['No', 'Yes'])

lr_f1 = f1_score(y_test, train_preds, average = 'macro')
lr_accuracy = accuracy_score(y_test, train_preds)
lr_precision = precision_score(y_test, train_preds, average = 'macro')
lr_recall = recall_score(y_test, train_preds, average = 'macro')
print("Accuracy Score = " + str(lr_accuracy))
print("Precision Score = " + str(lr_precision))
print("Recall Score = " + str(lr_recall))
print("F1 Score = " + str(lr_f1))
print(f'Confusion Matrix: \n{confusion_matrix(y_test, train_preds)}')

Accuracy Score = 0.9303554274735831
Precision Score = 0.5778109756097561
Recall Score = 0.5676025746298308
F1 Score = 0.5722113819064502
Confusion Matrix: 
[[1921   66]
 [  79   16]]


In [22]:
from imblearn.over_sampling import SMOTENC

os = SMOTENC(categorical_features = [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29], random_state = 0)
os_data_X , os_data_y = os.fit_resample(X_train, y_train)
famd = FAMD(n_components = 107, n_iter = 3, random_state = 42)
X_train_os = famd.fit_transform(os_data_X)
clf_lr = LogisticRegression(random_state = RANDOM_SEED, solver = 'liblinear').fit(X_train_os, os_data_y.values.ravel())
X_test_transformed = famd.fit_transform(X_test)
train_preds = (clf_lr.predict_proba(X_test_transformed)[:,1] >= 0.8).astype(int)
train_preds = pd.DataFrame(train_preds, columns = ['val'])
train_preds = train_preds['val'].replace(to_replace = [0, 1], value = ['No', 'Yes'])

lr_f1 = f1_score(y_test, train_preds, average='macro')
lr_accuracy = accuracy_score(y_test, train_preds)
lr_precision = precision_score(y_test, train_preds, average='macro')
lr_recall = recall_score(y_test, train_preds, average='macro')
print("Accuracy Score = " + str(lr_accuracy))
print("Precision Score = " + str(lr_precision))
print("Recall Score = " + str(lr_recall))
print("F1 Score = " + str(lr_f1))
print(f'Confusion Matrix: \n{confusion_matrix(y_test, train_preds)}')

Accuracy Score = 0.6455331412103746
Precision Score = 0.46914141386942687
Recall Score = 0.3482213334039679
F1 Score = 0.39486374040810923
Confusion Matrix: 
[[1342  645]
 [  93    2]]


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf_clf = RandomForestClassifier(random_state=RANDOM_SEED).fit(X_train_transformed, y_train)

grid_values = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
 }

rand_search_clf = RandomizedSearchCV(estimator = rf_clf, param_distributions = grid_values, n_iter = 1, cv = 3, verbose=2, random_state=RANDOM_SEED, n_jobs = -1)

my_list = list(range(100))
for x in tqdm(my_list):
    rand_search_clf.fit(X_train_transformed[0:10000], y_train[0:10000])


y_pred_rf = rand_search_clf.predict(X_test_transformed)
   


print_performance_scores(get_performance_scores(y_pred_rf, y_test))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x in tqdm(my_list):


  0%|          | 0/100 [00:00<?, ?it/s]

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

KeyboardInterrupt: 

In [None]:
random_forest = RandomForestClassifier(n_estimators = 100, random_state = RANDOM_SEED )
random_forest.fit(X_train_transformed, y_train)

y_pred = random_forest.predict(X_test_transformed)

print_performance_scores(get_performance_scores(y_pred, y_test))

Accuracy Score = 0.9538904899135446
Precision Score = 0.4771744353676117
Recall Score = 0.49974836436839454
F1 Score = 0.4882005899705014
