In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn import tree
from sklearn.metrics import confusion_matrix 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import r2_score
from rfpimp import permutation_importances
from sklearn.impute import KNNImputer
import missingpy 



In [None]:
# Load the survey export with every column read as text. Malformed rows are
# skipped instead of aborting the load.
_READ_CSV_OPTIONS = dict(sep=',', index_col=False, dtype='unicode')
try:
    # `on_bad_lines` replaces `error_bad_lines`, which newer pandas no longer accepts.
    df = pd.read_csv("COVID_data.csv", on_bad_lines='skip', **_READ_CSV_OPTIONS)
except TypeError:
    # Older pandas does not know `on_bad_lines`; fall back to the legacy keyword.
    df = pd.read_csv("COVID_data.csv", error_bad_lines=False, **_READ_CSV_OPTIONS)

In [None]:
class Questions:
    """Survey answers of one respondent with the demographic/metadata fields removed."""

    # Labels that describe the respondent rather than a survey answer.
    # NOTE(review): 'H180VS' may be a misspelling of the real column name —
    # confirm against the CSV header. Absent labels are skipped, not an error.
    _NON_QUESTION_FIELDS = [
        'SU_ID', 'P_PANEL', 'P_GEO', 'AGE4', 'AGE7', 'GENDER', 'RACETH',
        'RACE_R2', 'HHINCOME', 'EDUCATION', 'EDUC4', 'P_OCCUPY', 'MARITAL',
        'LGBT', 'HHSIZE1', 'HH01S', 'HH25S', 'HH612S', 'HH1317S', 'H180VS',
        'REGION4', 'REGION9', 'P_DENSE', 'MODE', 'LANGUAGE', 'MAIL50',
        'RACE1_BANNER', 'RACE2_BANNER', 'INC_BANNER', 'AGE_BANNER', 'HH_BANNER',
    ]

    def __init__(self, row):
        """Store in ``self.q`` a copy of ``row`` without the non-question fields.

        Parameters
        ----------
        row : pandas.Series or pandas.DataFrame
            One respondent (Series indexed by column name) or a frame of
            respondents. The input itself is not modified.
        """
        if getattr(row, 'ndim', 1) == 2:
            self.q = row.drop(columns=self._NON_QUESTION_FIELDS, errors='ignore')
        else:
            # For a Series, `drop(columns=...)` is documented as making no
            # change, so the fields have to be dropped by index label.
            self.q = row.drop(labels=self._NON_QUESTION_FIELDS, errors='ignore')


In [None]:
class Patient:
    """One survey respondent: selected demographic fields plus a Questions wrapper."""

    # (attribute set on the instance, label it is read from in the row)
    _FIELD_LABELS = (
        ('name', 'SU_ID'),
        ('age', 'AGE4'),
        ('race', 'RACE1_BANNER'),
        ('gender', 'GENDER'),
        ('income', 'HHINCOME'),
        ('edu', 'EDUCATION'),
        ('loc', 'P_GEO'),
        ('size', 'HH_BANNER'),  # size is household size
    )

    def __init__(self, row):
        """Copy the demographic fields out of ``row`` and wrap it in ``Questions``.

        ``row`` must support lookup by the labels listed in ``_FIELD_LABELS``.
        """
        for attribute, label in self._FIELD_LABELS:
            setattr(self, attribute, row[label])
        self.q = Questions(row)

# One Patient per row of df, kept in row order.
p = [Patient(df.iloc[position]) for position in range(len(df))]

In [None]:
# Recode the two panel labels as 1/0 (the replacement is applied to every
# column of the frame) and drop RACE_R2. `errors='ignore'` keeps the cell
# re-runnable: on a second run RACE_R2 is already gone and a plain drop would
# raise KeyError.
df = (
    df.replace('ABS', 1)
      .replace('Amerispeak', 0)
      .drop(columns=['RACE_R2'], errors='ignore')
)

In [None]:
def _strip_parenthesised_code(value):
    """Return the text between a leading '(' and the first ')', else ``value`` unchanged."""
    if isinstance(value, str) and value.startswith('('):
        closing = value.find(')')
        if closing != -1:
            return value[1:closing]
    return value


def get_number(df):
    """Replace answers of the form ``"(code) label"`` with just ``"code"``.

    Every column except the first two is processed. Non-string values (for
    example NaN) and strings that do not start with ``'('`` are left as they
    are. A string that starts with ``'('`` but has no ``')'`` is also left
    untouched instead of raising ``ValueError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    for column in df.columns[2:]:
        # One pass per column instead of one whole-column replace per cell.
        df[column] = df[column].map(_strip_parenthesised_code)
    return df

In [None]:
# Clean df in place; preview the first rows rather than rendering the whole frame.
get_number(df).head()

In [None]:
## Recode income-bracket labels in a column as ordinal code strings ('1' to '12')
# (label prefix, replacement code) pairs, checked in this order.
_INCOME_PREFIX_CODES = (
    ('Under', '1'),
    ('$10,000', '2'),
    ('$20,000', '3'),
    ('$30,000', '4'),
    ('$40,000', '5'),
    ('$50,000', '6'),
    ('$75,000', '7'),
    ('$100,000', '8'),
    ('$150,000', '9'),
    ('DO', '10'),
    ('SKIPPED', '11'),
    ('REFUSED', '12'),
)


def _income_code(value):
    """Return the code for the first matching prefix, else ``value`` unchanged."""
    if isinstance(value, str):
        for prefix, code in _INCOME_PREFIX_CODES:
            if value.startswith(prefix):
                return code
    return value


def change_str_to_int(df, column):
    """Recode income-bracket labels in ``df[column]`` as code strings.

    A value is replaced by the code of the first prefix it starts with
    (``'Under'`` -> ``'1'`` ... ``'REFUSED'`` -> ``'12'``). Despite the
    function name the codes are strings, as before. Non-string values such as
    NaN, and strings matching no prefix, are kept unchanged; non-strings used
    to raise ``AttributeError`` on ``.startswith``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : label
        Name of the column to recode.

    Returns
    -------
    pandas.Series
        The recoded ``df[column]``.
    """
    # One pass over the column instead of one whole-column replace per row.
    df[column] = df[column].map(_income_code)
    return df[column]

In [None]:
# Recode HHINCOME in place; preview the first few recoded values only.
change_str_to_int(df, 'HHINCOME').head()

In [None]:
## Fix missing data issue
# NOTE(review): as written this loop never changes df. np.random.randint(0, 100)
# has an exclusive upper bound, so num is at most 99 and `num > 100` is never
# true. The loop's only effect is to advance NumPy's global random state.
for i in range(len(df)):
    num = np.random.randint(0,100)
    if num > 100:
        # NOTE(review): unreachable. If it were reached, df[i] would look up a
        # *column* labelled i (not row i) and then assign through chained
        # indexing. Presumably meant to blank out random cells — confirm the
        # intent before making this branch reachable.
        df[i][np.random.randint(0,4)] = float("NaN")

# Prints the frame's text repr.
print(df)


In [None]:
# Impute missing values with MissForest. The seed is fixed so a fresh
# "Restart & Run All" reproduces the same imputed values.
imputer = missingpy.MissForest(random_state=0)
all_data = imputer.fit_transform(df)

### Print the imputed array to eyeball that no nan values remain
print(all_data)

In [None]:
## Wrap the imputed numpy array back into a DataFrame that reuses the
## row index and column labels of the frame it was computed from.
df = pd.DataFrame(all_data, columns=df.columns, index=df.index)

## Decision Tree Code

In [None]:
# Function to split the dataset 
def splitdataset(df, illness='SOC5A', test_size=0.2, random_state=100):
    """Build the feature matrix and target vector and split them into train/test.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully numeric frame containing the SOC5A-SOC5E columns.
    illness : str, default 'SOC5A'
        Column used as the target. Per the original notes these columns hold
        the mental-health responses (SOC5A being anxiety) — verify against the
        survey codebook.
    test_size : float, default 0.2
        Fraction of rows held out for testing (80:20 split by default).
    random_state : int, default 100
        Seed for the pseudo-random shuffling done by ``train_test_split``.

    Returns
    -------
    tuple
        ``(X, Y, X_train, X_test, y_train, y_test)`` as numpy arrays.
    """
    outcome_columns = ['SOC5A', 'SOC5B', 'SOC5C', 'SOC5D', 'SOC5E']

    # X holds every column except the outcome columns. If the caller picks a
    # target outside that list, exclude it as well so the label never leaks
    # into the features.
    excluded = list(outcome_columns)
    if illness not in excluded:
        excluded.append(illness)

    X = df.drop(columns=excluded).values
    Y = df[illness].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    return X, Y, X_train, X_test, y_train, y_test

In [None]:
# Split once using the default target column.
X, y, X_train, X_test, y_train, y_test = splitdataset(df)
# Shallow tree (depth capped at 2, fixed seed) fitted on the complete,
# unsplit data; `clf` is the tree that the plotting cell draws.
clf = DecisionTreeClassifier(max_depth=2, random_state=1234)
model = clf.fit(X=X, y=y)

In [None]:
# Draw the shallow tree. Feature names are the columns that remain after the
# SOC5 columns are removed, i.e. the columns X was built from.
# They are passed as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(clf,
                   feature_names=list(df.columns.drop(['SOC5A', 'SOC5B', 'SOC5C', 'SOC5D', 'SOC5E'])),
                   filled=True)

In [None]:
# Save the tree figure next to the notebook.
# NOTE(review): the file name is spelled "decistion"; left unchanged in case
# something else reads that exact path — confirm before renaming.
fig.savefig("decistion_tree.png")

In [None]:
# Unconstrained tree trained on the training split only. The seed is fixed,
# like the other estimators in this notebook, so reruns give the same tree.
classifier = DecisionTreeClassifier(random_state=100)
classifier.fit(X_train, y_train) # trains algorithm on training data

In [None]:
# Predict labels for the held-out test split with the tree fitted above.
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix first, then the per-class precision/recall/F1 report.
matrix = confusion_matrix(y_test, y_pred)
print(matrix)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Function to perform training with giniIndex. 
def train_using_gini(X_train, X_test, y_train):
    """Fit and return a depth-3 decision tree that splits on Gini impurity.

    ``X_test`` is accepted for signature compatibility but is not used.
    """
    gini_tree = DecisionTreeClassifier(
        criterion="gini",
        random_state=100,
        max_depth=3,
        min_samples_leaf=5,
    )
    # fit() returns the estimator itself, so this hands back the trained tree.
    return gini_tree.fit(X_train, y_train)

In [None]:
# Function to perform training with entropy. 
def train_using_entropy(X_train, X_test, y_train):
    """Fit and return a depth-3 decision tree that splits on entropy.

    ``X_test`` is accepted for signature compatibility but is not used.
    """
    entropy_tree = DecisionTreeClassifier(
        criterion="entropy",
        random_state=100,
        max_depth=3,
        min_samples_leaf=5,
    )
    # fit() returns the estimator itself, so this hands back the trained tree.
    return entropy_tree.fit(X_train, y_train)

In [None]:
# Function to make predictions 
def prediction(X_test, clf_object):
    """Predict with ``clf_object`` on ``X_test``, print the result and return it.

    ``clf_object`` may be any fitted estimator exposing ``predict``.
    """
    predicted = clf_object.predict(X_test)
    # Heading and predictions each end up on their own line.
    print("Predicted values:", predicted, sep="\n")
    return predicted

In [None]:
# Function to calculate accuracy 
def cal_accuracy(y_test, y_pred):
    """Print the confusion matrix, accuracy (as a percentage) and class report."""
    # Each metric is computed right before it is printed, in this order.
    sections = (
        ("Confusion Matrix: ", lambda: confusion_matrix(y_test, y_pred)),
        ("Accuracy : ", lambda: accuracy_score(y_test, y_pred) * 100),
        ("Report : ", lambda: classification_report(y_test, y_pred)),
    )
    for heading, compute in sections:
        print(heading, compute())

In [None]:
# Building Phase 
# `data` is just another name for df (no copy is made).
data = df
# Re-split with the default target. This rebinds X, X_train, X_test, y_train
# and y_test, which later cells read. The full target is bound to uppercase
# `Y` here; lowercase `y` still comes from the earlier split cell.
X, Y, X_train, X_test, y_train, y_test = splitdataset(data) 
# Two depth-3 trees that differ only in the split criterion.
clf_gini = train_using_gini(X_train, X_test, y_train) 
clf_entropy = train_using_entropy(X_train, X_test, y_train) 

# Operational Phase 
print("Results Using Gini Index:") 

# Prediction using gini 
y_pred_gini = prediction(X_test, clf_gini) 
cal_accuracy(y_test, y_pred_gini) 

print("Results Using Entropy:") 
# Prediction using entropy 
y_pred_entropy = prediction(X_test, clf_entropy) 
cal_accuracy(y_test, y_pred_entropy) 


## Random Forests Code

In [None]:
# Feature Scaling (Normalizing Data)

# Learn per-feature mean/variance from the training split only, then apply
# that same transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Extra-trees ensemble fitted on the full feature matrix to rank features.
forest = ExtraTreesClassifier(random_state=0, n_estimators=64)
forest.fit(X, y)

importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.array(
    [estimator.feature_importances_ for estimator in forest.estimators_]
).std(axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

In [None]:
print("Feature ranking:")

# Rank (1-based), feature position in X, and its importance score.
for rank, feature_position in zip(range(1, X.shape[1] + 1), indices):
    print("%d. feature %d (%f)" % (rank, feature_position, importances[feature_position]))

In [None]:
# Bar chart of every feature's importance (most important first), with the
# across-tree standard deviation as error bars.
feature_count = X.shape[1]
bar_positions = range(feature_count)

plt.figure()
axes = plt.gca()
axes.set_title("Feature Importances")
axes.bar(bar_positions, importances[indices],
         color="b", yerr=std[indices], align="center")
axes.set_xticks(bar_positions)
axes.set_xticklabels(indices)
axes.set_xlim([-1, feature_count])
plt.show()

In [None]:
# Bar chart of the most important features only (at most 20).
plt.figure(figsize=(18,9))
plt.title("Feature Importances")
n = min(20, len(indices))  # never ask for more bars than there are features
top_indices = indices[:n]
_ = plt.bar(range(n), importances[top_indices], color="b", yerr=std[top_indices])
# Label only the plotted bars: passing all of `indices` gave n tick positions
# but one label per feature, a length mismatch that Matplotlib can reject.
plt.xticks(range(n), top_indices)
plt.xlim([-1, n])
plt.show()

In [None]:
## Instantiate the model. The seed is fixed, like the other estimators in
## this notebook, so reruns give the same forest.
rf = RandomForestClassifier(n_estimators=64, random_state=0)
## Fit the model on the training data.
rf.fit(X_train, y_train)
## Predict labels for the test data (scoring happens in the next cell).
y_pred=rf.predict(X_test)

In [None]:
#Model Accuracy, how often is the classifier correct?

# Fraction of test rows whose predicted label matches the true label.
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Wrap the training arrays in DataFrames for the permutation-importance call below.
X_train, y_train = pd.DataFrame(X_train), pd.DataFrame(y_train)

In [None]:
def r2(rf, X_train, y_train):
    """Return the R^2 score of ``rf``'s predictions on ``X_train`` against ``y_train``."""
    predictions = rf.predict(X_train)
    return r2_score(y_train, predictions)

# Permutation importances for the fitted forest on the training data, using
# the r2 helper above as the metric.
perm_imp_rfpimp = permutation_importances(rf, X_train, y_train, r2)

In [None]:
# Lift the row limit so the complete importance table is rendered.
pd.options.display.max_rows = None
perm_imp_rfpimp