In [1]:
import os

# NOTE(review): this binds the top-level matplotlib package to `plt`; the usual
# plotting alias is `import matplotlib.pyplot as plt` — confirm which is intended.
import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as skm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder,MinMaxScaler

In [2]:
def group_pivot(labelgroup, yvalue, dataset, SZOnly = False):
    """Tabulate the share of each `yvalue` level within every `labelgroup` group.

    Parameters
    ----------
    labelgroup : str
        Column whose distinct values become the rows of the result.
    yvalue : str
        Column whose distinct values become the columns of the result.
    dataset : pandas.DataFrame
        Frame holding both columns (and `Diagnosis` when `SZOnly` is set).
    SZOnly : bool, default False
        When true, only rows with ``Diagnosis == 0`` are tabulated.

    Returns
    -------
    pandas.DataFrame
        Proportions in the range 0-1, normalised within each row group.
        Combinations that never occur are NaN.
    """
    if SZOnly:
        dataset = dataset.loc[dataset.Diagnosis == 0]

    # Renaming the counts Series avoids a name clash with `yvalue` on reset_index.
    grouped = (
        dataset.groupby([labelgroup])[yvalue]
        .value_counts(normalize=True)
        .rename('percentage')
        .reset_index()
    )
    # The keyword is `aggfunc`; the earlier `aggfun` spelling made pivot_table
    # raise TypeError on every call.
    return pd.pivot_table(grouped, index=labelgroup, columns=yvalue,
                          values='percentage', aggfunc='sum')

In [3]:
# Location of the MS4S16 CSV. Set the MS4S16_DATA_PATH environment variable to
# run the notebook on another machine; the default is the original author's path.
DATA_PATH = os.environ.get(
    'MS4S16_DATA_PATH',
    '/Users/dahaixing/Documents/Coursework/DeepLearning/MLCW/MS4S16_Dataset.csv',
)
data = pd.read_csv(DATA_PATH)

In [4]:
# Column names, dtypes and non-null counts of the frame as loaded from disk.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Diagnosis           5000 non-null   int64  
 1   Anhedonia           5000 non-null   float64
 2   Apathy              5000 non-null   float64
 3   Appetite            5000 non-null   float64
 4   Concentration       5000 non-null   float64
 5   Content             5000 non-null   float64
 6   Delay               5000 non-null   object 
 7   Delusion            5000 non-null   float64
 8   Dep_Mood            5000 non-null   float64
 9   Focus               5000 non-null   float64
 10  Hallucination       5000 non-null   float64
 11  Housing             5000 non-null   object 
 12  Intrusive_Thoughts  4170 non-null   float64
 13  Participant         5000 non-null   int64  
 14  Passive             5000 non-null   float64
 15  Pregnant            2762 non-null   float64
 16  Psycho

In [5]:
def data_impu(data):
    """Impute missing and infinite numeric values with each column's own mean.

    For every numeric column, +inf and -inf are treated as missing, the mean
    of the remaining values is computed, and all missing entries are replaced
    by that mean. Non-numeric columns are left untouched. A column with no
    finite values at all has no mean and therefore stays NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place, as in the previous version.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, after imputation.
    """
    numeric_cols = data.select_dtypes(include='number').columns
    if len(numeric_cols) == 0:
        return data

    # Turn infinities into NaN first so they neither distort the column mean
    # nor survive the fill.
    finite = data[numeric_cols].replace([np.inf, -np.inf], np.nan)
    # Each column is filled from its own mean; the earlier code filled every
    # column from the mean of `Tired` and relied on the removed `pd.np` alias.
    data[numeric_cols] = finite.fillna(finite.mean())
    return data

def data_remove(data):
    """Mean-fill missing numeric values, then drop rows that are still unusable.

    Steps:
      1. NaN in numeric columns is replaced by the column mean.
      2. +inf / -inf are converted to NaN.
      3. Every row that still contains a NaN is dropped.

    A column containing an infinity has a non-finite mean, so its missing
    entries are not rescued by step 1 and those rows are removed in step 3.
    Rows with NaN in a non-numeric column are removed as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows, with their original index labels.
    """
    # numeric_only=True restricts the means to numeric columns. Without it,
    # pandas warns (older versions) or raises (2.x) on the object columns.
    filled = data.fillna(data.mean(numeric_only=True))
    return filled.replace([np.inf, -np.inf], np.nan).dropna()
    

In [6]:
# Rebinds `data` to the frame returned by data_remove, so the frame as loaded
# is no longer reachable under this name once the cell has run.
data = data_remove(data)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4933 entries, 0 to 4999
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Diagnosis           4933 non-null   int64  
 1   Anhedonia           4933 non-null   float64
 2   Apathy              4933 non-null   float64
 3   Appetite            4933 non-null   float64
 4   Concentration       4933 non-null   float64
 5   Content             4933 non-null   float64
 6   Delay               4933 non-null   object 
 7   Delusion            4933 non-null   float64
 8   Dep_Mood            4933 non-null   float64
 9   Focus               4933 non-null   float64
 10  Hallucination       4933 non-null   float64
 11  Housing             4933 non-null   object 
 12  Intrusive_Thoughts  4933 non-null   float64
 13  Participant         4933 non-null   int64  
 14  Passive             4933 non-null   float64
 15  Pregnant            4933 non-null   float64
 16  Psycho

  data.fillna(data.mean(), inplace=True)


In [7]:
# Summary statistics of the cleaned frame (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,Diagnosis,Anhedonia,Apathy,Appetite,Concentration,Content,Delusion,Dep_Mood,Focus,Hallucination,...,Pregnant,Psychomotor,Rumination,Sleep,Stress,Suspicious,Tension,Tired,Unusual_Thought,Withdrawal
count,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,...,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0,4933.0
mean,0.504561,6.503788,2.474579,27.055055,6.516107,0.279476,2.634954,5.704499,6.516107,64.269048,...,0.101669,4.680909,5.688516,7.010081,4.915619,2.754035,4.921434,5.515099,2.478906,3.958419
std,0.50003,1.4893,1.730684,14.175538,1.477503,0.833005,1.439073,3.297262,1.477503,219.312738,...,0.224995,1.483437,2.1648,1.411643,2.221614,0.968984,1.962481,1.490296,1.410873,1.468565
min,0.0,1.098854,-3.211011,0.141074,1.299964,0.000187,-2.127037,0.0,1.299964,0.02735,...,0.0,-0.024974,-0.409032,2.144726,-3.257788,-2.346238,-2.183456,0.36665,-1.981307,-0.825919
25%,0.0,5.49198,1.262008,16.674792,5.524455,0.018579,1.626144,4.63551,5.524455,4.104303,...,0.0,3.694768,4.045282,6.057429,3.438019,2.754909,3.565728,4.474606,1.483585,2.969657
50%,1.0,6.485558,2.427466,25.13785,6.491856,0.064289,2.556736,6.744049,6.491856,12.731402,...,0.101376,4.718395,5.524415,6.979529,5.09786,2.754909,5.254071,5.484966,2.390202,3.960479
75%,1.0,7.489204,3.633896,35.46484,7.517627,0.216074,3.587206,8.042209,7.517627,41.776438,...,0.101376,5.682734,7.278322,7.972515,6.535836,2.754909,6.384132,6.523405,3.426193,4.977593
max,1.0,11.60314,8.803433,98.888708,11.649649,21.001327,8.978785,12.00355,11.649649,6287.163151,...,1.0,10.17154,12.009666,11.920312,11.970952,8.212275,9.622076,11.454125,8.066822,9.022207


In [8]:
# Model inputs: 26 column names in alphabetical order. The target column
# `Diagnosis` is deliberately left out.
# NOTE(review): `Participant` looks like an identifier rather than a measurement —
# confirm it belongs among the predictors.
features = [
    'Anhedonia', 'Apathy', 'Appetite', 'Concentration', 'Content',
    'Delay', 'Delusion', 'Dep_Mood', 'Focus', 'Hallucination',
    'Housing', 'Intrusive_Thoughts', 'Participant', 'Passive', 'Pregnant',
    'Psychomotor', 'Race', 'Rumination', 'Sex', 'Sleep',
    'Stress', 'Suspicious', 'Tension', 'Tired', 'Unusual_Thought',
    'Withdrawal',
]

In [9]:
# Split into training and testing sets: 30% held out, fixed seed so the split
# is the same on every run.
y = data['Diagnosis']
X = data[features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
# Keep an independent copy of the un-encoded test rows. The fairness functions
# read the raw `Race` / `Sex` columns from it, so it must not share state with
# `X_test`, which is transformed later.
X_test_save = X_test.copy()


In [10]:
# Summary statistics of the training features, for comparison with the full frame.
X_train.describe()

Unnamed: 0,Anhedonia,Apathy,Appetite,Concentration,Content,Delusion,Dep_Mood,Focus,Hallucination,Intrusive_Thoughts,...,Pregnant,Psychomotor,Rumination,Sleep,Stress,Suspicious,Tension,Tired,Unusual_Thought,Withdrawal
count,3453.0,3453.0,3453.0,3453.0,3453.0,3453.0,3453.0,3453.0,3453.0,3453.0,...,3453.0,3453.0,3453.0,3453.0,3453.0,3453.0,3453.0,3453.0,3453.0,3453.0
mean,6.508159,2.47385,27.152071,6.520994,0.272758,2.622172,5.694293,6.520994,61.938314,5.698519,...,0.103251,4.668453,5.681644,6.993207,4.928099,2.759801,4.944887,5.517175,2.472389,3.965059
std,1.500864,1.720799,14.286171,1.479086,0.803836,1.435486,3.281672,1.479086,201.431537,2.194117,...,0.227751,1.469691,2.173074,1.433704,2.221059,0.961567,1.962865,1.491496,1.40589,1.475909
min,1.098854,-3.060709,0.141074,1.299964,0.000187,-2.127037,0.0,1.299964,0.02735,-1.386416,...,0.0,-0.024974,-0.409032,2.144726,-2.203737,-2.346238,-2.183456,0.36665,-1.981307,-0.825919
25%,5.476002,1.262008,16.898211,5.523947,0.018458,1.616982,4.63551,5.523947,4.099454,4.251173,...,0.0,3.683988,4.020519,6.038201,3.428192,2.754909,3.585668,4.492153,1.486916,2.989708
50%,6.501239,2.423307,25.373618,6.495675,0.063263,2.549796,6.736789,6.495675,12.598337,5.702358,...,0.101376,4.70862,5.518575,6.951166,5.123498,2.754909,5.275865,5.472242,2.38198,3.951111
75%,7.49783,3.633896,35.485982,7.546283,0.210712,3.5751,8.021104,7.546283,40.627731,6.993309,...,0.101376,5.673982,7.290906,7.969119,6.542947,2.754909,6.402672,6.533133,3.408044,4.990135
max,11.60314,8.803433,98.888708,10.889753,21.001327,8.978785,12.00355,10.889753,4445.164808,13.209009,...,1.0,9.824838,12.009666,11.161153,11.219529,8.212275,9.622076,11.454125,8.066822,9.022207


In [11]:

categories = ['Sex','Race','Housing','Delay']

def onehot(data, categories = categories):
    """One-hot encode the given categorical columns of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in `categories`.
    categories : list of str
        Columns to encode. Defaults to the notebook-level `categories` list.

    Returns
    -------
    pandas.DataFrame
        One indicator column per (column, level) pair, named
        ``'<column>-<level>'``, indexed like `data` so that
        ``data.join(onehot(data))`` lines the rows up correctly.

    Notes
    -----
    The encoder is fitted on the frame passed in. Calling this separately on
    two frames can therefore produce different column sets.
    """
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(data[categories])
    # str() keeps the naming working for non-string levels (e.g. numeric codes).
    columns = [
        name + '-' + str(level)
        for name, levels in zip(categories, encoder.categories_)
        for level in levels
    ]
    # Reuse the caller's index. A fresh RangeIndex would misalign the result
    # when joined back onto a shuffled train/test split.
    return pd.DataFrame(encoded.toarray(), columns = columns, index = data.index)




    


In [12]:
# One-hot encode the categorical columns of the training features.
X_train = pd.get_dummies(X_train, columns = categories)
# get_dummies only emits the levels present in the frame it is given, so the
# test encoding is aligned to the training layout. Levels missing from the test
# split become all-zero columns, levels unseen in training are dropped, and the
# column order matches what the model is fitted on.
X_test = pd.get_dummies(X_test, columns = categories).reindex(columns = X_train.columns, fill_value = 0)
X_train.head()

Unnamed: 0,Anhedonia,Apathy,Appetite,Concentration,Content,Delusion,Dep_Mood,Focus,Hallucination,Intrusive_Thoughts,...,Sex_Female,Sex_Male,Race_Asian,Race_Black,Race_Hispanic,Race_White,Housing_Stable,Housing_Unstable,Delay_No,Delay_Yes
3652,7.341979,0.980252,89.743512,6.770271,0.032452,1.132862,9.250239,6.770271,4.392373,5.702358,...,1,0,0,0,0,1,1,0,1,0
2340,4.900697,0.783923,10.580918,6.71669,0.025613,2.854473,0.0,6.71669,57.107922,6.219899,...,1,0,0,1,0,0,1,0,0,1
2517,8.973571,3.46596,14.649513,6.203595,0.450651,1.751069,8.251145,6.203595,29.377272,5.702358,...,1,0,0,0,1,0,1,0,0,1
4214,6.622166,0.920721,48.029752,5.805059,0.178341,0.719298,8.154978,5.805059,2.621504,7.00255,...,1,0,0,1,0,0,1,0,0,1
2444,7.273105,2.454355,43.48695,8.156776,0.522155,3.51547,4.818326,8.156776,25.627262,4.061889,...,0,1,0,0,0,1,1,0,0,1


In [13]:
# Training: logistic regression with an elastic-net penalty; l1_ratio=1 puts all
# of the penalty weight on the L1 term.
# random_state pins the data shuffling done by the stochastic 'saga' solver so
# that the fitted coefficients, and every metric below, are repeatable.
# NOTE(review): the features are passed in unscaled although MinMaxScaler is
# imported; 'saga' converges best on similarly scaled features — confirm
# whether scaling was meant to be applied.
model = LogisticRegression(penalty='elasticnet', max_iter= 1000, solver= 'saga', l1_ratio=1, random_state=0)

model.fit(X_train, y_train)

test_predict = model.predict(X_test)



In [14]:
# Accuracy of the predictions on the held-out test split.
print('test accuracy:', skm.accuracy_score(y_test, test_predict))

test accuracy: 0.9


In [15]:
# NOTE(review): the same function is reachable as skm.classification_report via
# the import at the top of the notebook; this mid-notebook import could move there.
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test split, listing class 1 before class 0.
print(classification_report(y_test, test_predict, labels=[1, 0]))

              precision    recall  f1-score   support

           1       0.91      0.89      0.90       744
           0       0.89      0.91      0.90       736

    accuracy                           0.90      1480
   macro avg       0.90      0.90      0.90      1480
weighted avg       0.90      0.90      0.90      1480



In [16]:
from sklearn.metrics import confusion_matrix
# Keep the result under its own name: assigning it to `confusion_matrix` would
# replace the imported function with an array and break any later call.
# Rows are true labels and columns are predicted labels, both ordered [1, 0].
conf_mat = confusion_matrix(y_test, test_predict, labels=[1, 0])
print(conf_mat)

[[661  83]
 [ 65 671]]


In [17]:
import fairlearn
from fairlearn.metrics import MetricFrame

In [18]:
def race_fp(truelabels, predictions, reference = 'White'):
  """False-positive rate per race group, with each rate's ratio to a reference group.

  Group membership is read from the notebook-level `X_test_save.Race`, so
  `truelabels` and `predictions` are expected to describe the rows of
  `X_test_save`.

  Parameters
  ----------
  truelabels : array-like
      Ground-truth labels.
  predictions : array-like
      Predicted labels.
  reference : str, default 'White'
      Race group whose rate is the denominator of the parity row.

  Returns
  -------
  pandas.DataFrame
      Two rows, 'FPR' and 'FPR Parity', with one column per race group. Parity
      is not finite when the reference group's rate is zero.
  """
  sensitive = X_test_save.Race
  fmetrics = MetricFrame(metrics= fairlearn.metrics.false_positive_rate,
                         y_true=truelabels,
                         y_pred=predictions,
                         sensitive_features=sensitive)
  rates = fmetrics.by_group
  # .loc looks the group up by label, which also works for names that are not
  # valid Python attribute names.
  return pd.DataFrame([rates, rates / rates.loc[reference]],
                      index= ['FPR', 'FPR Parity'])

In [19]:
# False-positive rate by race for the test-set predictions, with parity against the White group.
race_fp(y_test, test_predict)

Race,Asian,Black,Hispanic,White
FPR,0.057971,0.135135,0.176991,0.04
FPR Parity,1.449275,3.378378,4.424779,1.0


In [20]:
def raceNsex_fp(truelabels, predictions, reference = ('White', 'Male')):
  """False-positive rate per (race, sex) subgroup, with each rate's ratio to a reference subgroup.

  Group membership is read from the notebook-level `X_test_save.Race` and
  `X_test_save.Sex`, so `truelabels` and `predictions` are expected to
  describe the rows of `X_test_save`.

  Parameters
  ----------
  truelabels : array-like
      Ground-truth labels.
  predictions : array-like
      Predicted labels.
  reference : tuple of (str, str), default ('White', 'Male')
      (race, sex) subgroup whose rate is the denominator of the parity row.

  Returns
  -------
  pandas.DataFrame
      Two rows, 'FPR' and 'FPR Parity', with one column per (race, sex)
      subgroup. Parity is not finite when the reference rate is zero.
  """
  # Stack the two raw columns side by side into a two-column frame of
  # sensitive features.
  sensitive = pd.DataFrame(np.stack([X_test_save.Race, X_test_save.Sex], axis = 1),
                           columns = ['Race','Sex'])
  fmetrics = MetricFrame(metrics= fairlearn.metrics.false_positive_rate,
                         y_true=truelabels,
                         y_pred=predictions,
                         sensitive_features=sensitive)
  rates = fmetrics.by_group
  # tuple(...) makes .loc select a single (race, sex) entry even if a list is passed.
  return pd.DataFrame([rates, rates / rates.loc[tuple(reference)]],
                      index= ['FPR', 'FPR Parity'])

In [21]:
# False-positive rate by race and sex for the test-set predictions, with parity against White males.
raceNsex_fp(y_test, test_predict)

Race,Asian,Asian,Black,Black,Hispanic,Hispanic,White,White
Sex,Female,Male,Female,Male,Female,Male,Female,Male
FPR,0.059524,0.055556,0.145695,0.088235,0.149254,0.217391,0.026596,0.0625
FPR Parity,0.952381,0.888889,2.331126,1.411765,2.38806,3.478261,0.425532,1.0


In [22]:
def race_fn(truelabels, predictions, reference = 'White'):
  """False-negative rate per race group, with each rate's ratio to a reference group.

  Group membership is read from the notebook-level `X_test_save.Race`, so
  `truelabels` and `predictions` are expected to describe the rows of
  `X_test_save`.

  Parameters
  ----------
  truelabels : array-like
      Ground-truth labels.
  predictions : array-like
      Predicted labels.
  reference : str, default 'White'
      Race group whose rate is the denominator of the parity row.

  Returns
  -------
  pandas.DataFrame
      Two rows, 'FNR' and 'FNR Parity', with one column per race group. Parity
      is not finite when the reference group's rate is zero.
  """
  sensitive = X_test_save.Race
  fmetrics = MetricFrame(metrics= fairlearn.metrics.false_negative_rate,
                         y_true=truelabels,
                         y_pred=predictions,
                         sensitive_features=sensitive)
  rates = fmetrics.by_group
  # .loc looks the group up by label, which also works for names that are not
  # valid Python attribute names.
  return pd.DataFrame([rates, rates / rates.loc[reference]],
                      index= ['FNR', 'FNR Parity'])

In [23]:
# False-negative rate by race for the test-set predictions, with parity against the White group.
race_fn(y_test, test_predict)

Race,Asian,Black,Hispanic,White
FNR,0.123457,0.092063,0.153846,0.116732
FNR Parity,1.057613,0.788677,1.317949,1.0


In [24]:
def raceNsex_fn(truelabels, predictions, reference = ('White', 'Male')):
  """False-negative rate per (race, sex) subgroup, with each rate's ratio to a reference subgroup.

  Group membership is read from the notebook-level `X_test_save.Race` and
  `X_test_save.Sex`, so `truelabels` and `predictions` are expected to
  describe the rows of `X_test_save`.

  Parameters
  ----------
  truelabels : array-like
      Ground-truth labels.
  predictions : array-like
      Predicted labels.
  reference : tuple of (str, str), default ('White', 'Male')
      (race, sex) subgroup whose rate is the denominator of the parity row.

  Returns
  -------
  pandas.DataFrame
      Two rows, 'FNR' and 'FNR Parity', with one column per (race, sex)
      subgroup. Parity is not finite when the reference rate is zero.
  """
  # Stack the two raw columns side by side into a two-column frame of
  # sensitive features.
  sensitive = pd.DataFrame(np.stack([X_test_save.Race, X_test_save.Sex], axis = 1),
                           columns = ['Race','Sex'])
  fmetrics = MetricFrame(metrics= fairlearn.metrics.false_negative_rate,
                         y_true=truelabels,
                         y_pred=predictions,
                         sensitive_features=sensitive)
  rates = fmetrics.by_group
  # tuple(...) makes .loc select a single (race, sex) entry even if a list is passed.
  return pd.DataFrame([rates, rates / rates.loc[tuple(reference)]],
                      index= ['FNR', 'FNR Parity'])

In [25]:
# False-negative rate by race and sex for the test-set predictions, with parity against White males.
raceNsex_fn(y_test, test_predict)

Race,Asian,Asian,Black,Black,Hispanic,Hispanic,White,White
Sex,Female,Male,Female,Male,Female,Male,Female,Male
FNR,0.1875,0.030303,0.127451,0.075117,0.204082,0.095238,0.176923,0.055118
FNR Parity,3.401786,0.549784,2.312325,1.362844,3.702624,1.727891,3.20989,1.0
