# Mental Illness Disparities in Veterans

## Importing required packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from xgboost import XGBRegressor

## Getting the dataset

In [2]:
# Read the source CSV into a DataFrame and preview its first rows.
csv_path = "data/proj70/comma-separated-values-file-1.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,index,Vulnerable_population,Section,Long_title,Short_title,Group,Group1,Subgroup,Mental Illness,Value
0,0,Serious Mental Illness,A. Socio-demographic Characteristics,A1. Distribution of SMI among FY13 Veteran VHA...,SMI (Overall),Overall,,Overall,Mood.Anxiety,1149541.0
1,1,Serious Mental Illness,A. Socio-demographic Characteristics,A2. Distribution of SMI among FY13 Veteran VHA...,SMI by Sex,Sex,,F,Mood.Anxiety,125993.0
2,2,Serious Mental Illness,A. Socio-demographic Characteristics,A2. Distribution of SMI among FY13 Veteran VHA...,SMI by Sex,Sex,,M,Mood.Anxiety,1023546.0
3,3,Serious Mental Illness,A. Socio-demographic Characteristics,A3. Distribution of SMI among FY13 Veteran VHA...,SMI by Age,Age,,18-44,Mood.Anxiety,266131.0
4,4,Serious Mental Illness,A. Socio-demographic Characteristics,A3. Distribution of SMI among FY13 Veteran VHA...,SMI by Age,Age,,45-64,Mood.Anxiety,529733.0


## Cleaning dataset

In [3]:
# Count missing values per column to decide what needs cleaning.
missing_per_column = df.isna().sum()
missing_per_column

index                      0
Vulnerable_population      0
Section                    0
Long_title                 0
Short_title                0
Group                      0
Group1                   228
Subgroup                   0
Mental Illness             0
Value                     61
dtype: int64

In [4]:
# Fill missing "Value" entries with the column mean (NaNs are skipped when
# the mean is computed).
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df["Value"] selection: that form is chained
# assignment, which pandas 2.x warns about and which, under Copy-on-Write,
# modifies a temporary object and leaves `df` unchanged.
#
# NOTE(review): "Value" is used as the regression target further down, so the
# imputed rows all carry the same synthetic target. Consider dropping those
# rows instead (df.dropna(subset=["Value"])) — confirm with the analysis owner.
df["Value"] = df["Value"].fillna(df["Value"].mean())

# Re-check the per-column missing counts after imputation.
df.isna().sum()

index                      0
Vulnerable_population      0
Section                    0
Long_title                 0
Short_title                0
Group                      0
Group1                   228
Subgroup                   0
Mental Illness             0
Value                      0
dtype: int64

## Pre-Processing data

In [5]:
# Show column dtypes and non-null counts before any encoding is applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1656 entries, 0 to 1655
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  1656 non-null   int64  
 1   Vulnerable_population  1656 non-null   object 
 2   Section                1656 non-null   object 
 3   Long_title             1656 non-null   object 
 4   Short_title            1656 non-null   object 
 5   Group                  1656 non-null   object 
 6   Group1                 1428 non-null   object 
 7   Subgroup               1656 non-null   object 
 8   Mental Illness         1656 non-null   object 
 9   Value                  1656 non-null   float64
dtypes: float64(1), int64(1), object(8)
memory usage: 129.5+ KB


In [16]:
# Label-encode every object-dtype column in place, printing the learned
# classes for each so the integer codes can be read back by eye.
#
# The fitted encoders are kept in `encoders` (column name -> LabelEncoder) so
# that codes can later be mapped back with
# encoders[col].inverse_transform(...); previously each encoder was discarded
# at the end of its loop iteration.
#
# NOTE(review): LabelEncoder assigns codes by sorted class order, so the
# resulting integers carry no meaningful ordering. Also, "Group1" contains
# missing values at this point; they appear to receive their own code —
# confirm this is intended rather than filling them explicitly first.
object_columns = [
    column for column in df.columns if pd.api.types.is_object_dtype(df[column])
]

encoders = {}
for column in object_columns:
    le = LabelEncoder()
    print("-"*20)
    print(column)
    df[column] = le.fit_transform(df[column])
    print(le.classes_)
    print("-"*20)
    encoders[column] = le

--------------------
Vulnerable_population
['Serious Mental Illness']
--------------------
--------------------
Section
['A. Socio-demographic Characteristics' 'B. VHA Outpatient Utilization'
 'C. Diagnoses']
--------------------
--------------------
Long_title
['A1. Distribution of SMI among FY13 Veteran VHA Patients'
 'A2. Distribution of SMI among FY13 Veteran VHA Patients by Sex, FY13'
 'A3. Distribution of SMI among FY13 Veteran VHA Patients by Age, FY13'
 'A4. Distribution of SMI among FY13 Veteran VHA Patients by Race/Ethnicity, FY13'
 'A5. Distribution of SMI among FY13 Veteran VHA Patients by Rural/Urban Status, FY13'
 'A6. Distribution of SMI among FY13 Veteran VHA Patients by Service-Connected Status, FY13'
 'B1. Distribution of SMI among FY13 Veteran VHA Patients by VHA Outpatient Visits, FY13'
 'B2. Distribution of SMI among FY13 Veteran VHA Patients by Primary Care Visits, FY13'
 'B3. Distribution of SMI among FY13 Veteran VHA Patients by Mental Health/Substance Use Disor

In [17]:
# Preview the first rows after label encoding.
df.head()

Unnamed: 0,index,Vulnerable_population,Section,Long_title,Short_title,Group,Group1,Subgroup,Mental Illness,Value
0,0,0,0,0,0,5,23,174,0,1149541.0
1,1,0,0,1,11,8,23,102,0,125993.0
2,2,0,0,1,11,8,23,146,0,1023546.0
3,3,0,0,2,1,0,23,3,0,266131.0
4,4,0,0,2,1,0,23,6,0,529733.0


In [18]:
# Re-inspect dtypes and non-null counts now that the object columns have been label-encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1656 entries, 0 to 1655
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  1656 non-null   int64  
 1   Vulnerable_population  1656 non-null   int64  
 2   Section                1656 non-null   int64  
 3   Long_title             1656 non-null   int64  
 4   Short_title            1656 non-null   int64  
 5   Group                  1656 non-null   int64  
 6   Group1                 1656 non-null   int64  
 7   Subgroup               1656 non-null   int64  
 8   Mental Illness         1656 non-null   int64  
 9   Value                  1656 non-null   float64
dtypes: float64(1), int64(9)
memory usage: 129.5 KB


In [19]:
# Remove the "index" column before modelling.
# The result is reassigned rather than dropped in place, and
# errors="ignore" makes the cell safe to re-run: the original in-place drop
# raised KeyError on a second execution because the column was already gone.
df = df.drop(columns="index", errors="ignore")

In [20]:
# Separate the features (every column except "Value") from the target.
X = df.drop(columns="Value")
y = df["Value"]

# Hold out 20% of the rows for evaluation.
# random_state is fixed so the split, and therefore every score reported
# below, is reproducible across kernel restarts; without it each run draws a
# different partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Training the model

In [21]:
# Fit a random-forest regressor with default hyperparameters and report its
# score on the held-out set (for scikit-learn regressors, score() is R^2).
# random_state is fixed because the forest's bootstrap sampling and feature
# selection are random; unseeded, the score changes on every run.
rreg = RandomForestRegressor(random_state=42)
rreg.fit(X_train, y_train)
rreg.score(X_test, y_test)

0.6038819364385503

In [22]:
# Fit an XGBoost regressor with default hyperparameters on the raw NumPy
# arrays, then report its score on the held-out set.
xreg = XGBRegressor()
xreg.fit(X_train.to_numpy(), y_train.to_numpy())
xreg.score(X_test.to_numpy(), y_test.to_numpy())

0.580323411967071