## 1. Loading required libraries

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Lift pandas' column display limit so wide frames are shown with all columns.
pd.set_option("display.max_columns", None)

from src.utils import MappingData
from sklearn.impute import SimpleImputer

## 2. Reading preprocessed data 

In [13]:
# Location of the preprocessed dataset, without the file extension.
path = os.path.join("../data", "treatments_2017-2020_preprocessed")

# Load the parquet file and report (rows, columns) as a quick sanity check.
parquet_file = "{}{}".format(path, ".parquet")
df = pd.read_parquet(parquet_file)
print(df.shape)

(3638892, 83)


## 3. Building features and encoding categorical data 

Encode the ordinal variables as integers using ordered pandas categoricals, so that the order of each feature's categories is preserved in the encoded values.

In [14]:
# Ordered levels of every ordinal feature, lowest level first.
# The position of a label inside its list is the integer code stored in df.
ORDINAL_LEVELS = {
    # AGE : The age of the client at Admission
    "AGE": [
        "12-17 years old",
        "18-29 years old",
        "30-39 years old",
        "40-49 years old",
        "50-64 years old",
        "65+ years old",
    ],
    # EDUC : The client’s level of education
    "EDUC": [
        "No schooling",
        "Grades 9-11",
        "Grade 12",
        "1-3 years of college",
        "4 years of college, BA/BS, or more",
    ],
    # ARRESTS : Arrests of client made 30 days prior to admission.
    "ARRESTS": ["None", "Once", "Two or more times"],
    # DAYWAIT : Indicates the number of days from the first contact or request for a substance use
    #  treatment service until the client was admitted and the first clinical substance use treatment service was provided.
    "DAYWAIT": ["0-14 days", "15-30 days", "31+ days"],
    # FRSTUSE1 : this is the age of first intoxication
    "FRSTUSE1": [
        "11 years and under",
        "12-17 years",
        "18-24 years",
        "25-29 years",
        "30 years and older",
    ],
    # NUM_SUBS : no of substances taken by a client
    "NUM_SUBS": ["Zero sub", "One sub", "Two sub", "Three sub"],
}


def encode_ordinal(values, levels):
    """Map the labels in ``values`` to their position in ``levels``.

    Parameters
    ----------
    values : pandas.Series
        Column holding the raw category labels.
    levels : list of str
        All valid labels, ordered from lowest to highest.

    Returns
    -------
    numpy.ndarray
        int64 codes, one per element of ``values``. A label's code is its
        index in ``levels``; entries that are missing or not listed in
        ``levels`` receive -1.
    """
    ordered = pd.Categorical(values, categories=levels, ordered=True)
    # Categorical.codes indexes into the *declared* levels, so the code of a
    # label never depends on which other labels happen to occur in the data
    # (pd.factorize would renumber using only the observed labels).
    return ordered.codes.astype("int64")


for column, levels in ORDINAL_LEVELS.items():
    codes = encode_ordinal(df[column], levels)

    # -1 is not NaN, so these rows will not be counted by a later isnull()
    # check; surface them here instead of hiding them.
    n_unmapped = int((codes == -1).sum())
    if n_unmapped:
        print(
            "{}: {} value(s) missing or outside the declared levels, encoded as -1".format(
                column, n_unmapped
            )
        )

    df[column] = codes

## 4. Selecting only those columns which will be used further

In [15]:
# Features kept for the rest of the pipeline; every other column is discarded.
# The order of this list is the column order of the resulting frame.
columns_of_imp = [
    "AGE", "GENDER", "RACE", "ETHNIC",
    "MARSTAT", "EDUC", "EMPLOY", "PREG",
    "VET", "LIVARAG", "PRIMINC", "ARRESTS",
    "DIVISION", "SERVICES", "DAYWAIT", "PSOURCE",
    "SUB1", "FREQ1", "FRSTUSE1", "PSYPROB",
    "HLTHINS", "FREQ_ATND_SELF_HELP", "NOPRIOR", "NUM_SUBS",
]
df = df.loc[:, columns_of_imp]

## 5. Dropping columns with > 50% missing values

In [16]:
# Percentage of missing entries in each column, relative to the number of rows.
missing_pct = df.isnull().sum() / df.shape[0] * 100
df_missing_perc = missing_pct.rename_axis("Column").reset_index(name="% missing")
print(df_missing_perc)

# Columns in which more than 50% of the values are missing are dropped.
too_sparse = df_missing_perc["% missing"].gt(50.0)
columns_50perc_missing = df_missing_perc.loc[too_sparse, "Column"].tolist()
print(
    "Columns dropped due to too many missing values : {}".format(columns_50perc_missing)
)
df = df.drop(columns=columns_50perc_missing)

                 Column  % missing
0                   AGE   0.000000
1                GENDER   0.038693
2                  RACE   1.326805
3                ETHNIC   1.601367
4               MARSTAT  15.207569
5                  EDUC   0.000000
6                EMPLOY   1.554402
7                  PREG  66.711680
8                   VET   5.518878
9               LIVARAG   2.470615
10              PRIMINC  37.659898
11              ARRESTS   0.000000
12             DIVISION   0.000000
13             SERVICES   0.000000
14              DAYWAIT   0.000000
15              PSOURCE   1.588066
16                 SUB1   0.656271
17                FREQ1   6.244483
18             FRSTUSE1   0.000000
19              PSYPROB   9.670801
20              HLTHINS  57.529435
21  FREQ_ATND_SELF_HELP  14.572375
22              NOPRIOR   0.000000
23             NUM_SUBS   0.000000
Columns dropped due to too many missing values : ['PREG', 'HLTHINS']


## 6. Imputing missing values with SimpleImputer, using the mode (most frequent value) as the fill value

In [17]:
# Replace missing entries with the most frequent value of their column.
# The imputer is configured to treat entries equal to None as missing.
imputer = SimpleImputer(missing_values=None, strategy="most_frequent")
imputed_values = imputer.fit_transform(df)

# Rebuild a DataFrame from the imputer output, restoring the original column
# labels and casting every column back to the dtype it had before imputation.
original_dtypes = df.dtypes.to_dict()
df_transformed = pd.DataFrame(imputed_values, columns=df.columns)
df_transformed = df_transformed.astype(original_dtypes)

## 7. Saving the data which will be used for training

In [18]:
# Write the imputed frame next to the input data, tagged with a "_train" suffix.
train_file_name = "".join(["treatments_2017-2020", "_train", ".parquet"])
path_to_train = os.path.join("../data", train_file_name)
df_transformed.to_parquet(path_to_train)