<a href="https://colab.research.google.com/github/CateMerfeld/class_work/blob/main/dask_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install/upgrade Dask and its distributed scheduler (shell command run from the notebook).
!pip install dask distributed --upgrade
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

from dask.distributed import Client, progress

# Start a local cluster with 4 workers, 2 threads each, and a 2GB memory limit per worker.
client = Client(n_workers=4, threads_per_worker=2, memory_limit='2GB')
# Last expression of the cell: renders the client/cluster summary.
client

Requirement already up-to-date: dask in /usr/local/lib/python3.6/dist-packages (2021.1.0)
Requirement already up-to-date: distributed in /usr/local/lib/python3.6/dist-packages (2021.1.0)


0,1
Client  Scheduler: tcp://127.0.0.1:41249  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 8.00 GB


In [2]:
# Extra HTTP packages — presumably required so Dask can read the CSV from a URL; TODO confirm.
!pip install aiohttp --quiet
!pip install requests
# Dask together with its DataFrame extras.
!pip install "dask[dataframe]" --upgrade
import dask.dataframe as dd

Requirement already up-to-date: dask[dataframe] in /usr/local/lib/python3.6/dist-packages (2021.1.0)


In [3]:
# dask-ml supplies the train_test_split used for the Dask DataFrame later in the notebook.
!pip install dask-ml
import dask_ml
from dask_ml.model_selection import train_test_split



In [4]:
# Read the semicolon-delimited absenteeism CSV into a Dask DataFrame.
DATA_URL = 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Absenteeism_at_work.csv'
df = dd.read_csv(DATA_URL, sep=';')

In [5]:
# Materialize the full frame and report its (rows, columns) shape.
full_frame = df.compute()
full_frame.shape

(740, 21)

In [6]:
# Preview the first five rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Hit target,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,11,26,7,3,1,289,36,13,33,239.554,97,0,1,2,1,0,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239.554,97,1,1,1,1,0,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239.554,97,0,1,0,1,0,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239.554,97,0,1,2,1,1,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239.554,97,0,1,2,1,0,1,90,172,30,2


In [7]:
# Build the summary-statistics graph, then materialize it for display.
summary_stats = df.describe()
summary_stats.compute()

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,Hit target,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
count,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0
mean,18.017568,19.216216,6.324324,3.914865,2.544595,221.32973,29.631081,12.554054,36.45,271.490235,94.587838,0.054054,1.291892,1.018919,0.567568,0.072973,0.745946,79.035135,172.114865,26.677027,6.924324
std,11.021247,8.433406,3.436287,1.421675,1.111831,66.952223,14.836788,4.384873,6.478772,39.058116,3.779313,0.226277,0.673238,1.098489,0.495749,0.260268,1.318258,12.883211,6.034995,4.285452,13.330998
min,1.0,0.0,0.0,2.0,1.0,118.0,5.0,1.0,27.0,205.917,81.0,0.0,1.0,0.0,0.0,0.0,0.0,56.0,163.0,19.0,0.0
25%,9.0,13.0,3.0,3.0,2.0,179.0,16.0,9.0,31.0,244.387,93.0,0.0,1.0,0.0,0.0,0.0,0.0,69.0,169.0,24.0,2.0
50%,18.0,23.0,6.0,4.0,3.0,225.0,26.0,13.0,37.0,264.249,95.0,0.0,1.0,1.0,1.0,0.0,0.0,83.0,170.0,25.0,3.0
75%,28.0,26.0,9.0,5.0,4.0,260.0,50.0,16.0,40.0,294.217,97.0,0.0,1.0,2.0,1.0,0.0,1.0,89.0,172.0,31.0,8.0
max,36.0,28.0,12.0,6.0,4.0,388.0,52.0,29.0,58.0,378.884,100.0,1.0,4.0,4.0,1.0,1.0,8.0,108.0,196.0,38.0,120.0


In [8]:
# Column dtypes as inferred by read_csv (no explicit dtype was passed when loading).
df.dtypes

ID                                   int64
Reason for absence                   int64
Month of absence                     int64
Day of the week                      int64
Seasons                              int64
Transportation expense               int64
Distance from Residence to Work      int64
Service time                         int64
Age                                  int64
Work load Average/day              float64
Hit target                           int64
Disciplinary failure                 int64
Education                            int64
Son                                  int64
Social drinker                       int64
Social smoker                        int64
Pet                                  int64
Weight                               int64
Height                               int64
Body mass index                      int64
Absenteeism time in hours            int64
dtype: object

In [9]:
# The dataset is small (740 rows per the shape check), so keep the computed partitions
# in cluster memory; later steps can then reuse them instead of re-reading the CSV.
df = df.persist()

In [10]:
# function to check percent null values in each column.
def checking_missing_pct(df, num_rows, threshold=.2):
  """Print, for every column, whether its share of missing values exceeds a limit.

  Args:
    df: Dask DataFrame (any object exposing ``columns`` and a lazy
      ``isnull().sum()`` whose ``compute()`` yields per-column null counts).
    num_rows: total number of rows in ``df``; used as the denominator of the
      missing fraction, so it must match the frame's real row count.
    threshold: largest tolerated fraction of missing values. Defaults to 0.2
      (20%), the limit that was previously hard-coded.

  Raises:
    ValueError: if ``num_rows`` is not positive.

  Prints ``<column> <fraction>`` for columns above the threshold and
  ``<column> within limit`` for all others. Returns nothing.
  """
  if num_rows <= 0:
    raise ValueError('num_rows must be a positive row count')
  # count nulls for all columns in one compute instead of one compute per column
  missing_counts = df.isnull().sum().compute()
  for col in df.columns:
    pct_missing = missing_counts[col] / num_rows
    # if column has more than the allowed share of missing values, print column name and pct missing
    if pct_missing > threshold:
      print(col, pct_missing)
    else:
      print(col, 'within limit')

# use the frame's real row count as the denominator; the dataset has 740 rows, so a
# hard-coded 20052 would understate every column's missing fraction
checking_missing_pct(df, len(df))

ID within limit
Reason for absence within limit
Month of absence within limit
Day of the week within limit
Seasons within limit
Transportation expense within limit
Distance from Residence to Work within limit
Service time within limit
Age within limit
Work load Average/day  within limit
Hit target within limit
Disciplinary failure within limit
Education within limit
Son within limit
Social drinker within limit
Social smoker within limit
Pet within limit
Weight within limit
Height within limit
Body mass index within limit
Absenteeism time in hours within limit


In [11]:
# No column exceeded the 20% missing-value threshold in the check above, so no column is
# removed; instead drop any rows that still contain a null.
df = df.dropna()

In [12]:
# drop the 'ID' column before modeling
# NOTE(review): the describe() output shows ID ranging from 1 to 36 across 740 rows, so
# values repeat and ID is not unique per observation (presumably it identifies the
# employee) — confirm the intended rationale for dropping it.
df = df.drop(['ID'], axis=1)

In [24]:
# make the outcome variable binary: 1 when absenteeism exceeds 7 hours (just above the
# mean of ~6.92 hours reported by describe()), otherwise 0.
# Vectorized comparison instead of a per-element apply: same values, no meta needed.
df['Absenteeism'] = (df['Absenteeism time in hours'] > 7).astype('int64')

In [25]:
# train test split using Dask-ml's function
# features: everything except the raw hours column and the derived binary target
X = df.drop(['Absenteeism time in hours', 'Absenteeism'], axis=1)
y = df['Absenteeism']

# fixed seed so the 80/20 split (and every score derived from it) is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [26]:
# min-max scale the features: fit the scaler on the training split only, then apply the
# same fitted scaling to the test split
# NOTE(review): train_sc / test_sc are not referenced by any later cell visible in this
# notebook — the models are fit on X_train directly; confirm whether this step is still needed.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_sc = scaler.fit_transform(X_train)
test_sc = scaler.transform(X_test)

In [29]:
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.model_selection import cross_validate, GridSearchCV

# seed the forest so the cross-validation scores are repeatable between runs
rf = RandomForestClassifier(random_state=42)

# distribute training with Joblib: the Dask backend sends the CV fits to the cluster;
# the training split is materialized first because cross_validate expects in-memory data
with joblib.parallel_backend('dask'):
  scores = cross_validate(rf, X_train.compute(), y_train.compute(), cv=4)

# display fit/score times and the per-fold test scores
scores

{'fit_time': array([1.91589475, 1.91593027, 1.55776024, 1.90743494]),
 'score_time': array([0.12238526, 0.11858606, 0.22496557, 0.13659072]),
 'test_score': array([0.82191781, 0.79452055, 0.75172414, 0.72413793])}

In [38]:
from sklearn.metrics import roc_auc_score
# candidate tree depths to search over
rf_params = {'max_depth':[2,4,10,15]}

# seed the forest so the search picks the same best parameters on every run
rf = RandomForestClassifier(random_state=42)

# 4-fold grid search over max_depth, ranked by ROC AUC, using all available workers
grid_search_rf = GridSearchCV(rf,
                              param_grid = rf_params,
                              return_train_score=True,
                              cv=4,
                              n_jobs=-1,
                              scoring='roc_auc')

In [39]:
# Run the grid search on the materialized training split, with the fits dispatched through Dask.
with joblib.parallel_backend('dask'):
  grid_search_rf.fit(X_train.compute(), y_train.compute())

# Score the fitted search object on the held-out test split, then report it with the winning parameters.
test_score = grid_search_rf.score(X_test.compute(), y_test.compute())
print('The best score is: {} at {}'.format(test_score, grid_search_rf.best_params_))

The best score is: 0.9080615176609769 at {'max_depth': 10}
