In [19]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import geopandas as gpd

In [20]:
#importing project specific functions to match changes in EDA notebook
from model_prep import get_feature_name, model_transformer_train, model_transformer_test

In [21]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import FunctionTransformer
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.utils.class_weight import compute_sample_weight

In [22]:
from dask import dataframe as dd
import joblib
from dask.distributed import Client, LocalCluster
# Start a local Dask cluster and connect a client to it.
# NOTE(review): the joblib 'dask' backend used in the model cells presumably
# dispatches work through this client - confirm. Re-running this cell starts
# another cluster (hence the "already have a cluster running?" warning).
cluster = LocalCluster()
client = Client(cluster)
# Show the scheduler and worker handles as the cell output.
cluster.scheduler, cluster.workers

Perhaps you already have a cluster running?
Hosting the HTTP server on port 64584 instead


(<Scheduler: "tcp://127.0.0.1:64587" processes: 3 cores: 6>,
 {0: <Nanny: tcp://127.0.0.1:64605, threads: 2>,
  1: <Nanny: tcp://127.0.0.1:64604, threads: 2>,
  2: <Nanny: tcp://127.0.0.1:64610, threads: 2>})

In [23]:
# Load the modelling dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load a file this
# project produced itself (presumably written by the EDA notebook; confirm).
model_data = pd.read_pickle('Data/model_data.pkl')

In [24]:
# Test-set feature values for the submission; the same file is read into
# submit_X again in the Submission section further down.
submit_X = pd.read_csv('Source_data/testset_values.csv')

In [25]:
# List the available columns before choosing the model features.
model_data.columns

Index(['id', 'class', 'amount_tsh', 'funder', 'gps_height', 'installer',
       'longitude', 'latitude', 'basin', 'subvillage', 'region_code',
       'district_code', 'lga', 'ward', 'population', 'public_meeting',
       'scheme_management', 'scheme_name', 'permit', 'extraction_type',
       'management', 'management_group', 'payment_type', 'water_quality',
       'quantity', 'source', 'waterpoint_type', 'urban_rural', 'year', 'month',
       'years_old', 'popbins'],
      dtype='object')

In [26]:
# Drop 'id' so the ColumnTransformer's remainder='passthrough' does not hand it
# to the model as a feature.
# Reassigning (instead of inplace=True) with errors='ignore' keeps the cell
# idempotent: running it a second time no longer raises KeyError.
model_data = model_data.drop(columns=['id'], errors='ignore')

In [27]:
# Rows whose 'years_old' value is below 5.
# NOTE(review): not referenced by any later cell visible in this notebook.
model_data_new = model_data[model_data['years_old'].lt(5)]

In [28]:
# Numeric features: routed to the StandardScaler step of the ColumnTransformer.
num_cols = ['gps_height', 'population']

# Categorical features: routed to the OneHotEncoder step.
# Every name must match a column of model_data exactly. The frame's column is
# 'urban_rural' (underscore); the hyphenated spelling made the transformer fail
# on a missing column.
cat_cols = ['basin', 'region_code', 'district_code', 'extraction_type', 'payment_type', 'water_quality', 'quantity',
            'source', 'management', 'management_group', 'waterpoint_type', 'funder', 'installer', 'subvillage',
            'ward', 'scheme_management', 'scheme_name', 'popbins', 'lga', 'urban_rural']

In [29]:
# Preprocessing shared by the training and submission paths:
#   * num_cols -> standard scaling
#   * cat_cols -> one-hot encoding (categories unseen at fit time are ignored)
#   * every other column is passed through untouched
# sparse_threshold=0 forces a dense result so it can be wrapped in a DataFrame.
# NOTE(review): dense one-hot output of the high-cardinality columns may be
# memory hungry - confirm it fits.
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore')
CT = ColumnTransformer(
    transformers=[
        ('scaler', scaler, num_cols),
        ('ohe', ohe, cat_cols),
    ],
    remainder='passthrough',
    sparse_threshold=0,
    verbose=True,
)

In [30]:
def prep_train(unprepped_values, class_col, test_size=0.25, random_state=42):
    """Split labelled data into train/test sets and encode both with ``CT``.

    The module-level ColumnTransformer ``CT`` is fitted on the training split
    only and then applied to the held-out split, so the held-out rows never
    influence the scaling statistics or the one-hot categories.

    Side effect: ``CT`` is (re)fitted in place; ``prep_submit`` relies on that
    fitted state.

    Parameters
    ----------
    unprepped_values : pandas.DataFrame
        Feature columns plus the target column.
    class_col : str
        Name of the target column in ``unprepped_values``.
    test_size : float, default 0.25
        Fraction of rows held out for testing.
    random_state : int, default 42
        Seed for the split, so repeated runs give the same partition.

    Returns
    -------
    tuple
        ``(train_Xf, train_y, test_Xf, test_y)`` - the encoded feature frames
        and their matching target series.
    """
    features = unprepped_values.drop(columns=[class_col])
    target = unprepped_values[class_col]
    train_X, test_X, train_y, test_y = train_test_split(
        features, target, test_size=test_size, random_state=random_state)

    train_Xct = CT.fit_transform(train_X)
    test_Xct = CT.transform(test_X)

    # The feature names depend only on the fitted transformer, so look them up once.
    cols = get_feature_name(CT)
    # Keep each split's original row index so the feature frames stay aligned
    # with train_y / test_y in any index-based pandas operation.
    train_Xf = pd.DataFrame(train_Xct, columns=cols, index=train_X.index)
    test_Xf = pd.DataFrame(test_Xct, columns=cols, index=test_X.index)

    return train_Xf, train_y, test_Xf, test_y

In [31]:
def prep_submit(train_data, test_data):
    """Encode the submission set the same way as the learning set.

    ``model_transformer_test`` prepares ``test_data`` (given ``train_data``) and
    returns the prepared frame together with the id column. The module-level
    ``CT`` is then applied with ``transform`` only - it is never refitted on the
    unseen rows, so it must already have been fitted (e.g. by ``prep_train``).

    Returns
    -------
    tuple
        ``(submit_Xf, cols, id_col)`` - the encoded feature frame, its column
        names, and the id column returned by ``model_transformer_test``.
    """
    prepared, id_col = model_transformer_test(train_data, test_data)
    encoded = CT.transform(prepared)

    cols = get_feature_name(CT)
    return pd.DataFrame(encoded, columns=cols), cols, id_col

In [None]:
# Split model_data on the 'class' target, fit the transformer on the training
# part, and get back the encoded feature frames with their targets.
train_X, train_y, test_X, test_y = prep_train(model_data, 'class')

## Random Forest

In [None]:
# 250-tree random forest with entropy splits; 'balanced_subsample' reweights
# the classes within each tree's bootstrap sample.
# random_state is pinned so repeated runs build the same forest and report the
# same score, matching the fixed seed already used for the train/test split.
# max_samples and max_depth stay at their defaults (values previously left
# commented out in the call: .75 and 16).
rf = RandomForestClassifier(n_estimators=250, criterion='entropy',
                            class_weight='balanced_subsample', random_state=42,
                            verbose=True)

# Fit the trees through joblib's Dask backend.
with joblib.parallel_backend('dask'):
    rf.fit(train_X, train_y)
# Mean accuracy on the held-out split.
print(rf.score(test_X, test_y))

In [None]:
# 5-fold cross-validation of the forest on the training split.
# train_X is the encoded frame returned by prep_train; the name train_Xf only
# exists inside that function, so using it here raised NameError.
with joblib.parallel_backend('dask'):
    print(cross_val_score(rf, train_X, train_y, cv=5))

## Submission

In [None]:
# Training-set feature values; passed to prep_submit together with the test set.
values = pd.read_csv('Source_data/trainset_values.csv')
# NOTE: rebinds submit_X, which was already loaded from this same file near the
# top of the notebook.
submit_X = pd.read_csv('Source_data/testset_values.csv')

In [None]:
# Prepare and encode the submission set with the already-fitted transformer.
# cols holds the encoded feature names; id_col becomes the 'id' column of the
# submission frame built below.
submit_Xf, cols, id_col = prep_submit(values, submit_X)

In [None]:
# Display the encoded submission features as a sanity check before predicting.
submit_Xf

In [None]:
# Predict a status for every submission row, then put the ids in front to get
# the two-column ('id', 'status_group') submission frame.
submit_pred_rf = pd.DataFrame({'status_group': rf.predict(submit_Xf)})
submit_pred_rf.insert(0, 'id', id_col)
submit_pred_rf

In [None]:
#submit_pred_rf.to_csv(r'submissions/D/submission_E_rf', index=False)