In [1]:
import warnings

import dask
import dask.dataframe as dd
import joblib
import pandas as pd
import statsmodels.api as sm
from dask.distributed import Client, progress
from dask_ml.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from xgboost import XGBClassifier
# Ignore every warning category raised in this process.
# NOTE(review): a blanket "ignore" also hides deprecation and solver warnings
# emitted by the libraries imported above — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Open creditcard.csv as a dask DataFrame, forcing the 'Time' column to float64.
csv_path = 'D:/DSF Files/creditcard.csv'
df = dd.read_csv(csv_path, dtype={'Time': 'float64'})

In [3]:
# The 'Class' column is the target; every other column is a feature.
y = df['Class']
# Pass the axis by keyword: a bare positional `1` is easy to misread, and
# pandas 2.x made every DataFrame.drop argument except `labels` keyword-only.
X = df.drop(['Class'], axis=1)
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=357)

In [4]:
# Start a dask.distributed client with 4 workers, 2 threads each, and a 4GB
# memory limit per worker; the trailing expression renders the client summary.
cluster_options = dict(n_workers=4, threads_per_worker=2, memory_limit='4GB')
client = Client(**cluster_options)
client

0,1
Client  Scheduler: tcp://127.0.0.1:61985  Dashboard: http://127.0.0.1:61984/status,Cluster  Workers: 4  Cores: 8  Memory: 16.00 GB


In [5]:
# 4-fold cross-validation of ordinary least squares on the training split.
# Materialise features and target with ONE dask.compute call so the shared
# upstream graph (CSV read + split) is executed once rather than once per
# separate .compute() call.
X_train_pd, y_train_pd = dask.compute(X_train, y_train)
lr = LinearRegression()
with joblib.parallel_backend('dask'):
    # No `scoring` is given, so test_score is the estimator's default score
    # (R^2 for scikit-learn regressors).
    scores = cross_validate(lr, X_train_pd, y_train_pd, cv=4)
scores

{'fit_time': array([0.10674572, 0.16156816, 0.14261746, 0.19348311]),
 'score_time': array([0.00398898, 0.00598407, 0.00598454, 0.00598359]),
 'test_score': array([0.37545771, 0.47791122, 0.60539818, 0.4306252 ])}

In [6]:
# 4-fold cross-validation of Lasso (default hyper-parameters) on the training split.
# Materialise features and target with ONE dask.compute call so the shared
# upstream graph (CSV read + split) is executed once rather than once per
# separate .compute() call.
X_train_pd, y_train_pd = dask.compute(X_train, y_train)
lr = Lasso()
with joblib.parallel_backend('dask'):
    # No `scoring` is given, so test_score is the estimator's default score
    # (R^2 for scikit-learn regressors).
    scores = cross_validate(lr, X_train_pd, y_train_pd, cv=4)
scores

{'fit_time': array([0.12563133, 0.08477569, 0.07480049, 0.10372233]),
 'score_time': array([0.00498676, 0.00897431, 0.00797772, 0.0069809 ]),
 'test_score': array([-1.00623182e-04, -9.15327357e-04,  5.81852836e-05, -8.30178175e-05])}

In [7]:
# 4-fold cross-validation of Ridge (default hyper-parameters) on the training split.
# Materialise features and target with ONE dask.compute call so the shared
# upstream graph (CSV read + split) is executed once rather than once per
# separate .compute() call.
X_train_pd, y_train_pd = dask.compute(X_train, y_train)
lr = Ridge()
with joblib.parallel_backend('dask'):
    # No `scoring` is given, so test_score is the estimator's default score
    # (R^2 for scikit-learn regressors).
    scores = cross_validate(lr, X_train_pd, y_train_pd, cv=4)
scores

{'fit_time': array([0.08380485, 0.08319092, 0.10970616, 0.10771108]),
 'score_time': array([0.00498748, 0.00598335, 0.00698233, 0.00498724]),
 'test_score': array([0.37546822, 0.47790996, 0.60539799, 0.4306239 ])}

In [8]:
# 4-fold cross-validation of ElasticNet (default hyper-parameters) on the training split.
# Materialise features and target with ONE dask.compute call so the shared
# upstream graph (CSV read + split) is executed once rather than once per
# separate .compute() call.
X_train_pd, y_train_pd = dask.compute(X_train, y_train)
lr = ElasticNet()
with joblib.parallel_backend('dask'):
    # No `scoring` is given, so test_score is the estimator's default score
    # (R^2 for scikit-learn regressors).
    scores = cross_validate(lr, X_train_pd, y_train_pd, cv=4)
scores

{'fit_time': array([0.08038926, 0.14065266, 0.11370063, 0.13666296]),
 'score_time': array([0.01495957, 0.00748801, 0.00797868, 0.00649977]),
 'test_score': array([-8.44547699e-05, -9.25715833e-04,  5.87758425e-05, -1.00867844e-04])}

In [9]:
# 4-fold cross-validation of L2-penalised logistic regression (lbfgs solver,
# up to 2000 iterations) on the training split.
# Materialise features and target with ONE dask.compute call so the shared
# upstream graph (CSV read + split) is executed once rather than once per
# separate .compute() call.
X_train_pd, y_train_pd = dask.compute(X_train, y_train)
lr = LogisticRegression(solver='lbfgs', penalty='l2', max_iter=2000)
with joblib.parallel_backend('dask'):
    # No `scoring` is given, so test_score is the classifier's default score
    # (mean accuracy).
    # NOTE(review): roc_auc_score is imported but not used here — if 'Class'
    # is imbalanced, accuracy may be uninformative; confirm the intended metric.
    scores = cross_validate(lr, X_train_pd, y_train_pd, cv=4)
scores

{'fit_time': array([18.4008255 , 42.57190895, 10.48074102, 21.53142738]),
 'score_time': array([0.00997281, 0.00698066, 0.01396275, 0.02293992]),
 'test_score': array([0.98668675, 0.99894894, 0.99898397, 0.99882631])}