## TASK 5

## IMPORT THE NECESSARY LIBRARIES 

In [15]:
import dask.dataframe as dd
from sklearn.ensemble import RandomForestClassifier
from dask_ml.metrics import accuracy_score
from dask_ml.preprocessing import LabelEncoder
import dask.array as da
from dask_ml.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from dask_ml.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd

## LOAD THE DATASET DOWNLOADED FROM KAGGLE

In [2]:
df = dd.read_csv("creditcard.csv", dtype={'Time': 'float64'})

In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## DATA PREPROCESSING (HANDLING MISSING VALUES OR SHAPE DATA FOR MODEL)

In [4]:
row_count = len(df)
row_count

284807

In [5]:
sc = StandardScaler()
df['NormalizedAmount'] = df['Amount'].map_partitions(lambda part: sc.fit_transform(part.values.reshape(-1,1)).ravel())
df = df.drop('Amount', axis=1)

In [6]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,NormalizedAmount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,0.239204
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.357028
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,1.168633
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,0.133211
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,-0.083929


## SPLITTING THE DATA INTO TRAIN AND TEST ALSO USING OVERSAMPLING BY SMOTE

In [7]:
X = df.drop('Class', axis=1).compute()
Y = df['Class'].compute()

In [8]:
smote = SMOTE(random_state=0)

In [9]:
X_resampled, y_resampled = smote.fit_resample(X, Y)

## CHANGE BACK INTO DASK DATAFRAME AS I AM USING THAT INSTEAD OF PANDAS

In [10]:
df_resampled = dd.from_pandas(pd.concat([X_resampled, y_resampled], axis=1), npartitions=10)

In [11]:
X = df_resampled.drop('Class', axis=1)
Y = df_resampled['Class']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)



## TRAIN THE MODEL

In [13]:
classifier = LogisticRegression()
classifier.fit(X_train.values.compute(), y_train.values.compute())

In [17]:
y_pred = classifier.predict(X_test.values.compute())

In [18]:
y_pred

array([False, False, False, ...,  True,  True,  True])

## ACCURACY OF MODEL

In [19]:
print("Model Accuracy: ", accuracy_score(y_test.values.compute(), y_pred)*100)

Model Accuracy:  97.96160407562337


## F1 SCORE

In [20]:
print(f1_score(y_test.values.compute(), y_pred)*100)

97.93797982305279
