In [7]:
# Step 1: Import Libraries and Load the Dataset
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [8]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,label
0,29,1,4.745402,paypal,28.204861,0
1,725,1,4.742303,storecredit,0.0,0
2,845,1,4.921318,creditcard,0.0,0
3,503,1,4.886641,creditcard,0.0,0
4,2000,1,5.040929,creditcard,0.0,0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39221 entries, 0 to 39220
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   accountAgeDays        39221 non-null  int64  
 1   numItems              39221 non-null  int64  
 2   localTime             39221 non-null  float64
 3   paymentMethod         39221 non-null  object 
 4   paymentMethodAgeDays  39221 non-null  float64
 5   label                 39221 non-null  int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 1.8+ MB


In [10]:
#Detect missing values or NAN for an array-like object.
df.isnull().sum()

accountAgeDays          0
numItems                0
localTime               0
paymentMethod           0
paymentMethodAgeDays    0
label                   0
dtype: int64

In [11]:
df.describe(include='all')

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,label
count,39221.0,39221.0,39221.0,39221,39221.0,39221.0
unique,,,,3,,
top,,,,creditcard,,
freq,,,,28004,,
mean,857.563984,1.084751,4.748232,,122.641326,0.014278
std,804.788212,0.566899,0.38936,,283.569177,0.118636
min,1.0,1.0,0.421214,,0.0,0.0
25%,72.0,1.0,4.742303,,0.0,0.0
50%,603.0,1.0,4.886641,,0.0125,0.0
75%,1804.0,1.0,4.962055,,87.510417,0.0


In [12]:
# Step 3: Encode Categorical Variables
from sklearn.preprocessing import LabelEncoder

#transforming string from payment method into float values
label_encoder = LabelEncoder()
df['paymentMethod'] = label_encoder.fit_transform(df['paymentMethod'])
df

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,label
0,29,1,4.745402,1,28.204861,0
1,725,1,4.742303,2,0.000000,0
2,845,1,4.921318,0,0.000000,0
3,503,1,4.886641,0,0.000000,0
4,2000,1,5.040929,0,0.000000,0
...,...,...,...,...,...,...
39216,986,1,4.836982,0,0.000000,0
39217,1647,1,4.876771,0,377.930556,0
39218,1591,1,4.742303,0,0.000000,0
39219,237,1,4.921318,0,236.082639,0


In [13]:
from sklearn.model_selection import train_test_split

# Split data up into train and test sets
X = df.drop("label", axis=1)
y = df['label']

In [14]:
#The test_size parameter is set to 0.33, which means that 33% of the data is reserved for testing,and the remaining 68% is used
#for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=17)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix


logestic_regression = LogisticRegression()
clf = logestic_regression.fit(X_train, y_train)

# Make predictions on test set
y_pred = clf.predict(X_test)
print(accuracy_score(y_pred, y_test))

1.0


In [17]:
# Compare test set predictions with ground truth labels

# Evalution
accuracy_linearR = accuracy_score(y_test, y_pred)
confusion_matrix_linearR = confusion_matrix(y_test, y_pred)

print("Accuracy : ", accuracy_linearR)
print("Confusion Matrix...\n", confusion_matrix_linearR)

Accuracy :  1.0
Confusion Matrix...
 [[12753     0]
 [    0   190]]


In [18]:
# 2.Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

y_pred_dt = decision_tree.predict(X_test)

accuracy_dt = accuracy_score(y_test, y_pred_dt)
confusion_matrix_dt = confusion_matrix(y_test, y_pred_dt)

print("Accuracy : ", accuracy_dt)
print("Confusion Matrix...\n", confusion_matrix_dt)

Accuracy :  1.0
Confusion Matrix...
 [[12753     0]
 [    0   190]]


In [19]:
# 3.Random Forest

from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

y_pred_rf = random_forest.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
confusion_matrix_rf = confusion_matrix(y_test, y_pred_rf)

print("Accuracy : ", accuracy_rf)
print("Confusion Matrix...\n", confusion_matrix_rf)

Accuracy :  1.0
Confusion Matrix...
 [[12753     0]
 [    0   190]]
