
Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Importing Dataset

In [2]:
fake_currency = pd.read_csv('/content/data_banknote_authentication.csv')

In [3]:
fake_currency.shape

(1371, 5)

In [7]:
fake_currency.columns = ['var', 'skew', 'curt', 'entr', 'auth']
print(fake_currency.head())

       var    skew    curt     entr  auth
0  4.54590  8.1674 -2.4586 -1.46210     0
1  3.86600 -2.6383  1.9242  0.10645     0
2  3.45660  9.5228 -4.0112 -3.59440     0
3  0.32924 -4.4552  4.5718 -0.98880     0
4  4.36840  9.6718 -3.9606 -3.16250     0


In [8]:
fake_currency.tail()

Unnamed: 0,var,skew,curt,entr,auth
1366,0.40614,1.3492,-1.4501,-0.55949,1
1367,-1.3887,-4.8773,6.4774,0.34179,1
1368,-3.7503,-13.4586,17.5932,-2.7771,1
1369,-3.5637,-8.3827,12.393,-1.2823,1
1370,-2.5419,-0.65804,2.6842,1.1952,1


In [9]:
# dataset informations
fake_currency.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371 entries, 0 to 1370
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   var     1371 non-null   float64
 1   skew    1371 non-null   float64
 2   curt    1371 non-null   float64
 3   entr    1371 non-null   float64
 4   auth    1371 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 53.7 KB


In [10]:
fake_currency.describe()

Unnamed: 0,var,skew,curt,entr,auth
count,1371.0,1371.0,1371.0,1371.0,1371.0
mean,0.43141,1.917434,1.400694,-1.1922,0.444931
std,2.842494,5.868359,4.310105,2.101683,0.497139
min,-7.0421,-13.7731,-5.2861,-8.5482,0.0
25%,-1.7747,-1.7113,-1.55335,-2.417,0.0
50%,0.49571,2.3134,0.61663,-0.58665,0.0
75%,2.81465,6.8131,3.1816,0.39481,1.0
max,6.8248,12.9516,17.9274,2.4495,1.0


In [11]:
# checking the number of missing values in each column
fake_currency.isnull().sum()

var     0
skew    0
curt    0
entr    0
auth    0
dtype: int64

In [12]:
# distribution of legit  & fraudulent currency
fake_currency['auth'].value_counts()

0    761
1    610
Name: auth, dtype: int64

The dataset is fairly balanced

Legit Currency(0) -> 761

Fake Currency(1) -> 610

but for the binary classification task, we need to balance it perfectly.


In [26]:
# Seperating data
target_count = fake_currency.auth.value_counts()


In [27]:
data_to_remove = target_count[0] - target_count[1]
fake_currency = fake_currency.sample(frac=1, random_state=42).sort_values(by='auth')
fake_currency = fake_currency[data_to_remove:]
fake_currency['auth'].value_counts()


0    610
1    610
Name: auth, dtype: int64

Splitting the data into Feaures and Tragets

In [28]:
X = fake_currency.drop(columns='auth', axis= 1)
Y = fake_currency['auth']

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

Modeling Data

In [31]:
model = LogisticRegression()

In [33]:
model.fit(X_train, Y_train)

In [39]:
#accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)
print(f'\n Accuracy on training data : {round(training_data_accuracy*100,2)}%')


 Accuracy on training data : 98.95%


In [42]:
#accuracy on testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction,Y_test)
print(f'\n Accuracy on test data : {round(testing_data_accuracy*100,2)}%')


 Accuracy on test data : 99.18%
