In [None]:
#Importing all of the needed packages and functions
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [None]:
# Load the fake-bills CSV; the file is semicolon-delimited.
DATA_PATH = "D:/ML/Fake_Bill_dataset/fake_bills.csv"
df = pd.read_csv(DATA_PATH, sep=";")

In [None]:
# Preview the first rows of the DataFrame.
# `head` has to be called: a bare `df.head` only evaluates to the bound method
# object, so the cell would show the method repr instead of the first rows.
df.head()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Descriptive statistics of the DataFrame's columns (default describe() settings)
df.describe()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Missing values in the margin_low column before imputing
df.loc[:, "margin_low"].isna().sum()

In [None]:
# Mean imputation for the null values in margin_low.
# SimpleImputer is already imported in the notebook's top import cell, so it is
# not re-imported here.
imputer = SimpleImputer(strategy="mean")
# Preview only: the transformed array is not stored, so `df` is left unchanged
# by this cell. Show just the first rows instead of the whole array.
imputer.fit_transform(df[["margin_low"]])[:5]

In [None]:
# Write the imputed values back into margin_low (using the `imputer` created in
# an earlier cell), then count the nulls that remain in the column.
df["margin_low"] = imputer.fit_transform(df.loc[:, ["margin_low"]])
df["margin_low"].isna().sum()

In [None]:
# Encode the is_genuine target column as integer class labels
label_encoder = LabelEncoder()
label_encoder.fit(df['is_genuine'])
df['is_genuine'] = label_encoder.transform(df['is_genuine'])

In [None]:
# Annotated heatmap of the pairwise correlations between the DataFrame's columns
corr = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(
    data=corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": 0.8},
)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Pairplot to visualize relationships between features, coloured by class.
# sns.pairplot is a figure-level function that returns a PairGrid; plt.title
# would only title the currently active subplot, so the title is set on the
# whole figure instead (y > 1 keeps it clear of the top row of panels).
pair_grid = sns.pairplot(df, hue="is_genuine")
pair_grid.figure.suptitle('Pairplot', y=1.02)
plt.show()

In [None]:
# Separate the dependent column (is_genuine) from the independent columns
y = df['is_genuine']
X = df.drop('is_genuine', axis=1)

In [None]:
# Split the data 70/30 into training and test subsets.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (the n_neighbors values hard-coded further down are read off plots that
# depend on it); stratify=y keeps the class proportions equal in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only and
# then applied to both the training and the test split.
scaler = StandardScaler().fit(X_train)
x_train_tr = scaler.transform(X_train)
x_test_tr = scaler.transform(X_test)

In [None]:
# Baseline on the unscaled features: 1-nearest-neighbour classifier
knn = KNeighborsClassifier(n_neighbors=1)
ypred = knn.fit(X_train, y_train).predict(X_test)

In [None]:
# Refit the existing `knn` estimator on the standardised features and predict
ypred_scaled = knn.fit(x_train_tr, y_train).predict(x_test_tr)

In [None]:
# Metrics for the predictions made without scaling
print(
    confusion_matrix(y_test, ypred),
    classification_report(y_test, ypred),
    sep="\n",
)
print("Accuracy:", accuracy_score(y_test, ypred))

In [None]:
# Metrics for the predictions made after scaling
print(
    confusion_matrix(y_test, ypred_scaled),
    classification_report(y_test, ypred_scaled),
    sep="\n",
)
print("Accuracy:", accuracy_score(y_test, ypred_scaled))

In [None]:
# Compare n_neighbors = 1..20 on the unscaled data by test-split error rate.
# NOTE(review): k is compared on the held-out test split, so the value picked
# from this plot is tuned to that split; consider cross-validating on the
# training data instead.
k_values = range(1, 21)
error_rate = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # fraction of test samples whose prediction differs from the true label
    error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(12,6))
# markersize is numeric; the plot gets a title and axis labels so it can be
# read on its own when the notebook is skimmed.
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error rate vs. n_neighbors (unscaled data)')
plt.xlabel('n_neighbors')
plt.ylabel('Error rate on test split')
plt.xticks(list(k_values))
plt.show()

In [None]:

# Refit on the unscaled data with n_neighbors=8 and predict the test split
knn = KNeighborsClassifier(n_neighbors=8)
ypred = knn.fit(X_train, y_train).predict(X_test)

In [None]:
# Mean of the 10-fold cross-validation scores on the unscaled training data
scores = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=10)
mean_accuracy = scores.mean()
mean_accuracy

In [None]:
# Compare n_neighbors = 1..20 on the scaled data by test-split error rate.
# NOTE(review): k is compared on the held-out test split, so the value picked
# from this plot is tuned to that split; consider cross-validating on the
# training data instead.
k_values = range(1, 21)
error_rate = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train_tr, y_train)
    pred_i = knn.predict(x_test_tr)
    # fraction of test samples whose prediction differs from the true label
    error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(12,6))
# markersize is numeric; the plot gets a title and axis labels so it can be
# read on its own when the notebook is skimmed.
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error rate vs. n_neighbors (scaled data)')
plt.xlabel('n_neighbors')
plt.ylabel('Error rate on test split')
plt.xticks(list(k_values))
plt.show()

In [None]:
# Refit on the standardised data with n_neighbors=4 and predict the test split
knn = KNeighborsClassifier(n_neighbors=4)
ypred_scaled = knn.fit(x_train_tr, y_train).predict(x_test_tr)

In [None]:
# Mean of the 10-fold cross-validation scores on the scaled training data
scores = cross_val_score(estimator=knn, X=x_train_tr, y=y_train, cv=10)
mean_accuracy = scores.mean()
mean_accuracy