In [9]:
import pandas as pd

df=pd.read_csv("user_fake_authentic_4class.csv")

In [10]:
import numpy as np
from collections import Counter

def euclidean_distance(x1, x2):
    distance = np.sqrt(np.sum((x1-x2)**2))
    return distance

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        # Ensure X_train is a NumPy array for easier mathematical operations
        self.X_train = X.to_numpy() if isinstance(X, pd.DataFrame) else X
        self.y_train = y.to_numpy() if isinstance(y, pd.Series) else y

    def predict(self, X):
        # Ensure X is a NumPy array for easier iteration over rows
        X = X.to_numpy() if isinstance(X, pd.DataFrame) else X
        predictions = [self._predict(x) for x in X]
        return predictions

    def _predict(self, x):
        # Compute the distance between x and all examples in the training set
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Get indices of the k nearest neighbors
        k_indices = np.argsort(distances)[:self.k]
        # Get the labels of the k nearest neighbors
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # Majority vote, most common class label
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

In [11]:
df.head()

Unnamed: 0,pos,flw,flg,bl,pic,lin,cl,cz,ni,erl,erc,lt,hc,pr,fo,cs,pi,class
0,69,541,440,293,1,0,223,0.0,0.111,10.12,0.5,0.944,0.556,0.0,0.0,0.107055,334.596802,r
1,9,783,803,76,1,0,101,0.0,0.0,19.82,1.99,0.889,0.222,0.0,0.0,0.0074,2137.979248,r
2,390,1200,925,103,1,0,723,0.0,0.0,11.54,0.76,1.0,2.389,0.0,0.0,0.221954,96.244957,r
3,138,1100,1000,40,1,0,24,0.166667,0.167,20.440001,1.61,0.833,0.111,0.0,0.0,0.025055,226.622437,r
4,1,228,487,84,1,0,30,0.0,0.0,9.21,1.75,1.0,0.0,0.0,0.0,0.0,0.0,r


In [12]:
df.shape

(43307, 18)

In [13]:
df.describe()

Unnamed: 0,pos,flw,flg,bl,pic,lin,cl,cz,ni,erl,erc,lt,hc,pr,fo,cs,pi
count,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0,43307.0
mean,152.551851,827.4772,2817.92821,46.26326,0.932644,0.142448,120.867897,0.291397,0.184412,20.404906,1.117321,0.16909,0.449444,0.044044,0.064475,0.348074,442.005725
std,701.72004,12503.88,2806.099606,60.603408,0.250641,0.349513,212.129187,0.356736,0.258915,146.742244,6.802049,0.280428,1.201003,0.266913,0.624448,0.37686,875.626043
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,107.0,483.0,0.0,1.0,0.0,2.0,0.0,0.0,1.95,0.05,0.0,0.0,0.0,0.0,0.039165,9.990695
50%,22.0,289.0,1400.0,13.0,1.0,0.0,30.0,0.111111,0.056,7.85,0.32,0.0,0.0,0.0,0.0,0.166252,146.030853
75%,102.0,712.5,6100.0,85.0,1.0,0.0,133.0,0.529412,0.278,17.99,0.95,0.231,0.444,0.0,0.0,0.654545,517.563293
max,76200.0,1900000.0,7800.0,555.0,1.0,1.0,3274.0,1.0,1.0,26650.0,1009.090027,1.0,30.0,20.0,58.0,1.0,26786.13477


In [15]:
df.isnull().sum()

pos      0
flw      0
flg      0
bl       0
pic      0
lin      0
cl       0
cz       0
ni       0
erl      0
erc      0
lt       0
hc       0
pr       0
fo       0
cs       0
pi       0
class    0
dtype: int64

In [16]:
from sklearn.preprocessing import LabelEncoder

In [19]:
encoder=LabelEncoder()
df["class"]=encoder.fit_transform(df["class"])

In [20]:
df.head()

Unnamed: 0,pos,flw,flg,bl,pic,lin,cl,cz,ni,erl,erc,lt,hc,pr,fo,cs,pi,class
0,69,541,440,293,1,0,223,0.0,0.111,10.12,0.5,0.944,0.556,0.0,0.0,0.107055,334.596802,2
1,9,783,803,76,1,0,101,0.0,0.0,19.82,1.99,0.889,0.222,0.0,0.0,0.0074,2137.979248,2
2,390,1200,925,103,1,0,723,0.0,0.0,11.54,0.76,1.0,2.389,0.0,0.0,0.221954,96.244957,2
3,138,1100,1000,40,1,0,24,0.166667,0.167,20.440001,1.61,0.833,0.111,0.0,0.0,0.025055,226.622437,2
4,1,228,487,84,1,0,30,0.0,0.0,9.21,1.75,1.0,0.0,0.0,0.0,0.0,0.0,2


In [21]:
X=df[df.columns[:-1]].values
y=df[df.columns[-1]].values

#### NORMALIZATION

In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
data = np.hstack((X, np.reshape(y, (-1, 1))))
transformed_df = pd.DataFrame(data, columns=df.columns)

#### BALANCE THE DATA

In [27]:
!pip install -U imbalanced-learn

Collecting imbalanced-learn
  Using cached imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.3


In [29]:
from imblearn.over_sampling import RandomOverSampler
over = RandomOverSampler()
X, y = over.fit_resample(X, y)
data = np.hstack((X, np.reshape(y, (-1, 1))))
transformed_df = pd.DataFrame(data, columns=df.columns)

In [30]:
len(transformed_df[transformed_df["class"]==0]), len(transformed_df[transformed_df["class"]==1]),len(transformed_df[transformed_df["class"]==2]), len(transformed_df[transformed_df["class"]==3])

(12054, 12054, 12054, 12054)

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=0)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)

In [32]:
model=KNN(k=5)
model.fit(X_train,y_train)

In [34]:
predictions=model.predict(X_test)

METRICS

In [35]:
# Accuracy
def calculate_accuracy(y_true, y_pred):
    correct = sum(y_true == y_pred)
    accuracy = correct / len(y_true)
    return accuracy

#Precision and Recall
def calculate_precision_recall(y_true, y_pred):
    # Convert to numpy arrays for easier manipulation
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    # True positives: Actual and predicted are 1
    TP = np.sum((y_true == 1) & (y_pred == 1))
    
    # False positives: Actual is 0, but predicted is 1
    FP = np.sum((y_true == 0) & (y_pred == 1))
    
    # False negatives: Actual is 1, but predicted is 0
    FN = np.sum((y_true == 1) & (y_pred == 0))
    
    # Calculate precision and recall
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    
    return precision, recall


In [38]:
accuracy = calculate_accuracy(y_test, predictions)
print(f"Model accuracy: {accuracy * 100:.2f}%")

Model accuracy: 70.72%


In [37]:
precision, recall = calculate_precision_recall(y_test, predictions)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Precision: 0.9308
Recall: 0.9098
