In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report,confusion_matrix

In [2]:
# Load the pre-processed records from disk.
df = pd.read_json("../data/adjacent_change_processed/output_file.json")

# Shape of the raw frame, i.e. before rows with missing values are removed.
print(df.shape)

# Discard every row that has at least one missing value.
df = df.dropna()

# Preview the cleaned frame. This must be the last expression of the cell:
# a bare `df.head()` in the middle of a cell is evaluated and thrown away,
# so nothing is displayed.
df.head()

(59756, 20)


In [3]:
import pandas as pd

# Check class distribution
print("Class distribution before balancing:")
class_counts = df['user_class'].value_counts()
print(class_counts)

# Every class is capped at the size of the rarest class.
minority_size = class_counts.min()

# Downsample class by class rather than selecting one "majority" and one
# "minority" label through idxmax()/idxmin(). When the counts are tied (which
# is exactly the state of `df` after this cell has run once) both calls return
# the same label, so the previous logic kept that class twice and silently
# dropped the other one. Looping over all labels also keeps any additional
# classes instead of discarding them.
# value_counts() is ordered from most to least frequent, so for two unequal
# classes the parts are [downsampled majority, minority], the same order as
# before.
balanced_parts = []
for class_label in class_counts.index:
    class_rows = df[df['user_class'] == class_label]
    if len(class_rows) > minority_size:
        # Fixed seed keeps the drawn subset reproducible across runs.
        class_rows = class_rows.sample(n=minority_size, random_state=42)
    balanced_parts.append(class_rows)

# Combine the per-class pieces into one frame
df_balanced = pd.concat(balanced_parts)

# Shuffle the dataset to mix classes and rebuild a clean 0..n-1 index
df = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the new class distribution
print("Class distribution after balancing:")
print(df['user_class'].value_counts())

Class distribution before balancing:
user_class
bot      32042
human    27703
Name: count, dtype: int64
Class distribution after balancing:
user_class
human    27703
bot      27703
Name: count, dtype: int64


In [4]:
# Columns kept for modelling: the target first, then the numeric features.
# "tweet_count" is currently left out of the selection.
columns_to_select = [
    # target
    "user_class",
    # average change features
    "change_content_syntactic",
    "change_action",
    "change_change_dynamic_score",
    # content-syntactic change statistics
    "highest_change_in_content_syntactic",
    "lowest_change_in_content_syntactic",
    "standard_deviation_of_content_syntactic",
    # action change statistics
    "highest_change_in_action",
    "lowest_change_in_action",
    "standard_deviation_of_action",
    # diversity features
    "diversity_action",
    "diversity_content_syntactic",
    "diversity_change_dynamics_score",
]

# Keep all rows, restricted to the columns listed above (in that order).
df_subset = df.loc[:, columns_to_select]
df_subset.head()

Unnamed: 0,user_class,change_content_syntactic,change_action,change_change_dynamic_score,highest_change_in_content_syntactic,lowest_change_in_content_syntactic,standard_deviation_of_content_syntactic,highest_change_in_action,lowest_change_in_action,standard_deviation_of_action,diversity_action,diversity_content_syntactic,diversity_change_dynamics_score
0,human,0.735221,0.413599,0.474947,1.0,0.0,0.416731,1.0,2.220446e-16,0.307163,0.814237,0.748757,1.233153
1,human,0.512662,0.335346,0.984078,1.0,0.0,0.415535,1.0,2.220446e-16,0.273812,0.688705,0.671567,0.185531
2,bot,0.701685,0.308534,0.83501,1.0,0.16795,0.331267,0.641431,0.0741799,0.188902,0.852464,0.895096,2.118592
3,bot,0.68785,0.591693,0.62759,1.0,0.292893,0.292918,1.0,0.09546597,0.308661,0.799555,0.847495,1.502658
4,bot,0.826534,0.438563,0.627147,1.0,0.0,0.365723,1.0,0.0,0.331568,0.811741,0.763221,1.269834


In [5]:
# Separate the target column from the predictors.
labels = df_subset['user_class']             # target: class label per row
X = df_subset.drop('user_class', axis=1)     # features: every remaining column

In [6]:
# Convert target labels to numeric
label_mapping = {'bot': 0, 'human': 1}

# Map from the untouched string column instead of from `labels` itself.
# `labels = labels.map(...)` is self-referential: running the cell a second
# time would look up the already numeric labels in the mapping and turn every
# one of them into NaN.
labels = df_subset['user_class'].map(label_mapping)

# Series.map yields NaN for values missing from the mapping; fail loudly here
# rather than passing NaN targets on to the classifier.
unmapped = labels.isna()
if unmapped.any():
    unknown = sorted(df_subset.loc[unmapped, 'user_class'].astype(str).unique())
    raise ValueError(f"Unmapped user_class values: {unknown}")

# With no NaN present the labels are plain integers.
labels = labels.astype('int64')

In [7]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.3,
    random_state=42,
)


In [8]:
# Standardize the features. The scaler is fitted on the training split only,
# then the fitted scaler is applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [9]:
import numpy as np

# Shapes of the scaled training features and of the training targets.
print(X_train_scaled.shape)
print(y_train.shape)

# Total number of NaN entries in the scaled training features.
nan_count = np.isnan(X_train_scaled).sum()
print(nan_count)



(38784, 12)
(38784,)
0


In [10]:
import pandas as pd
import numpy as np

# Wrap the scaled training array in a DataFrame, reusing the feature names
# from X so the columns stay readable.
X_train_df = pd.DataFrame(data=X_train_scaled, columns=X.columns)

# Boolean mask: True for each row that contains at least one NaN.
has_nan = X_train_df.isna().any(axis=1)

# Keep only the affected rows and print them (an empty frame means none).
nan_rows = X_train_df[has_nan]
print(nan_rows)


Empty DataFrame
Columns: [change_content_syntactic, change_action, change_change_dynamic_score, highest_change_in_content_syntactic, lowest_change_in_content_syntactic, standard_deviation_of_content_syntactic, highest_change_in_action, lowest_change_in_action, standard_deviation_of_action, diversity_action, diversity_content_syntactic, diversity_change_dynamics_score]
Index: []


In [11]:
# Build a k-nearest-neighbours classifier with k = 5 (tune `n_neighbors` as
# needed) and fit it on the scaled training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train_scaled, y=y_train)

In [12]:
# Predict a class label for every row of the scaled test split.
y_pred = knn.predict(X=X_test_scaled)

In [13]:
# Overall fraction of correctly predicted test labels.
accuracy = accuracy_score(y_test, y_pred)

# Precision, recall and F1 are all computed with average='weighted', in that
# order, by applying each scorer to the same (y_test, y_pred) pair.
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

In [14]:
# Headline metrics, one per line, rounded to four decimals.
print(
    "Model Evaluation Metrics:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

# Confusion matrix of true vs. predicted test labels.
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Per-class precision / recall / F1 table.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Model Evaluation Metrics:
Accuracy: 0.8592
Precision: 0.8594
Recall: 0.8592
F1 Score: 0.8591

Confusion Matrix:
[[6994 1275]
 [1066 7287]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      8269
           1       0.85      0.87      0.86      8353

    accuracy                           0.86     16622
   macro avg       0.86      0.86      0.86     16622
weighted avg       0.86      0.86      0.86     16622

