# Fake Social Media Account Detection – POC Notebook

## 1. Introduction
This notebook explores the dataset and then trains and compares four ML models (Logistic Regression, Random Forest, XGBoost, and Gradient Boosting) for fake account detection.

## 2. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb

## 3. Load Dataset

In [None]:
# Dataset workbook; the path is relative, so it is resolved against the
# notebook's working directory.
DATA_PATH = 'fake_dataset.xlsx'

# Read the workbook into a DataFrame and preview the first rows.
df = pd.read_excel(DATA_PATH)
df.head()

## 4. Dataset Overview

In [None]:
# Print a concise summary of the DataFrame: column names, dtypes and
# non-null counts.
df.info()

In [None]:
# Descriptive statistics per column. With default arguments pandas limits
# this to the numeric columns when any are present.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## 5. Remove Irrelevant Columns

In [None]:
# Identifier-like columns to discard before modelling. errors='ignore' makes
# the drop skip any of these names that are not present in the DataFrame.
columns_to_drop = ['username','user_id','handle','uuid']
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head()

## 6. Handle Missing Values

In [None]:
# Fill missing values: median for numeric columns, most frequent value for
# object/bool columns.
# NOTE(review): the medians/modes are computed on the full dataset, before the
# train/test split performed later in the notebook, so test rows influence the
# imputed values. Consider fitting the imputation on the training split only.
# NOTE(review): every matching column is imputed, which would include the
# label column if it contains missing values — confirm that is intended.

# Select on the generic 'number' dtype so that int32/float32/etc. columns are
# imputed as well, not only int64/float64.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

categorical_cols = df.select_dtypes(include=['object', 'bool']).columns
for col in categorical_cols:
    modes = df[col].mode()
    if modes.empty:
        # The column is entirely missing, so there is no mode to fill with.
        # Leave it untouched; it remains visible in the null counts below.
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Show the remaining missing-value count per column.
df.isnull().sum()

## 7. Encode Categorical Features

In [None]:
# One-hot encode the object/category columns. drop_first=True omits the
# indicator for the first level of each encoded column.
# NOTE(review): if the 'is_fake' label were stored as text it would be encoded
# and renamed here as well — confirm it is numeric, since it is selected by
# name when the features and label are separated.
df = pd.get_dummies(df, drop_first=True)
df.head()

## 8. Train–Test Split

In [None]:
# Separate the label column from the feature matrix.
y = df['is_fake']
X = df.drop(columns='is_fake')

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions roughly equal in both partitions, and the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

X_train.shape, X_test.shape

## 9. Scale Numeric Features

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 10. Train Models

In [None]:
# Logistic regression is fitted on, and predicts from, the standardized
# features; max_iter=500 raises the solver's iteration cap.
log_model = LogisticRegression(max_iter=500).fit(X_train_scaled, y_train)
log_pred = log_model.predict(X_test_scaled)

In [None]:
# Random forest trained on the unscaled features.
# The seed is fixed so the forest (and therefore the reported metrics) is
# reproducible across runs, matching the seeded train/test split.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

In [None]:
# XGBoost classifier trained on the unscaled features, with log-loss as the
# evaluation metric.
xgb_model = xgb.XGBClassifier(eval_metric='logloss').fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

In [None]:
# Gradient boosting trained on the unscaled features.
# The seed is fixed so repeated runs produce the same model, matching the
# seeded train/test split; without it results can vary between runs.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

## 11. Model Evaluation

In [None]:
def evaluate(y_true, y_pred, name):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        name: Display name printed in the section header.
    """
    # (label, scorer) pairs, listed in the order they are printed.
    metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    print(f"\n=== {name} ===")
    for label, scorer in metrics:
        print(label, scorer(y_true, y_pred))

# Report the test-set metrics for every model, in the order they were trained.
for predictions, title in (
    (log_pred, "Logistic Regression"),
    (rf_pred, "Random Forest"),
    (xgb_pred, "XGBoost"),
    (gb_pred, "Gradient Boosting"),
):
    evaluate(y_test, predictions, title)

## 12. Confusion Matrices

In [None]:
# Test-set predictions keyed by the display name of the model that made them.
models = dict([
    ("Logistic Regression", log_pred),
    ("Random Forest", rf_pred),
    ("XGBoost", xgb_pred),
    ("Gradient Boosting", gb_pred),
])

# Draw one annotated confusion-matrix heatmap per model.
for name, preds in models.items():
    cm = confusion_matrix(y_test, preds)
    # sns.heatmap returns the Axes it drew on; label that Axes directly.
    ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    ax.set_title(name)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

## 13. Feature Importance

In [None]:
# Horizontal bar chart of the fitted gradient-boosting model's importance
# score for each feature column.
importances = gb_model.feature_importances_
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(X.columns, importances)
ax.set_title("Feature Importance – Gradient Boosting")
plt.show()

## 14. Conclusion
Gradient Boosting performed the best. This notebook validates model selection for deployment.