In [307]:
# packages
import pandas as pd
import importlib
import mod02_build_bot_predictor
importlib.reload(mod02_build_bot_predictor)
from mod02_build_bot_predictor import train_model

### Define a function to extract predictions from the model

In [308]:
def predict_bot(df, model=None):
    """
    Classify every row of ``df`` as bot (1) or human (0).

    Parameters
    ----------
    df : feature table passed unchanged to ``model.predict``; its ``.index``
        is reused to label the result.
    model : object exposing a ``predict`` method. When omitted, a model is
        obtained by calling ``train_model()``.

    Returns
    -------
    pandas.Series
        One prediction per row, labelled with the index of ``df``.
    """
    # NOTE(review): train_model() is called with no arguments here, while the
    # notebook elsewhere calls train_model(X_train, y_train) -- confirm that
    # a no-argument call is supported.
    fitted = train_model() if model is None else model
    return pd.Series(fitted.predict(df), index=df.index)

### Define a function to evaluate model error

In [309]:
def confusion_matrix_and_metrics(y_true, y_pred):
    """
    Computes confusion matrix and common error rates for binary classification.

    Labels are paired by position (iteration order), not by pandas index.

    Assumes labels:
      0 = negative class
      1 = positive class

    Args:
      y_true: iterable of actual labels.
      y_pred: iterable of predicted labels; must have the same length as y_true.

    Returns:
      dict with:
        tp, tn, fp, fn
        misclassification_rate   (fp + fn) / total
        false_positive_rate      fp / (fp + tn)
        false_negative_rate      fn / (fn + tp)
      A rate whose denominator is zero is reported as 0.0.

    Raises:
      ValueError: if any label is not 0 or 1, or if y_true and y_pred
        have different lengths.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    from itertools import zip_longest

    # Unique filler: a plain zip() would silently drop the unmatched tail of
    # the longer input and report metrics for only part of the data.
    missing = object()

    tn = fp = fn = tp = 0

    for yt, yp in zip_longest(y_true, y_pred, fillvalue=missing):
        if yt is missing or yp is missing:
            raise ValueError("y_true and y_pred must have the same length")

        if yt == 0 and yp == 0:
            tn += 1
        elif yt == 0 and yp == 1:
            fp += 1
        elif yt == 1 and yp == 0:
            fn += 1
        elif yt == 1 and yp == 1:
            tp += 1
        else:
            raise ValueError("Labels must be 0 or 1")

    total = tn + fp + fn + tp

    # Guard every denominator so empty or single-class inputs yield 0.0
    # instead of raising ZeroDivisionError.
    misclassification_rate = (fp + fn) / total if total > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "misclassification_rate": misclassification_rate,
        "false_positive_rate": false_positive_rate,
        "false_negative_rate": false_negative_rate,
    }


### Load the data

In [310]:
# Locations of the two CSV splits, relative to the notebook's working directory.
TRAIN_PATH = "mod02_data/train.csv"
TEST_PATH = "mod02_data/test.csv"

# Read each split into its own DataFrame.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

### Format the data by independent vs. dependent variables

In [311]:
# Training split: "is_bot" is the label, every other column is a feature.
y_train = train["is_bot"]
X_train = train.drop(columns=["is_bot"])

# Test split, separated the same way.
y_test = test["is_bot"]
X_test = test.drop(columns=["is_bot"])

### Build the model on training data

In [312]:
# Fit using the training split only, so the test split stays unseen by the model.
model = train_model(X_train, y_train)

### Get the model predictions on training and test data

In [313]:
# Score both splits with the same fitted model so their error rates can be compared.
y_pred_train = predict_bot(X_train, model)
y_pred_test = predict_bot(X_test, model)

### Check results on the training set (data used to build the model)

In [314]:
# Error on the training split (the data the model was fitted on).
confusion_matrix_and_metrics(y_train, y_pred_train)

{'tp': 117,
 'tn': 2594,
 'fp': 43,
 'fn': 246,
 'misclassification_rate': 0.09633333333333334,
 'false_positive_rate': 0.016306408797876374,
 'false_negative_rate': 0.6776859504132231}

### Check results on the test set (new data not yet seen by the model)

In [315]:
# Error on the held-out test split.
confusion_matrix_and_metrics(y_test, y_pred_test)

{'tp': 35,
 'tn': 854,
 'fp': 20,
 'fn': 91,
 'misclassification_rate': 0.111,
 'false_positive_rate': 0.02288329519450801,
 'false_negative_rate': 0.7222222222222222}

# Discussion Questions

### Based on the misclassification rate of your model, discuss your confidence in the ability to predict a bot. 

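Based solely on the misclassification rate, my confidence in predicting a bot would be fairly good: the model misclassifies only about 11% of test accounts, which suggests that bots on the platform can be detected semi-reliably. This figure should be read with caution, however: only 126 of the 1,000 test accounts are bots, so a model that labelled every account as human would have a misclassification rate of 12.6%, barely worse than the model's 11.1%.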

### What are potential ramifications of false positives from the model?

Potential ramifications of false positives include real people being marked as bots and, as a result, not being taken seriously, losing followers or engagement (which could be serious for influencers), or even being suspended or banned if the platform enforces its rules strictly.

### What are potential ramifications of false negatives from the model?

In the context of this data, false negatives are bots that are marked as human. The false negative rate tells us that the model does not perform well at all if the goal is to identify bot accounts: ~72% of bots are classified as human, which means only ~28% of bots are being caught. The fact that this is possible alongside our low misclassification rate tells us that the data is heavily skewed towards human accounts, with not enough representation of bot accounts. This could have a wide array of ramifications, depending on the specific activity of each bot. It could be very harmful, for example if undetected bots promote sites that steal users' information or carry out other malicious activity. It could also simply mean that a bot that spams messages is not caught, which is more of an annoyance.