# Model testing
## 1. Setup data

In [2]:
import json

In [3]:
import os
os.getcwd()

'/home/3cho11/Documents/PyroBlocker/modelResearch'

In [4]:
# import data from ../dataset/miniData.json
with open("../miniData/miniData.json", "r") as f:
    safeData = json.load(f)
with open("../miniData/miniDataAdult.json", "r") as f:
    adultData = json.load(f)
# get inbween data-points
with open("../miniData/inbetweenSafe.json", "r") as f:
    inBetweenData = json.load(f)
# combine datasets
mainData = safeData + adultData + inBetweenData

### Get no. of data-points

In [5]:
print("Size of safeData: ", len(safeData))
print("Size of adultData: ", len(adultData))
print("Size of inBetweenData: ", len(inBetweenData))

Size of safeData:  50
Size of adultData:  50
Size of inBetweenData:  54


### Get list of features

In [6]:
# Get list of feature labels in each Data.json
featureLabels = list(safeData[0].keys())
print(featureLabels)
for feature in featureLabels:
    print(type(safeData[0][feature]))

['url', 'title', 'meta_description', 'headings', 'main_content', 'links', 'label']
<class 'str'>
<class 'str'>
<class 'str'>
<class 'dict'>
<class 'list'>
<class 'dict'>
<class 'str'>


# Setup Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Example data: X (features) and y (target)
# For demonstration, let's assume we have some numerical features and a target variable
X = np.array([[1, 2], [2, 3], [4, 5], [6, 7]])
y = np.array([1, 2, 3, 4])

# Initialize the Linear Regression model
linear_regressor = LinearRegression()

# Fit the model to the data
linear_regressor.fit(X, y)

# Print the coefficients and intercept
print("Coefficients:", linear_regressor.coef_)
print("Intercept:", linear_regressor.intercept_)

Coefficients: [0.28813559 0.28813559]
Intercept: 0.3389830508474567


# Model Setup
### Login to huggingface

In [None]:
from huggingface_hub import login

# Login to Hugging Face using your access token
login(token="your_token_here")

from transformers import pipeline

## Model 1
### bert-large-uncased-Adult-Text-Classifier

In [9]:
# Load the adult text classifier pipeline 
# https://huggingface.co/lazyghost/bert-large-uncased-Adult-Text-Classifier
rawClassifier1 = pipeline("text-classification", model="lazyghost/bert-large-uncased-Adult-Text-Classifier")

In [10]:
def transformLabel1(prediction):
    """
    Re-format labels to allow for comparison with data

    Args:
        predictions (Array (of {label, score} objects)): raw predictions

    Returns:
        Arr[Object]: prediction object with label and score
    """
    if prediction['label'] == 'Adult':
        prediction['label'] = 'adult'
    else:
        prediction['label'] = 'safe'
    return prediction

## Model 2
### valurank/finetuned-distilbert-adult-content-detection

In [11]:
# Load the finetuned-distilbert-adult-content-detection pipeline
rawClassifier2 = pipeline("text-classification", model="valurank/finetuned-distilbert-adult-content-detection")

In [22]:
sampleTest= "porn"
prediction = rawClassifier2(sampleTest)
print(prediction)

[{'label': 'LABEL_1', 'score': 0.9998865127563477}]


In [12]:
def transformLabel2(prediction):
    """
    Re-format labels to allow for comparison with data

    Args:
        predictions (Array (of {label, score} objects)): raw predictions

    Returns:
        Arr[Object]: prediction object with label and score
    """
    if prediction['label'] == 'LABEL_1':
        prediction['label'] = 'adult'
    else:
        prediction['label'] = 'safe'
    return prediction

## Model 3
### michellejieli/inappropriate_text_classifier

In [13]:
rawClassifier3 = pipeline("text-classification", model="michellejieli/inappropriate_text_classifier")

In [14]:
print(rawClassifier3("bad"))

[{'label': 'NSFW', 'score': 0.932813286781311}]


In [15]:
def transformLabel3(prediction):
    """
    Re-format labels to allow for comparison with data

    Args:
        predictions (Array (of {label, score} objects)): raw predictions

    Returns:
        Arr[Object]: prediction object with label and score
    """
    if prediction['label'] == 'NSFW':
        prediction['label'] = 'adult'
    else:
        prediction['label'] = 'safe'
    return prediction

In [16]:
print(rawClassifier3("bad"))

[{'label': 'NSFW', 'score': 0.932813286781311}]


In [17]:
models = [rawClassifier1, rawClassifier2, rawClassifier3]
transformLabel = [transformLabel1, transformLabel2, transformLabel3]

### Generic classifier that can use any model

In [18]:
def classifier(modelNum, text):
    prediction = models[modelNum](text)[0]
    prediction = transformLabel[modelNum](prediction)
    score = prediction['score']
    # make safe prediction negative
    score = prediction['score'] * (-1 if prediction['label'] == 'safe' else 1)
    return score

In [19]:
def predict(modelNum, text):
    prediction = (text, classifier(modelNum, text))
    return prediction

In [20]:
predictions = []
for data_point in adultData:
    text = data_point['title']
    predictions.append(predict(2, text))


KeyboardInterrupt: 

In [32]:
import pandas as pd

# Convert predictions to a DataFrame
predictions_df = pd.DataFrame(predictions, columns=['Text', 'Score'])

# Display the DataFrame
print(predictions_df)

                                                 Text     Score
0                                 Online Casino Sites  0.616833
1       escort agencies and independent escorts in UK -0.521568
2   Milf Fox - Best Net's Milfs in Hot Milf Porn M...  0.949091
3           Forums - Ladyboy Forum - Pattaya Ladyboys  0.911162
4   Best Gay Sex - Free young boys galleries: pret...  0.844273
5   Free Porn Tube - Sex movies, XXX, Porno videos...  0.933980
6   Adult FriendFinder - The World's Largest Adult...  0.854414
7   18Magazine.com - 18 Magazine - Hot Teen Models...  0.915747
8                                  MewSlut's OnlyFans  0.878635
9   nJAV - Watch HD JAV Online | Free & High Quali... -0.976232
10    Best Porn Toplist of Worlds Best Porn XXX Sites  0.926478
11                                                     0.507657
12  DeepthroatLove.com - Deepthroat Gagging Porn d...  0.966145
13  Hardcore Anal Porn & Gonzo XXX Videos | Evil A...  0.959794
14              SexyHub - Erotic Porn Si

In [33]:
from sklearn.metrics import accuracy_score

# Assuming predictions_df contains the true labels and predicted scores
# You need to convert the scores back to labels for comparison
predictions_df['Predicted_Label'] = predictions_df['Score'].apply(lambda x: 'safe' if x < 0 else 'adult')

# Assuming you have the true labels in a list called true_labels
true_labels = [data_point['label'] for data_point in adultData]

# Calculate the accuracy score
accuracy = accuracy_score(true_labels, predictions_df['Predicted_Label'])
print("Accuracy Score:", accuracy)

Accuracy Score: 0.82


## Classifying arrays

### Use power mean to emphasize larger values
$$
    M_p=\left(\frac{1}{n}\sum_{i=1}^nx_i^p\right)^{\frac{1}{p}}
$$

In [25]:
def signed_power_mean(arr, p):
    def power_preserve_sign(x, p):
        print(x)
        return abs(x)**p * (-1 if x < 0 else 1)

    if not arr:
        raise ValueError("Array must not be empty.")
    transformed = [power_preserve_sign(x, p) for x in arr]
    mean = sum(transformed) / len(arr)
    return abs(mean)**(1/p) * (-1 if mean < 0 else 1)

In [26]:
def powerMean(arr, p):
    """
    Compute the power mean of an array

    Args:
        arr (Array): array of numbers
        p (float): power

    Returns:
        float: power mean of the array
    """
    
    return (sum([x**p for x in arr])/len(arr))**(1/p)

In [27]:
def classifyArray(modelNum, textArray):        
    """
    Given an array of strings, get the average prediction score of the strings

    Args:
        textArray (array): array of h1 or h2 headers
    
    Returns:
        float: average prediction of all strings in the array
    """
    predictions = []
    for text in textArray:
        print("curr text: ", text)
        print(classifier(modelNum, text))
        predictions.append(classifier(modelNum, text))
    print("predictions: ", predictions)
    return signed_power_mean(predictions, 5)

In [28]:
def classifyFeature(modelNum, featureName, featureData):
    match featureName:
        case 'url':
            prediction = (featureName, classifier(modelNum, featureData))
        case 'title':
            prediction = (featureName, classifier(modelNum, featureData))
        case 'meta_description':
            prediction = (featureName, classifier(modelNum, featureData))         
        case 'headings': # headings (dist) is in form {h1: [], h2: []}
            # split h1 and h2
            prediction = [('h1', classifyArray(modelNum, featureData['h1'])), ('h2', classifyArray(modelNum, featureData['h2']))]
        case 'main_content':
            prediction = (featureName, classifyArray(modelNum, featureData))
        case 'links':
            prediction = [('internal', classifyArray(modelNum, featureData['internal'])), ('external', classifyArray(modelNum, featureData['external']))]
        case 'label':
            print(featureData)
        case _:
            print("Unknown feature")
    return prediction

### Get array of predicted values for each feature in a data-point

In [29]:
def splitByFeature(modelNum, data_point):
    predictionValues = []
    for feature in featureLabels[:-1]:  # select all but the last (label) element
        predictionValues.append(classifyFeature(modelNum, feature, data_point[feature]))
    return predictionValues

In [30]:
data_point = safeData[0]
predictionValues = splitByFeature(2, data_point)

curr text:  Lorde Shows Off New Blonde Hair While Performing at 2022 Glastonbury Music Festival
-0.6449945569038391


KeyboardInterrupt: 

In [66]:
print("url", data_point['url'])
print(predictionValues)

url peoplestyle/lorde-debuts-blonde-hair-at-2022-glastonbury-music-festival
[('url', -0.9592427015304565), ('title', -0.8769768476486206), ('meta_description', 0.5218024849891663), [('h1', -0.6449945569038391), ('h2', 0.5757089129447089)], ('main_content', 0.3757687961289026), [('internal', -0.5320969758884327), ('external', -0.5191652553348265)]]


### Format scores for binomial logistic regression

In [15]:
# Extract just the score values from examplePredictions
def extractScores(predictions):
    scores = []
    for item in predictions:
        if isinstance(item, list):
            scores.extend([score for _, score in item])
        else:
            _, score = item
            scores.append(score)
    return scores

In [73]:
data_point = safeData[0]
predictionValues = splitByFeature(data_point)
print(predictionValues)
exampleFeatureVector = extractScores(predictionValues)
print(exampleFeatureVector)
label = data_point['label']
print(label)

[('url', 0.9997495412826538), ('title', 0.9996457695960999), ('meta_description', 0.0010077357292175293), [('h1', 0.9995507001876831), ('h2', 0.0009156954170826574)], ('main_content', 0.8751240634266553), [('internal', 0.7015515942462013), ('external', 0.852614122313233)]]
[0.9997495412826538, 0.9996457695960999, 0.0010077357292175293, 0.9995507001876831, 0.0009156954170826574, 0.8751240634266553, 0.7015515942462013, 0.852614122313233]
safe


In [70]:
data_point = safeData[0]
predictionValues = splitByFeature(data_point)
print(predictionValues)
exampleFeatureVector = extractScores(predictionValues)
print(exampleFeatureVector)
label = data_point['label']
print(label)

[('url', 0.008181780576705933), ('title', 0.01878821849822998), ('meta_description', 0.031011849641799927), [('h1', 0.01920390129089355), ('h2', 0.009735441229325457)], ('main_content', 0.7026501597776854), [('internal', 0.0028665306574306992), ('external', 0.6191035969078398)]]
[0.008181780576705933, 0.01878821849822998, 0.031011849641799927, 0.01920390129089355, 0.009735441229325457, 0.7026501597776854, 0.0028665306574306992, 0.6191035969078398]
safe


In [65]:
def getAccuracy1(data):
    correct = 0
    for data_point in data:
        prediction = rawClassifier1(data_point['title'])[0]
        prediction = transformLabels1(prediction)
        if prediction["label"] == data_point["label"]:
            correct += 1
    return correct / len(data)

In [67]:
def getAccuracy2(data):
    correct = 0
    for data_point in data:
        prediction = rawClassifier2(data_point['title'])[0]
        prediction = transformLabels2(prediction)
        if prediction["label"] == data_point["label"]:
            correct += 1
    return correct / len(data)

In [68]:
print("Accuracy of Safe Data: ", getAccuracy1(safeData))
print("Accuracy of Safe Data under model 2: ", getAccuracy2(safeData))
print("Accuracy of Adult Data: ", getAccuracy1(adultData))
print("Accuracy of Adult Data under model 2: ", getAccuracy2(adultData))
print("Accuracy of InBetween Data: ", getAccuracy1(inBetweenData))
print("Accuracy of InBetween Data under model 2: ", getAccuracy2(inBetweenData))
print("Accuracy of Main Data: ", getAccuracy1(mainData))
print("Accuracy of Main Data under model 2: ", getAccuracy2(mainData))

Accuracy of Safe Data:  0.88
Accuracy of Safe Data under model 2:  0.58
Accuracy of Adult Data:  0.7
Accuracy of Adult Data under model 2:  0.88
Accuracy of InBetween Data:  0.5740740740740741
Accuracy of InBetween Data under model 2:  0.3888888888888889
Accuracy of Main Data:  0.7142857142857143
Accuracy of Main Data under model 2:  0.6103896103896104


In [69]:

print("Accuracy of equal Data: ", getAccuracy1(safeData + adultData))
print("Accuracy of equal Data under model 2: ", getAccuracy2(safeData + adultData))

Accuracy of equal Data:  0.79
Accuracy of equal Data under model 2:  0.73


In [80]:
for i in range(5):
    print(safeData[i]['title'])
    print(rawClassifier1(safeData[i]['title']))
    print(rawClassifier2(safeData[i]['title']))

Lorde Debuts New Blonde Hair at 2022 Glastonbury Music Festival
[{'label': 'Non_Adult', 'score': 0.96242356300354}]
[{'label': 'LABEL_1', 'score': 0.9992915391921997}]
Queen Elizabeth Makes Generous Donation to Aid Ukrainians Amid Russia's Invasion
[{'label': 'Non_Adult', 'score': 0.9939538836479187}]
[{'label': 'LABEL_0', 'score': 0.9981610178947449}]
Blake Shelton Brings Boy, 6, in Need of Heart Transplant Up on Stage
[{'label': 'Non_Adult', 'score': 0.9826527833938599}]
[{'label': 'LABEL_0', 'score': 0.96591717004776}]
Tribute Dinner honoring Dominique Crenn and Gérard Bertrand part of The New York Times Dinner Series | Sat, Feb 22 7:00 PM
[{'label': 'Non_Adult', 'score': 0.9969825148582458}]
[{'label': 'LABEL_0', 'score': 0.9994176626205444}]
Filiz Robe | Luxury Unisex Cotton Handwoven Turkish Robe in Denim Blue Striped | Sustainably Made – OddBird Co.
[{'label': 'Adult', 'score': 0.5650986433029175}]
[{'label': 'LABEL_1', 'score': 0.9994252920150757}]


## Model 3
### BERTopic