### GPT Labeler

---

In this notebook, we test how well `GPT 3.5` can classify websites based on the provided context. Specifically, we classify websites using the following contexts:

1. tld + domain + metatags
2. context 1 + title + description + keywords
3. context 2 + links + text

In [None]:
%load_ext autoreload
%autoreload 2

import pickle
import os
from openai import OpenAI
from ml_project_2_mlp import gpt
from ml_project_2_mlp import utils


# Load env variables
import dotenv
dotenv.load_dotenv()

import pandas as pd

Set up the OpenAI client:

In [None]:
# Build the OpenAI client. The API key is read from the OPENAI_API_KEY
# environment variable; change the name here if your key is stored elsewhere.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

Set up the global flags:

In [None]:
# Global switch for the labelling cells below: set to True to rerun the
# classification process even when previously saved results are available.
RERUN = False

Load the web features and labelling info of the **crowdsourced** dataset. For more information about these, check out the [eda notebook](eda.ipynb).

In [None]:
# Directory holding the processed crowdsourced dataset
processed_dir = os.path.join("..", "data", "crowdsourced", "processed")

# Web features: a dict of dicts, where the outer key is the website id and
# the inner dict holds the features of that website.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
features_path = os.path.join(processed_dir, "web_features.pkl")
with open(features_path, 'rb') as features_file:
    web_features = pickle.load(features_file)

# Websites with their corresponding labels (at least 2 votes for each label)
websites_labels_path = os.path.join(processed_dir, "websites.csv")
websites = pd.read_csv(websites_labels_path)

### Context 1: tld + domain + metatags

---

In [None]:
# Context 1: a list of (feature name, second value) pairs handed to the labeler.
# NOTE(review): the second value presumably limits how many items of that
# feature are used (None = no limit) - confirm against gpt.GPTLabeler.
context1 = [
    ('tld', None),
    ('domain', None),
    ('metatags', 10),
]

# Labeler that classifies websites using context 1
c1_lab = gpt.GPTLabeler(client, context1)

In [None]:
# Where the labels produced with context 1 are cached on disk
folder_path = os.path.join('..', 'data', "tld_domain_meta")
labels_path = os.path.join(folder_path, 'labeled_data.pkl')

# Check for the cached file itself rather than its folder: a folder that exists
# without 'labeled_data.pkl' would otherwise take the load branch and raise
# FileNotFoundError.
if RERUN or not os.path.exists(labels_path):

    # Get the labeled data
    c1_out = c1_lab.predict(web_features)

    # Save the labeled data (exist_ok makes a separate existence check unnecessary)
    os.makedirs(folder_path, exist_ok=True)
    with open(labels_path, 'wb') as f:
        pickle.dump(c1_out, f)

else:
    # Reuse the previously saved labels.
    # NOTE: pickle.load can execute arbitrary code - only load files you trust.
    with open(labels_path, 'rb') as f:
        c1_out = pickle.load(f)

### Context 2: context 1 + title + description + keywords

---

In [None]:
# Context 2 extends context 1 with the title, description and keywords
context2 = [
    *context1,
    ('title', None),
    ('description', None),
    ('keywords', None),
]

# Labeler that classifies websites using context 2
c2_lab = gpt.GPTLabeler(client, context2)

In [None]:
# Where the labels produced with context 2 are cached on disk
folder_path = os.path.join('..', 'data', "tld_domain_meta_title_desc_kws")
labels_path = os.path.join(folder_path, 'labeled_data.pkl')

# Check for the cached file itself rather than its folder: a folder that exists
# without 'labeled_data.pkl' would otherwise take the load branch and raise
# FileNotFoundError.
if RERUN or not os.path.exists(labels_path):

    # Get the labeled data
    c2_out = c2_lab.predict(web_features)

    # Save the labeled data (exist_ok makes a separate existence check unnecessary)
    os.makedirs(folder_path, exist_ok=True)
    with open(labels_path, 'wb') as f:
        pickle.dump(c2_out, f)

else:
    # Reuse the previously saved labels.
    # NOTE: pickle.load can execute arbitrary code - only load files you trust.
    with open(labels_path, 'rb') as f:
        c2_out = pickle.load(f)

### Context 3: context 2 + links + text

---

In [None]:
# Context 3 extends context 2 with the links and sentences features
context3 = [
    *context2,
    ('links', 10),
    ('sentences', 20),
]

# Labeler that classifies websites using context 3
c3_lab = gpt.GPTLabeler(client, context3)

In [None]:
# Where the labels produced with context 3 are cached on disk
folder_path = os.path.join('..', 'data', "tld_domain_meta_title_desc_kws_links_text")
labels_path = os.path.join(folder_path, 'labeled_data.pkl')

# Check for the cached file itself rather than its folder: a folder that exists
# without 'labeled_data.pkl' would otherwise take the load branch and raise
# FileNotFoundError.
if RERUN or not os.path.exists(labels_path):
    # Get the labeled data
    c3_out = c3_lab.predict(web_features)

    # Save the labeled data (exist_ok makes a separate existence check unnecessary)
    os.makedirs(folder_path, exist_ok=True)
    with open(labels_path, 'wb') as f:
        pickle.dump(c3_out, f)

else:
    # Reuse the previously saved labels.
    # NOTE: pickle.load can execute arbitrary code - only load files you trust.
    with open(labels_path, 'rb') as f:
        c3_out = pickle.load(f)

---