# GPT-3 vs Other Text Embeddings Techniques for Text Classification: A Performance Evaluation.

## 1. Data Importation and Preparation

In [2]:
# Libraries
import os
import re

import gensim.downloader as api
import numpy as np
import openai
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Location of the reviews CSV that already contains an 'embedding' column
DATA_URL = 'https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/data/fine_food_reviews_with_embeddings_1k.csv'

# Read the CSV, using its first column as the DataFrame index
df1 = pd.read_csv(DATA_URL, index_col=0)

# Preview the first three rows
df1.head(3)

Unnamed: 0,ProductId,UserId,Score,Summary,Text,combined,n_tokens,embedding
0,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...,52,"[0.007018072064965963, -0.02731654793024063, 0..."
297,B003VXHGPK,A21VWSCGW7UUAR,4,"Good, but not Wolfgang Puck good","Honestly, I have to admit that I expected a li...","Title: Good, but not Wolfgang Puck good; Conte...",178,"[-0.003140551969408989, -0.009995664469897747,..."
296,B008JKTTUA,A34XBAIFT02B60,1,Should advertise coconut as an ingredient more...,"First, these should be called Mac - Coconut ba...",Title: Should advertise coconut as an ingredie...,78,"[-0.01757248118519783, -8.266511576948687e-05,..."


In [12]:
# clean openai embeddings
def clean_emb(text):
    """Convert a stringified embedding into a 1-D float array.

    Parameters
    ----------
    text : str, list, tuple or numpy.ndarray
        Embedding written as a bracketed, comma-separated string such as
        "[0.1, -0.2, ...]". Values that are already a sequence or an array
        are converted with ``np.asarray`` instead of being parsed, so
        applying this function twice to the same column does not fail.

    Returns
    -------
    numpy.ndarray
        The embedding values as floats.
    """
    # Already parsed (e.g. the cell was executed a second time): the regex
    # calls below only accept strings, so short-circuit here.
    if isinstance(text, (list, tuple, np.ndarray)):
        return np.asarray(text, dtype=float)

    # remove line breaks
    text = re.sub(r'\n', '', text)

    # remove square brackets
    text = re.sub(r'\[|\]', "", text)

    # remove leading and trailing white spaces
    text = text.strip()

    # parse the comma-separated numbers into a float array
    return np.fromstring(text, dtype=float, sep=',')


# The precomputed embeddings live in 'embedding'; expose them as 'gpt_3'
df1 = df1.rename(columns={'embedding': 'gpt_3'})

# Turn every stringified embedding into a numeric array
df1['gpt_3'] = df1['gpt_3'].apply(clean_emb)

## 2. Embedding Generation

### 2.1. GPT-3 Embeddings

In [None]:
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# pasted into (and saved with) the notebook. When the variable is not set, the
# original placeholder is kept and must be replaced manually.
api_key = os.environ.get('OPENAI_API_KEY', 'Enter api key here')

# set api key as default api key for openai
openai.api_key = api_key

def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` through ``openai.Embedding.create``.

    Parameters
    ----------
    text : str
        Text to embed; newline characters are replaced by spaces first.
    model : str
        Name of the embedding model passed to the API call.

    Returns
    -------
    The 'embedding' entry of the first item in the response's 'data' list.
    """
    single_line = text.replace("\n", " ")

    # The API receives a one-element batch, so only item 0 of 'data' is read
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

### 2.2 GloVe Embeddings

In [3]:
# Prerequisites, to be run once in a terminal before executing this cell:
#   pip install spacy
#   python -m spacy download en_core_web_lg
import spacy

# Load the en_core_web_lg spaCy pipeline
nlp = spacy.load(name="en_core_web_lg")

In [4]:
# Show the combined title/content text stored under index 0
df1['combined'][0]

'Title: where does one  start...and stop... with a treat like this; Content: Wanted to save some to bring to my Chicago family but my North Carolina family ate all 4 boxes before I could pack. These are excellent...could serve to anyone'

In [5]:
def replace_multiple_fullstops(text):
    """Collapse runs of full stops and trim surrounding whitespace.

    Every run of two or more consecutive '.' characters in ``text`` becomes a
    single '.', and whitespace at both ends of the result is removed.
    """
    collapsed = re.sub(r'\.{2,}', '.', text)
    return collapsed.strip()

# Store the cleaned version of every combined text in 'clean_text'
df1['clean_text'] = df1['combined'].apply(replace_multiple_fullstops)

In [6]:
# Run each cleaned text through the spaCy pipeline and keep the resulting
# document vector in the 'glove' column
df1['glove'] = df1['clean_text'].apply(
    lambda review: nlp(review).vector
)

### 2.3 Word2vec Embeddings

In [7]:
import gensim.downloader as api

# Load the word2vec-google-news-300 vectors through the gensim downloader,
# asking for the loaded model itself rather than a path to its files
wv = api.load("word2vec-google-news-300", return_path=False)

In [8]:
def wv_preprocess_and_vectorize(text):
    """Return the mean word2vec vector of the content lemmas in ``text``.

    The text is tokenized with the module-level ``nlp`` pipeline; stop words
    and punctuation are dropped and the lemmas of the remaining tokens are
    averaged with ``wv.get_mean_vector``.

    Parameters
    ----------
    text : str
        Text to vectorize.

    Returns
    -------
    numpy.ndarray
        Mean vector of the kept lemmas. When no token survives the filter, a
        zero vector of length ``wv.vector_size`` is returned instead of a
        scalar NaN, so every row keeps the same shape and the column can
        still be combined with ``np.stack``.
    """
    # Keep the lemma of every token that is neither a stop word nor punctuation
    filtered_tokens = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]

    if not filtered_tokens:
        # NOTE(review): a zero vector presumably matches what get_mean_vector
        # yields when none of the keys are in the vocabulary - confirm against
        # the gensim documentation.
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)

    return wv.get_mean_vector(filtered_tokens)

# Vectorize every cleaned text and keep the result in the 'word2vec' column
df1['word2vec'] = df1['clean_text'].apply(wv_preprocess_and_vectorize)

### 2.4 MPNet Embeddings

In [9]:
# Sentence-transformer model used for the 'mpnet' embeddings
model_sent = SentenceTransformer('all-mpnet-base-v2')

# Encode the cleaned texts one at a time and store the vectors in 'mpnet'
df1['mpnet'] = df1['clean_text'].apply(model_sent.encode)

## 3. Dimensionality Comparison

In [13]:
# Embedding columns whose vector length is compared
emb_names = ['gpt_3', 'mpnet', 'word2vec', 'glove']

# Measure each embedding's dimension on the entry stored under index 0
data = {
    'Name': emb_names,
    'Dimension': [len(df1[name][0]) for name in emb_names],
}

# One row per embedding technique
df_emb_len = pd.DataFrame(data)

# Display the table with a background colour gradient
df_emb_len.style.background_gradient()

Unnamed: 0,Name,Dimension
0,gpt_3,1536
1,mpnet,768
2,word2vec,300
3,glove,300


## 4. Machine Learning


In [14]:
# Embedding columns of df1 to evaluate, in the order results are recorded
embedding_var = ['gpt_3', 'mpnet', 'word2vec', 'glove']

# (short name, estimator) pairs; the same estimator objects are re-fitted
# for every embedding
classifiers = [
    ('rf', RandomForestClassifier(random_state=76)),
    ('svm', SVC(random_state=76)),
    ('lr', LogisticRegression(random_state=76, max_iter=400)),
    ('dt', DecisionTreeClassifier(random_state=76)),
]

# One accuracy list per classifier; entry i belongs to embedding_var[i]
accuracy_lists = {name: [] for name, _ in classifiers}

for emb in embedding_var:

    # 75/25 train/test split of this embedding column against the review score
    X_train, X_test, y_train, y_test = train_test_split(
        df1[emb].values, df1.Score, test_size=0.25, random_state=76
    )

    # Stack the per-row embedding vectors into single arrays for the estimators
    X_train_stacked = np.stack(X_train)
    X_test_stacked = np.stack(X_test)

    for classifier_name, classifier in classifiers:

        # Scale the features, then fit the classifier on the training split
        pipe = Pipeline(steps=[('scaler', RobustScaler()), (classifier_name, classifier)])
        pipe.fit(X_train_stacked, y_train)

        # Predict the held-out split
        y_pred = pipe.predict(X_test_stacked)

        # Only the overall accuracy is taken from the classification report
        report = classification_report(y_test, y_pred, output_dict=True)
        acc = report['accuracy']

        accuracy_lists[classifier_name].append(acc)

In [15]:
# Record which embedding each position of the accuracy lists refers to
accuracy_lists['embeddings'] = embedding_var

# Key in accuracy_lists -> column title in the results table (in display order)
column_for_key = {
    'embeddings': 'Embedding',
    'lr': 'Logistic_Regression',
    'svm': 'Support_Vector_Machine',
    'rf': 'Random_Forest',
    'dt': 'Decision_Tree',
}

# One tuple per embedding: (name, lr, svm, rf, dt accuracies)
df_zip = list(zip(*(accuracy_lists[key] for key in column_for_key)))

# Assemble the results table
df_accuracy = pd.DataFrame(df_zip, columns=list(column_for_key.values()))

# Display the table with a background colour gradient
df_accuracy.style.background_gradient()


Unnamed: 0,Embedding,Logistic_Regression,Support_Vector_Machine,Random_Forest,Decision_Tree
0,gpt_3,0.832,0.804,0.788,0.692
1,mpnet,0.756,0.764,0.772,0.636
2,word2vec,0.736,0.756,0.776,0.672
3,glove,0.7,0.748,0.772,0.684
