In [12]:
import pandas as pd
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [13]:
# Load the cleaned, preprocessed posts into 'df'.
# The path uses a forward slash: the previous "data\df_..." literal contained the
# invalid escape sequence "\d" and only resolved to the intended file on Windows.
# Forward slashes work on every platform and match the path used when saving results.
df = pd.read_csv("data/df_all_cleaned_preprocessed.csv")
print(df.shape)  # (rows, columns) of the loaded frame

# One entry per row of the 'processed_post' column; this list is what gets classified.
summary = df["processed_post"].tolist()

df

(155, 7)


Unnamed: 0,auhtor_ID,post,extrovert,feeling,judging,sensing,processed_post
0,t2_12bhu7,I wear a Lorna shore shirt out alot in public ...,1.0,1.0,0.0,0.0,wear lorna shore shirt alot public lewd long s...
1,t2_12jbpd,I'd say this is a very accurate characterizati...,1.0,0.0,0.0,0.0,id say accurate characterization ni users read...
2,t2_12uwr5,Ya know like most people with home decorations...,0.0,0.0,1.0,0.0,ya know like people home decorations could sav...
3,t2_12zm15,It's true tho. They're kinda more interesting ...,0.0,1.0,0.0,0.0,true tho theyre kinda interesting buuuut issue...
4,t2_13cjjl,"Yeah, but that's one of the things that make m...",0.0,0.0,0.0,1.0,yeah thats one things make better objectively ...
...,...,...,...,...,...,...,...
150,t2_vfp8y,so change profession then. this would be inadm...,0.0,0.0,1.0,0.0,change profession would inadmissible country p...
151,t2_w0842,The technological singularity. And the possibi...,0.0,0.0,1.0,0.0,technological singularity possibility contribu...
152,t2_w6rgl,Dear God man. Chill. I'm not Einstein or Hawki...,0.0,0.0,1.0,0.0,dear god man chill im einstein hawking serious...
153,t2_wilcwvo,That's what a fake lib would say [Human blood ...,1.0,0.0,0.0,0.0,thats fake lib would say human blood water url...


In [14]:
# Run this to get the zero-shot classification result. Will take a while.
MODEL_NAME = "MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33"
MAX_TOKENS = 512  # upper bound on the tokenized length of each classified input

# The tokenizer is handed to the pipeline so that the 512-token limit actually
# applies to what the model sees. Previously the text was tokenized/truncated in
# a separate call whose result was discarded, while the classifier received the
# raw text and used its own tokenizer.
# NOTE(review): relies on the zero-shot pipeline truncating the input sequence to
# the tokenizer's model_max_length — confirm against the installed transformers version.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, use_fast=False, model_max_length=MAX_TOKENS
)
classifier = pipeline(
    "zero-shot-classification", model=MODEL_NAME, tokenizer=tokenizer
)

results = []

for text in tqdm(summary):
    # The text is passed as a one-element list, so each output is a one-element
    # list of result dicts; downstream code indexes it as output[0].
    output = classifier(
        [text], candidate_labels=["Feeling", "Thinking"], multi_label=False
    )
    results.append(output)

100%|██████████| 155/155 [5:16:18<00:00, 122.44s/it]


In [15]:
# Take the first label and the first score from every zero-shot result
# (each entry of 'results' is a one-element list wrapping the result dict).

first_hits = [res[0] for res in results]
labels = [hit['labels'][0] for hit in first_hits]
scores = [hit['scores'][0] for hit in first_hits]

# Attach the prediction and its score to the dataframe as new columns.
df['Zeroshot'] = labels
df['Scores for Zeroshot'] = scores


df

Unnamed: 0,auhtor_ID,post,extrovert,feeling,judging,sensing,processed_post,Zeroshot,Scores for Zeroshot
0,t2_12bhu7,I wear a Lorna shore shirt out alot in public ...,1.0,1.0,0.0,0.0,wear lorna shore shirt alot public lewd long s...,Thinking,0.530346
1,t2_12jbpd,I'd say this is a very accurate characterizati...,1.0,0.0,0.0,0.0,id say accurate characterization ni users read...,Thinking,0.994703
2,t2_12uwr5,Ya know like most people with home decorations...,0.0,0.0,1.0,0.0,ya know like people home decorations could sav...,Feeling,0.975634
3,t2_12zm15,It's true tho. They're kinda more interesting ...,0.0,1.0,0.0,0.0,true tho theyre kinda interesting buuuut issue...,Feeling,0.731272
4,t2_13cjjl,"Yeah, but that's one of the things that make m...",0.0,0.0,0.0,1.0,yeah thats one things make better objectively ...,Thinking,0.958526
...,...,...,...,...,...,...,...,...,...
150,t2_vfp8y,so change profession then. this would be inadm...,0.0,0.0,1.0,0.0,change profession would inadmissible country p...,Thinking,0.586138
151,t2_w0842,The technological singularity. And the possibi...,0.0,0.0,1.0,0.0,technological singularity possibility contribu...,Thinking,0.579636
152,t2_w6rgl,Dear God man. Chill. I'm not Einstein or Hawki...,0.0,0.0,1.0,0.0,dear god man chill im einstein hawking serious...,Thinking,0.997763
153,t2_wilcwvo,That's what a fake lib would say [Human blood ...,1.0,0.0,0.0,0.0,thats fake lib would say human blood water url...,Thinking,0.996589


In [16]:
# Encode the Zeroshot label as an integer: 1 for 'Feeling', 0 for 'Thinking'.
label_to_int = {'Feeling': 1, 'Thinking': 0}

# The explicit cast pins the column to int64 instead of depending on replace()
# implicitly downcasting the object column. Re-running this cell is harmless:
# values that are already 0/1 pass through unchanged.
df['Zeroshot'] = df['Zeroshot'].replace(label_to_int).astype("int64")

# Persist the frame (without the index) for later use.
df.to_csv("data/df_feeling_thinking.csv", index=False)

In [18]:
# Display the current contents of df
df

Unnamed: 0,auhtor_ID,post,extrovert,feeling,judging,sensing,processed_post,Zeroshot,Scores for Zeroshot
0,t2_12bhu7,I wear a Lorna shore shirt out alot in public ...,1.0,1.0,0.0,0.0,wear lorna shore shirt alot public lewd long s...,0,0.530346
1,t2_12jbpd,I'd say this is a very accurate characterizati...,1.0,0.0,0.0,0.0,id say accurate characterization ni users read...,0,0.994703
2,t2_12uwr5,Ya know like most people with home decorations...,0.0,0.0,1.0,0.0,ya know like people home decorations could sav...,1,0.975634
3,t2_12zm15,It's true tho. They're kinda more interesting ...,0.0,1.0,0.0,0.0,true tho theyre kinda interesting buuuut issue...,1,0.731272
4,t2_13cjjl,"Yeah, but that's one of the things that make m...",0.0,0.0,0.0,1.0,yeah thats one things make better objectively ...,0,0.958526
...,...,...,...,...,...,...,...,...,...
150,t2_vfp8y,so change profession then. this would be inadm...,0.0,0.0,1.0,0.0,change profession would inadmissible country p...,0,0.586138
151,t2_w0842,The technological singularity. And the possibi...,0.0,0.0,1.0,0.0,technological singularity possibility contribu...,0,0.579636
152,t2_w6rgl,Dear God man. Chill. I'm not Einstein or Hawki...,0.0,0.0,1.0,0.0,dear god man chill im einstein hawking serious...,0,0.997763
153,t2_wilcwvo,That's what a fake lib would say [Human blood ...,1.0,0.0,0.0,0.0,thats fake lib would say human blood water url...,0,0.996589


In [19]:
# Evaluate the zero-shot predictions ('Zeroshot') against the 'feeling' column

y_true = df['feeling'].tolist()
y_pred = df['Zeroshot'].tolist()

# Printed in order: accuracy score, classification report, confusion matrix
for metric in (accuracy_score, classification_report, confusion_matrix):
    print(metric(y_true, y_pred))

0.6580645161290323
              precision    recall  f1-score   support

         0.0       0.77      0.73      0.75       109
         1.0       0.43      0.48      0.45        46

    accuracy                           0.66       155
   macro avg       0.60      0.61      0.60       155
weighted avg       0.67      0.66      0.66       155

[[80 29]
 [24 22]]
