# Download and Pre-process the Dataset

In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q parlai
import parlai

In [None]:
from parlai.tasks.image_chat.build import build

datapath='/content/drive/MyDrive/Project_Bias_Meta_paper/'
opt = {'datapath': datapath}
build(opt)

In [3]:
train_df = pd.read_json('/content/drive/MyDrive/Project_Bias_Meta_paper/image_chat/train.json')

In [21]:
# train_df['dialog'][0]

[['Appreciative (Grateful)', 'home sweet home'],
 ['Glamorous', 'in my big house'],
 ['Appreciative (Grateful)', 'Its a house, so like it']]

In [22]:
# len(train_df)

186782

In [23]:
# count_3turn_instances = train_df['dialog'].apply(lambda x: len(x) == 3).sum()
# count_3turn_instances

84540

In [24]:
# count_2turn_instances = train_df['dialog'].apply(lambda x: len(x) == 2).sum()
# count_2turn_instances

0

In [25]:
# count_1turn_instances = train_df['dialog'].apply(lambda x: len(x) == 1).sum()
# count_1turn_instances

102242

In [10]:
df_train_1turn_instances = train_df[train_df['dialog'].apply(lambda x: len(x) == 1)]
df_train_1turn_instances = df_train_1turn_instances.reset_index(drop=True)

In [11]:
df_train_1turn_instances['dialog'] = df_train_1turn_instances['dialog'].apply(lambda x: [(item[1], item[0]) for item in x])
df_train_1turn_instances['dialog'] = df_train_1turn_instances['dialog'].explode()

In [13]:
ls_train_1turn_dialogs = df_train_1turn_instances['dialog'].tolist()

# Load the Styles (Classes)

In [20]:
import json

with open('/content/drive/MyDrive/Project_Bias_Meta_paper/image_chat/personalities.json', 'r') as personalities_file:
    personalities = json.load(personalities_file)

In [21]:
ls_personalities = [item for sublist in personalities.values() for item in sublist]

In [23]:
# len(ls_personalities)

215

In [24]:
ls_personalities.append("Crude")
ls_personalities.append("Earnest")

# Build the Classifier

In [27]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

In [28]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [29]:
stop_words = set(stopwords.words('english'))

In [30]:
def preprocess_text(text):
    words = word_tokenize(text)
    filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]
    return dict([(word, True) for word in filtered_words])

In [31]:
train_size = int(0.8 * len(ls_train_1turn_dialogs))
train_set, test_set = ls_train_1turn_dialogs[:train_size], ls_train_1turn_dialogs[train_size:]

In [32]:
classifier = NaiveBayesClassifier.train([(preprocess_text(text), label) for text, label in train_set])

In [33]:
# Evaluate the classifier
accuracy_score = accuracy(classifier, [(preprocess_text(text), label) for text, label in test_set])
print(f"Accuracy: {accuracy_score:.2%}")

Accuracy: 1.07%


In [34]:
# Make predictions with probability estimates
new_text = "Something about the pattern calms me"
probabilities = classifier.prob_classify(preprocess_text(new_text))

In [35]:
# Display probability estimates for each class
for label in classifier.labels():
    print(f"Probability for {label}: {probabilities.prob(label):.2%}")

Probability for Discouraging: 0.02%
Probability for Empathetic: 0.15%
Probability for Destructive: 0.04%
Probability for Escapist (Dreamer, Seeks Distraction): 0.15%
Probability for Rational: 0.24%
Probability for Tough: 0.02%
Probability for Scornful: 0.06%
Probability for Considerate: 0.02%
Probability for Sweet: 0.02%
Probability for Earnest (Enthusiastic): 0.05%
Probability for Rustic (Rural): 0.26%
Probability for Absentminded: 0.11%
Probability for Neurotic (Manic, Obsessive): 0.09%
Probability for Artificial: 0.03%
Probability for Romantic: 0.03%
Probability for Conceited (Arrogant, Egotistical): 0.08%
Probability for Daring: 0.10%
Probability for Erratic: 0.08%
Probability for Businesslike: 0.05%
Probability for Vacuous (Empty, Unintelligent): 0.10%
Probability for Wishful: 0.07%
Probability for Opinionated: 0.21%
Probability for Impersonal: 0.05%
Probability for Colorful (Full of Life, Interesting): 0.04%
Probability for Dry: 0.04%
Probability for Sensitive: 0.28%
Probability 

In [36]:
prediction = classifier.classify(preprocess_text(new_text))
print(f"Predicted style: {prediction}")

Predicted style: Crude
