In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample dataset (replace this with your dataset)
data = [
    ("I yelled at my friend, AITA?", "Asshole"),
    ("I accidentally stepped on someone's foot, AITA?", "Not the Asshole"),
    ("I ate the last slice of pizza, AITA?", "Asshole"),
    ("I helped an old lady cross the street, AITA?", "Not the Asshole"),
    ("I told my roommate to clean up their mess, AITA?", "Not the Asshole"),
    ("I accidentally broke my friend's phone, AITA?", "Asshole"),
    ("I donated my old clothes to charity, AITA?", "Not the Asshole"),
    ("I refused to lend money to a friend in need, AITA?", "Not the Asshole"),
    ("I forgot to wish my friend a happy birthday, AITA?", "Asshole"),
    ("I helped a stranger carry groceries, AITA?", "Not the Asshole"),
    ("I criticized my colleague's work in front of others, AITA?", "Asshole"),
    ("I took the blame for someone else's mistake, AITA?", "Not the Asshole"),
    ("I borrowed my neighbor's lawnmower without asking, AITA?", "Asshole"),
    ("I stood up to a bully, AITA?", "Not the Asshole"),
    ("I accidentally spilled coffee on someone, AITA?", "Asshole"),
    ("I defended my friend in an argument, AITA?", "Not the Asshole"),
    ("I forgot to return a borrowed item, AITA?", "Asshole"),
    ("I gave up my seat on the bus to an elderly person, AITA?", "Not the Asshole"),
    ("I lied to my boss to take a day off, AITA?", "Asshole"),
    ("I helped a lost child find their parents, AITA?", "Not the Asshole"),
    ("I criticized a family member's life choices, AITA?", "Asshole"),
    ("I volunteered at a local shelter, AITA?", "Not the Asshole"),
    ("I ignored a call for help from a stranger, AITA?", "Asshole"),
]

# Preprocess data
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

# Create feature set
all_words = FreqDist()
for text, _ in data:
    for word in preprocess_text(text):
        all_words[word] += 1

word_features = list(all_words.keys())

def extract_features(text):
    words = set(preprocess_text(text))
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

# Split data into training and test sets
featuresets = [(extract_features(text), label) for text, label in data]
train_set, test_set = train_test_split(featuresets, test_size=0.3, random_state=42)

# Train the Naive Bayes classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Evaluate the classifier
y_true = [label for _, label in test_set]
y_pred = [classifier.classify(features) for features, _ in test_set]
accuracy = accuracy_score(y_true, y_pred)

print(f"Accuracy: {accuracy:.2f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akhas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akhas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\akhas\AppData\Roaming\nltk_data...


Accuracy: 0.00


In [2]:
y_true

['Not the Asshole', 'Not the Asshole']

In [3]:
y_pred

['Asshole', 'Asshole']