In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Load the dataset
df = pd.read_csv('urbandict-word-defs.csv')  # Replace with the actual dataset filename

# Fetch the NLTK resources used by the cells below:
#   - 'stopwords'     -> stopwords.words('english')
#   - 'punkt'         -> Punkt tokenizer models used by word_tokenize
#   - 'punkt_tab'     -> the resource newer NLTK releases look up for
#                        word_tokenize instead of 'punkt'; without it the
#                        tokenizer raises LookupError on those versions
#   - 'vader_lexicon' -> lexicon required by SentimentIntensityAnalyzer
# nltk.download() reports a failed fetch through its return value instead of
# raising, so requesting both Punkt variants is safe on any NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

# A set gives constant-time membership checks during stopword filtering.
stop_words = set(stopwords.words('english'))
sia = SentimentIntensityAnalyzer()

In [None]:
# Define a function for text preprocessing
def preprocess_text(text):
    """Normalize one definition into a space-separated string of content words.

    The text is tokenized with ``word_tokenize``, lowercased, and filtered so
    that only purely alphanumeric tokens that are not in ``stop_words``
    survive.

    Args:
        text: The raw definition. ``None`` and float NaN (how pandas
            represents an empty CSV cell) are treated as missing; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The kept tokens joined by single spaces, or ``''`` when the input is
        missing or nothing survives the filter.
    """
    # `text != text` is true only for NaN; checked this way so the guard needs
    # no extra imports. Without it word_tokenize raises TypeError and a single
    # missing definition aborts the whole DataFrame.apply().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Tokenize, then lowercase so the stopword comparison is case-insensitive.
    tokens = (token.lower() for token in word_tokenize(text))

    # isalnum() drops punctuation-only tokens and anything containing
    # non-alphanumeric characters (e.g. "n't", "well-known").
    return ' '.join(
        token for token in tokens
        if token.isalnum() and token not in stop_words
    )

In [None]:
# Missing definitions are read from CSV as NaN; replace them with an empty
# string so neither the tokenizer nor VADER receives a non-string value.
raw_definitions = df['definition'].fillna('').astype(str)

# Create a new column for preprocessed definitions (used as classifier input)
df['cleaned_definition'] = raw_definitions.apply(preprocess_text)

# Calculate sentiment scores on the ORIGINAL text, not the cleaned one.
# VADER uses punctuation, capitalization and negation words as cues, and the
# cleaning step removes all three (English stopwords include "not" and "no"),
# so scoring the cleaned text can weaken or even flip the polarity.
df['sentiment_score'] = raw_definitions.apply(
    lambda text: sia.polarity_scores(text)['compound']
)

# Categorize sentiment by the sign of the compound score.
# NOTE(review): VADER's documentation suggests +/-0.05 as cut-offs; the strict
# sign test is kept here so the labelling rule itself is unchanged.
df['sentiment'] = df['sentiment_score'].apply(
    lambda score: 'positive' if score > 0 else ('negative' if score < 0 else 'neutral')
)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible (more data may be needed for better accuracy).
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_definition'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary on the training texts only, then reuse that fitted
# vectorizer to encode the held-out texts.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Multinomial Naive Bayes as a simple baseline classifier; fit() returns the
# estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the held-out set and print the evaluation report.
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred))