# NLTK Workshop Exercises
Using the UCI Sentiment Labelled Sentences Dataset


In [None]:
# Step 1: Load the Dataset
import pandas as pd

def load_data(path):
    """Read one tab-separated sentence/label file into a DataFrame.

    Each line is parsed as ``<sentence>\\t<label>``; the file has no header
    row.

    Args:
        path: Location of the file (anything ``pandas.read_csv`` accepts).

    Returns:
        DataFrame with two columns, ``sentence`` and ``label``.
    """
    import csv  # local import keeps this cell self-contained

    # Sentences may contain stray double quotes. With pandas' default
    # quoting, a sentence that starts with '"' opens a quoted field that
    # swallows the following tabs/newlines and merges several rows into one,
    # so quote handling is switched off entirely.
    return pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=['sentence', 'label'],
        quoting=csv.QUOTE_NONE,
    )

# Read the Yelp reviews; point the call at the IMDb or Amazon file instead
# to work with those datasets.
data = load_data(path='yelp_labelled.txt')
data.head()

In [None]:
# Step 2: Tokenization and Stopword Removal
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Fetch the tokenizer models and the stopword lists.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt models from the 'punkt_tab'
# package instead; without it word_tokenize() raises LookupError there.
nltk.download('punkt_tab')
nltk.download('stopwords')

# A set keeps the per-token membership test in preprocess() cheap.
stop_words = set(stopwords.words('english'))

# Characters that count as punctuation when filtering tokens.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def preprocess(sentence):
    """Lower-case and tokenize a sentence, dropping stopwords and punctuation.

    Args:
        sentence: Raw sentence text.

    Returns:
        List of the remaining tokens, in their original order.
    """
    tokens = word_tokenize(sentence.lower())
    # ``t not in string.punctuation`` is a substring test against one long
    # string, so multi-character punctuation tokens such as '...', '--',
    # '``' or "''" slipped through. A token is treated as punctuation when
    # every character in it is a punctuation character.
    return [
        t
        for t in tokens
        if t not in stop_words
        and not all(ch in _PUNCTUATION_CHARS for ch in t)
    ]

# Run every sentence through preprocess() and store the token lists
# next to the raw text.
data['tokens'] = data['sentence'].map(preprocess)
data.head()

In [None]:
# Step 3: Stemming and Lemmatization
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')
# Some NLTK releases raise LookupError for 'omw-1.4' the first time the
# WordNet lemmatizer is used, so fetch that package as well.
nltk.download('omw-1.4')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stem / lemmatize token by token, keeping one list per sentence.
# lemmatize() is called without a POS argument, so NLTK falls back to its
# default part of speech (noun).
data['stemmed'] = data['tokens'].apply(lambda tokens: [stemmer.stem(w) for w in tokens])
data['lemmatized'] = data['tokens'].apply(lambda tokens: [lemmatizer.lemmatize(w) for w in tokens])
data.head()

In [None]:
# Step 4: POS Tagging
nltk.download('averaged_perceptron_tagger')
# Recent NLTK releases (3.9+) look the English model up under the name
# 'averaged_perceptron_tagger_eng'; without it nltk.pos_tag() raises
# LookupError there.
nltk.download('averaged_perceptron_tagger_eng')

# Tag each sentence's token list; every row becomes a list of
# (token, tag) pairs.
data['pos_tags'] = data['tokens'].apply(nltk.pos_tag)
data.head()

In [None]:
# Step 5: Sentiment Analysis with VADER
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of VADER's polarity scores for *text*."""
    return sia.polarity_scores(text)['compound']


# Score the raw (untokenized) sentences.
data['vader_score'] = data['sentence'].apply(_compound_score)
data.head()

In [None]:
# Step 6: Feature Extraction and Model Training
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the raw sentences first so the vocabulary is learned from the
# training portion only; fitting the vectorizer on the whole dataset lets
# test-set words leak into the feature space.
y = data['label']
train_text, test_text, y_train, y_test = train_test_split(
    data['sentence'], y, test_size=0.2, random_state=42
)

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)
# Bag-of-words matrix for every sentence, kept under its original name for
# any later cell that uses it; it is encoded with the training vocabulary.
X = vectorizer.transform(data['sentence'])

model = MultinomialNB()
model.fit(X_train, y_train)

predictions = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, predictions))

In [None]:
# Exercise 7: Perform token frequency analysis for positive and negative sentences separately
# Hint: Use NLTK's FreqDist and compare most common tokens

from nltk import FreqDist

# Flatten the per-sentence token lists into one list per sentiment class
# (label 1 -> positive, label 0 -> negative). The DataFrame built in the
# earlier steps is named `data`.
positive_tokens = [token for sent in data.loc[data['label'] == 1, 'tokens'] for token in sent]
negative_tokens = [token for sent in data.loc[data['label'] == 0, 'tokens'] for token in sent]

# Count how often each token occurs within each class.
positive_freq = FreqDist(positive_tokens)
negative_freq = FreqDist(negative_tokens)

print("Most common words in positive sentences:")
print(positive_freq.most_common(10))

print("\nMost common words in negative sentences:")
print(negative_freq.most_common(10))


In [None]:
# Exercise 8: Identify and visualize bigrams in the sentences
# Hint: Use nltk.bigrams and FreqDist

from nltk.util import bigrams
import matplotlib.pyplot as plt

# Build bigrams sentence by sentence so that no pair joins the last token of
# one sentence to the first token of the next. The DataFrame built in the
# earlier steps is named `data`.
all_bigrams = [pair for sent in data['tokens'] for pair in bigrams(sent)]
bigram_freq = FreqDist(all_bigrams)

print("Most common bigrams:")
print(bigram_freq.most_common(10))

# Plotting
bigram_freq.plot(10, title="Top 10 Most Common Bigrams")


In [None]:
# Exercise 9: Build a simple sentiment classifier using Naive Bayes
# Hint: Use NLTK's NaiveBayesClassifier

from nltk import NaiveBayesClassifier, classify

# Feature extraction: simple word presence
def extract_features(words):
    """Build a word-presence feature dict.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        Dict mapping each distinct token to ``True``.
    """
    return dict.fromkeys(words, True)

import random

# Pair each sentence's feature dict with its label. The DataFrame built in
# the earlier steps is named `data`.
features = [(extract_features(tokens), label) for tokens, label in zip(data['tokens'], data['label'])]

# Shuffle with a fixed seed before slicing so the held-out 20% does not
# depend on the row order of the file, while the split stays reproducible.
random.Random(42).shuffle(features)

# Split into training and test sets
train_size = int(0.8 * len(features))
train_set, test_set = features[:train_size], features[train_size:]

classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, test_set)

print("Naive Bayes Classifier Accuracy:", accuracy)
classifier.show_most_informative_features(10)


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build one space-separated string per sentiment class from the token lists.
positive_text = ' '.join(positive_tokens)
negative_text = ' '.join(negative_tokens)

# Render the clouds: white canvas for positive, black canvas with the
# 'Reds' colormap for negative.
positive_wc = WordCloud(width=600, height=400, background_color='white').generate(positive_text)
negative_wc = WordCloud(
    width=600, height=400, background_color='black', colormap='Reds'
).generate(negative_text)

# Show both clouds side by side in a single figure.
plt.figure(figsize=(12, 6))

panels = (
    (positive_wc, 'Positive Sentiment Word Cloud'),
    (negative_wc, 'Negative Sentiment Word Cloud'),
)
for position, (cloud, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.title(heading)
    plt.axis('off')

plt.tight_layout()
plt.show()
