In [2]:
import re
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt

from lfd_assignment1 import read_corpus

# Load the corpus twice with the boolean flag flipped; judging by the variable
# names the flag selects category labels (False) vs. sentiment labels (True).
# NOTE(review): confirm against lfd_assignment1.read_corpus.
data, categories = read_corpus('reviews.txt', False)
_, sentiments = read_corpus('reviews.txt', True)

df = pd.DataFrame({
    'category': [category.capitalize() for category in categories],
    'label': ['Positive' if sentiment == 'pos' else 'Negative' for sentiment in sentiments],
})

# Count reviews per (category, label). The columns are pinned to a fixed order
# so that the colour list below always maps red -> Negative and green -> Positive,
# even if one of the labels never occurs in the data.
category_counts = (
    df.groupby(['category', 'label'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=['Negative', 'Positive'], fill_value=0)
)

fig, ax = plt.subplots()
category_counts.plot(kind='bar', stacked=True, color=['red', 'green'], rot=0, ax=ax)
ax.set_title('Histogram of Dataset')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Reviews')
ax.legend(title='Label')

# savefig does not create missing directories, so make sure the target exists.
Path('images').mkdir(parents=True, exist_ok=True)
fig.savefig('images/histogram.png')
plt.show()


In [12]:
# Length of every review; other cells iterate each review token by token,
# so this is treated as the number of tokens per review.
review_lengths = [len(review) for review in data]

fig, ax = plt.subplots()
# One bin per integer length: edges run from 1 to max+1, so a review of
# length 0 would fall outside the plotted range.
ax.hist(review_lengths, bins=range(1, max(review_lengths) + 2), edgecolor='black')
ax.set_title('Histogram of Token counts')
ax.set_xlabel('Number of Tokens')
ax.set_ylabel('Number of Reviews')

# savefig does not create missing directories, so make sure the target exists.
Path('images').mkdir(parents=True, exist_ok=True)
fig.savefig('images/token_histogram.png')
plt.show()

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources requested by this notebook, in a fixed order
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

In [3]:
# Size of the vocabulary: distinct tokens that look like words

# Letters only, optionally followed by an apostrophe suffix (e.g. "don't")
pattern = re.compile(r"^[a-zA-Z]+(?:'\w+)?$")

tokens = set()
for review in data:
    tokens.update(token for token in review if pattern.match(token))

print(f'Number of distinct tokens: {len(tokens)}')

In [18]:
from collections import Counter

# filtering stopwords to see most common words, will not do this when fitting ML models
stopword_set = set(stopwords.words('english'))

# Keep purely alphabetic tokens that are not stopwords. The stopword test is
# done on the lower-cased token so capitalised stopwords ("The", "I") are
# dropped as well; the kept tokens retain their original casing.
# Note: this rebinds `tokens` to a list with repeats, as needed for counting.
tokens = [
    token
    for review in data
    for token in review
    if token.isalpha() and token.lower() not in stopword_set
]

counter = Counter(tokens)
most_common = counter.most_common(10)
print(most_common)

In [21]:
# Repeat the frequency count on lemmatized tokens to compare with the raw counts
lemmatizer = WordNetLemmatizer()

# Each token is lemmatized with the lemmatizer's default settings before counting
counter = Counter(map(lemmatizer.lemmatize, tokens))
most_common = counter.most_common(10)
print(most_common)

In [20]:
from nltk.util import bigrams, trigrams

# Investigating bi- and trigrams
def content_ngrams(reviews, ngram_fn, excluded):
    """Collect n-grams made up only of alphabetic, non-excluded tokens.

    Args:
        reviews: iterable of reviews, each a sequence of string tokens.
        ngram_fn: callable returning the n-grams of one review
            (here the imported `bigrams` / `trigrams`).
        excluded: collection of lower-case words; an n-gram is dropped if any
            of its tokens, lower-cased, is in this collection.

    Returns:
        A list of the n-grams that passed the filter, in corpus order.
    """
    return [
        ngram
        for review in reviews
        for ngram in ngram_fn(review)
        if all(token.isalpha() and token.lower() not in excluded for token in ngram)
    ]


# The results get their own names so the imported `bigrams` / `trigrams`
# functions are not overwritten by lists and stay callable afterwards.
content_bigrams = content_ngrams(data, bigrams, stopword_set)

counter = Counter(content_bigrams)
most_common = counter.most_common(10)
print(most_common)

content_trigrams = content_ngrams(data, trigrams, stopword_set)

counter = Counter(content_trigrams)
most_common = counter.most_common(10)
print(most_common)