In [27]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import csr_matrix

# PART A #
# This code takes 1-2 mins to run on my computer (which is fast) because of the pivot and lda training

# Load data
counts_columns = ['row_index', 'term_index', 'count']
counts = pd.read_csv('counts.csv', header=None, names=counts_columns)

with open('vocabulary.csv', 'r') as file:
    words_row = file.readline().strip().split(',')

# create word map from vocab
vocabulary = pd.DataFrame(words_row, columns=['words'], index=range(1, len(words_row) + 1))

# pivot to get a document-term matrix
dtm = counts.set_index('row_index').pivot(columns='term_index', values='count').fillna(0)

# convert to sparse matrix
dtm_sparse = csr_matrix(dtm.values)

# Fit LDA
num_topics = 10  # Set the number of topics
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(dtm_sparse)

# Function to create matrix of top words for topics
def get_top_words_matrix(model, feature_names, n_top_words):
    top_words_matrix = []
    for topic_idx, topic in enumerate(model.components_):
        top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        top_words_matrix.append(top_words)
    return pd.DataFrame(top_words_matrix).transpose()

n_top_words = 10
feature_names = dtm.columns
top_words_matrix = get_top_words_matrix(lda, feature_names, n_top_words)

top_words_matrix.columns = [f"Topic {i + 1}" for i in range(num_topics)]
top_words_matrix.index = [f"{i + 1}" for i in range(n_top_words)]

# map words to matrix and display
print("Top 10 words for each Topic (a)")
mapped = top_words_matrix.applymap(lambda idx: vocabulary.loc[idx, 'words'])
print(mapped)


Top 10 words for each Topic (a)
    Topic 1 Topic 2  Topic 3   Topic 4 Topic 5  Topic 6  Topic 7  Topic 8  \
1      film    movi     film      film    movi     movi     book     plai   
2      plai   watch     love       war     get     film     film     movi   
3   perform    like    stori     peopl    like  charact     just     good   
4      cast    just     life     world    just    stori    stori    great   
5      role    time    young      time     bad     like     read    music   
6      star   funni  charact      make   scene   realli     like     role   
7   charact   think   famili       man    film     just   realli      get   
8      john    good   beauti      take    kill     time   horror  perform   
9      best   peopl    scene      even    look      end  version     well   
10     well  realli     time  american  horror    watch     seem     best   

   Topic 9 Topic 10  
1     show     film  
2      get     movi  
3   episod      bad  
4     like     like  
5     seri

In [28]:
# PART B #
# This code takes ~1 min to run for the lda training

top_words = dtm.sum(axis=0).nlargest(100).index.tolist()
dtm_filtered = dtm.drop(columns=top_words)
dtm_filtered_sparse = csr_matrix(dtm_filtered.values)

num_topics = 10
lda_filtered = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_filtered.fit(dtm_filtered_sparse)

feature_names_filtered = dtm_filtered.columns
n_top_words = 10
top_words_matrix_filtered = get_top_words_matrix(lda_filtered, feature_names_filtered, n_top_words)

top_words_matrix_filtered.columns = [f"Topic {i + 1}" for i in range(num_topics)]
top_words_matrix_filtered.index = [f"{i + 1}" for i in range(n_top_words)]

# map words to matrix and display
print("Top 10 words for each Topic after filtering (b)")
mapped_filt = top_words_matrix_filtered.applymap(lambda idx: vocabulary.loc[idx, 'words'])
print(mapped_filt)

Top 10 words for each Topic after filtering (b)
   Topic 1     Topic 2  Topic 3  Topic 4  Topic 5    Topic 6  Topic 7  \
1     danc        book     seri    worst     john      enjoi   murder   
2     song        read   episod     wast  michael        saw   killer   
3    black     version   season    minut     jame        kid     hous   
4    white      script     anim   stupid     town  recommend   beauti   
5     sing       novel     last    zombi  western     wonder    becom   
6   number       adapt    minut    monei      big     beauti    woman   
7     rock        line      run  terribl      he'        fun      own   
8     band     product  cartoon     bore   robert        fan  mysteri   
9   school     dialogu   second  horribl      joe      alwai   viewer   
10    high  disappoint    final    laugh   action     famili      art   

    Topic 8      Topic 9   Topic 10  
1    famili          war     effect  
2       he'     american     action  
3       kid        human    specia

In [78]:
# PART C #

with open('sentiment.csv', 'r') as file:
    sent_row = file.readline().strip().split(',')

# create map from sentiment
sentiment = pd.DataFrame(sent_row, columns=['Sentiment'], index=range(1, len(sent_row) + 1))
sentiment = sentiment.to_numpy().astype(int)

topic_distribution = lda_filtered.transform(dtm_filtered_sparse)
dominant_topics = topic_distribution.argmax(axis=1)

reviews_topics_df = pd.DataFrame({'ReviewIndex': range(1, len(dominant_topics) + 1), 'DominantTopic': dominant_topics + 1})
grouped_reviews = reviews_topics_df.groupby('DominantTopic')['ReviewIndex'].apply(list)

top_500_reviews_per_topic = {}
for topic, reviews in grouped_reviews.items():
    top_500_reviews_per_topic[topic] = reviews[:500]


average_sentiments = {}
for topic, indices in top_500_reviews_per_topic.items():
    topic_sentiments = [sentiment[idx] for idx in indices]
    average_sentiments[topic] = sum(topic_sentiments) / len(topic_sentiments)

print(average_sentiments)


{1: array([0.552]), 2: array([0.554]), 3: array([0.528]), 4: array([0.47]), 5: array([0.528]), 6: array([0.508]), 7: array([0.532]), 8: array([0.508]), 9: array([0.46]), 10: array([0.492])}


In [109]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load the dataset
reviews = pd.read_csv('reviews.tsv', sep='\t')

X_positive = reviews[reviews['sentiment'] == 1]['review']
X_negative = reviews[reviews['sentiment'] == 0]['review']

vectorizer_positive = TfidfVectorizer(max_features=2073) #ensures we only model the 2073 most common words
vectorizer_negative = TfidfVectorizer(max_features=2073)

tfidf_matrix_positive = vectorizer_positive.fit_transform(X_positive)
tfidf_matrix_negative = vectorizer_negative.fit_transform(X_negative)

average_tfidf_positive = np.asarray(tfidf_matrix_positive.mean(axis=0)).flatten()
average_tfidf_negative = np.asarray(tfidf_matrix_negative.mean(axis=0)).flatten()
difference = np.abs(average_tfidf_positive - average_tfidf_negative)

feature_names_positive = vectorizer_positive.get_feature_names_out()
feature_names_negative = vectorizer_negative.get_feature_names_out()

results = pd.DataFrame({
    'Words': feature_names_positive + " - " + feature_names_negative,
    'Absolute_Difference': difference
})

results_sorted = results.sort_values(by='Absolute_Difference', ascending=False)

print(results_sorted)

                      Words  Absolute_Difference
1814          theatre - the         2.365198e-01
1812             the - that         1.764906e-01
89             and - amount         1.281453e-01
93              angry - and         1.118331e-01
250              brown - br         1.110023e-01
...                     ...                  ...
487    difference - details         6.397665e-06
127         army - artistic         5.714663e-06
117   appearance - appeared         4.896855e-06
1443         putting - puts         3.809540e-06
454       decides - dancing         8.775057e-07

[2073 rows x 2 columns]
