In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from itertools import combinations
import re
import nltk

# Download necessary NLTK resources
# Fetch the tokenizer model and stop-word corpus used by preprocess_text
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Preprocessing Function
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalize a single document for vectorization.

    Steps: lowercase, strip every character that is neither a word character
    nor whitespace, strip digit runs, tokenize with NLTK and drop English
    stop words.

    Args:
        text: The raw document. ``None`` and float NaN (how pandas marks a
            missing cell) are treated as an empty document; any other
            non-string value is converted with ``str()``.

    Returns:
        The remaining tokens joined by single spaces (possibly ``''``).
    """
    # Missing values must not crash the whole `.apply`; NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercase the text
    text = str(text).lower()
    # Remove punctuation and numbers.
    # NOTE(review): punctuation is stripped before stop-word filtering, so
    # contractions such as "don't" become "dont" and no longer match the
    # stop-word list -- confirm whether that is intended.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Remove stopwords (set is cached instead of being rebuilt per document)
    stop_words = _english_stop_words()
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Load Dataset
# Read the reviews and derive a cleaned copy of each review's text
df = pd.read_csv("university_of_oxford_tripadvisor_reviews.csv")
df['cleaned_text'] = df['text'].map(preprocess_text)


# Function to Extract Topics
def extract_topics(model, vectorizer, n_top_words=10):
    """Print and return the highest-weighted terms of every topic.

    Args:
        model: Fitted object exposing ``components_``, one row of term
            weights per topic.
        vectorizer: Fitted object whose ``get_feature_names_out()`` gives the
            term for each column of ``components_``.
        n_top_words: How many terms to keep per topic.

    Returns:
        A list with one list of terms per topic, ordered from the largest
        weight downwards.
    """
    feature_names = vectorizer.get_feature_names_out()
    all_topics = []
    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort reversed -> descending order, then keep the head
        ranked_columns = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[col] for col in ranked_columns]
        print(f"Topic {topic_idx + 1}: {', '.join(top_terms)}")
        all_topics.append(top_terms)
    return all_topics

# Function to Calculate Coherence Scores
def compute_coherence_score(topics, matrix, vocab):
    """Score each topic by how strongly its words co-occur in the corpus.

    The co-occurrence of two words is the dot product of their columns in
    ``matrix``. A topic's score is the mean of ``log(co_occurrence + 1)`` over
    those word pairs whose co-occurrence is positive, or 0.0 when no pair
    co-occurs (including topics with fewer than two words).

    Only the columns of words that actually appear in ``topics`` are
    multiplied, so the full vocabulary x vocabulary matrix is never built.

    Args:
        topics: Iterable of topics, each a sequence of words from ``vocab``.
        matrix: Document-term matrix (rows are documents, columns follow
            ``vocab``). May be a dense array-like or a SciPy sparse matrix.
        vocab: Sequence of words; position gives the column in ``matrix``.

    Returns:
        A list with one coherence score per topic, in input order.

    Raises:
        KeyError: If a topic contains a word that is not in ``vocab``.
    """
    topics = list(topics)
    vocab_to_idx = {word: idx for idx, word in enumerate(vocab)}

    # Distinct topic words in first-seen order; each gets a row/column in the
    # reduced co-occurrence matrix.
    topic_words = list(dict.fromkeys(word for topic in topics for word in topic))
    if not topic_words:
        return [0.0 for _ in topics]
    columns = [vocab_to_idx[word] for word in topic_words]
    position = {word: pos for pos, word in enumerate(topic_words)}

    if hasattr(matrix, 'tocsc'):
        # SciPy sparse input: slice columns in CSC form, densify only the
        # small (words x words) product.
        sub_matrix = matrix.tocsc()[:, columns]
        co_occurrence_matrix = (sub_matrix.T @ sub_matrix).toarray()
    else:
        sub_matrix = np.asarray(matrix)[:, columns]
        co_occurrence_matrix = sub_matrix.T @ sub_matrix

    def topic_coherence(topic):
        coherence = 0
        n_pairs = 0
        for word1, word2 in combinations(topic, 2):
            co_occurrence = co_occurrence_matrix[position[word1], position[word2]]
            if co_occurrence > 0:
                # +1 kept so scores match the established definition
                coherence += np.log(co_occurrence + 1)
                n_pairs += 1
        return coherence / n_pairs if n_pairs > 0 else 0.0

    return [topic_coherence(topic) for topic in topics]

# Bag of Words: document-term count matrix over the cleaned reviews
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(df['cleaned_text'])

# LSA with BoW: 5-component truncated SVD (fit returns the fitted estimator)
lsa_bow_model = TruncatedSVD(n_components=5, random_state=42).fit(bow_matrix)

# Extract Topics (also prints the top words of each topic)
lsa_bow_topics = extract_topics(lsa_bow_model, bow_vectorizer)

# Compute Coherence on the dense count matrix and report one score per topic
lsa_bow_coherence = compute_coherence_score(
    lsa_bow_topics,
    bow_matrix.toarray(),
    bow_vectorizer.get_feature_names_out(),
)
for topic_no, score in enumerate(lsa_bow_coherence, start=1):
    print(f"LSA (BoW) - Topic {topic_no} Coherence Score: {score}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pascal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pascal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Topic 1: oxford, university, colleges, college, visit, one, buildings, see, tour, many
Topic 2: oxford, university, city, cambridge, english, universities, students, town, world, teaching
Topic 3: college, university, colleges, church, merton, street, world, one, oldest, christ
Topic 4: oxford, tour, college, day, city, london, walking, train, trip, street
Topic 5: colleges, city, around, visitors, open, visit, walk, see, well, several
LSA (BoW) - Topic 1 Coherence Score: 5.578871626033516
LSA (BoW) - Topic 2 Coherence Score: 3.624961595470655
LSA (BoW) - Topic 3 Coherence Score: 4.640133441489119
LSA (BoW) - Topic 4 Coherence Score: 4.103232093165759
LSA (BoW) - Topic 5 Coherence Score: 4.22909551398272
