In [None]:
(5 topics, 10 terms, top 10 words for each topic)

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from itertools import combinations
import re
import nltk

# Download necessary NLTK resources
# - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. Recent NLTK
#   releases load 'punkt_tab' instead of 'punkt', so both are requested to keep
#   the notebook runnable on a fresh environment regardless of NLTK version.
# - 'stopwords': the stop-word lists read by stopwords.words('english').
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Preprocessing Function
_STOP_WORDS = None  # English stop-word set, filled on first use by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stop words as a set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise one review into a space-separated string of content words.

    Steps, in order: lowercase, delete every character that is neither a word
    character nor whitespace, delete digit runs, tokenize with NLTK's
    ``word_tokenize`` and drop English stop words.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell, or ``None``) is treated
            as an empty review.

    Returns:
        str: The remaining tokens joined by single spaces; ``''`` when nothing
        is left or when ``text`` is not a string.
    """
    # Missing values arrive as float NaN / None and have no .lower();
    # map them to an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Lowercase the text
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # NOTE(review): punctuation is stripped before the stop-word filter, so a
    # contraction such as "don't" reaches the filter as "dont" -- confirm the
    # stop-word list in use covers the forms you expect to be removed.
    # Remove stopwords (the set is cached; it does not change between rows)
    stop_words = _english_stop_words()
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Load Dataset
# Replace 'university_of_oxford_tripadvisor_reviews.csv' with your file path if necessary
df = pd.read_csv("university_of_oxford_tripadvisor_reviews.csv")

# Preprocess Text
# pandas reads an empty review as NaN (a float). Replace those with '' and force
# str so preprocess_text never receives a value without .lower().
df['cleaned_text'] = df['text'].fillna('').astype(str).apply(preprocess_text)

# Create Bag of Words Representation
# bow_matrix has one row per review and one column per vocabulary word;
# vocab[i] is the word counted in column i.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['cleaned_text'])
vocab = vectorizer.get_feature_names_out()

# Fit LDA Model
n_topics = 5  # Number of topics
# random_state is fixed so repeated runs fit the same topics.
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=200)
lda_model.fit(bow_matrix)

# Extract Topics
def extract_topics(lda_model, feature_names, n_top_words=5):
    """Print and return the highest-weighted words of every topic.

    Args:
        lda_model: Object exposing ``components_``, iterated row by row; each
            row must support ``argsort()`` and holds one weight per feature.
        feature_names: Sequence mapping a feature index to its word.
        n_top_words: How many words to keep per topic, strongest first.

    Returns:
        list[list]: One list of words per topic, in the order of
        ``lda_model.components_``.
    """
    # Rank feature indices from heaviest to lightest, then keep the leading ones.
    ranked_indices = (
        weights.argsort()[::-1][:n_top_words]
        for weights in lda_model.components_
    )
    topics = [[feature_names[i] for i in indices] for indices in ranked_indices]

    # Report the topics with 1-based numbering.
    for topic_idx, top_words in enumerate(topics):
        print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")
    return topics

# Print the five strongest words of each fitted topic and keep them for scoring
topics = extract_topics(lda_model, feature_names=vocab, n_top_words=5)

# Coherence Score Calculation
def compute_coherence_score(topics, bow_matrix, vocab):
    """Score each topic by how strongly its words co-occur in the corpus.

    For two words the co-occurrence weight is the matching entry of
    ``bow_matrix.T @ bow_matrix``, i.e. the sum over documents of the product
    of the two words' counts. A topic's score is the mean of
    ``log(weight + 1)`` over ALL unordered word pairs of the topic, so a pair
    that never co-occurs contributes 0 and lowers the score.

    NOTE(review): the weights are term-frequency products, not document
    counts, and the score is not normalised by word frequency -- confirm this
    is the intended coherence definition before comparing with published
    measures.

    Args:
        topics: Iterable of topics, each a sequence of words from ``vocab``.
        bow_matrix: Document-term count matrix (documents x vocabulary), either
            a NumPy array or a SciPy sparse matrix.
        vocab: Sequence of words; ``vocab[i]`` labels column ``i``.

    Returns:
        list[float]: One score per topic, in input order. A topic with fewer
        than two words scores ``0.0``.

    Raises:
        KeyError: If a topic contains a word that is not in ``vocab``.
    """
    vocab_to_idx = {word: idx for idx, word in enumerate(vocab)}

    # Only columns of words that appear in some topic are ever read, so the
    # product is built for those columns instead of the whole vocabulary.
    topic_words = list(dict.fromkeys(word for topic in topics for word in topic))
    if not topic_words:
        return [0.0 for _ in topics]
    columns = [vocab_to_idx[word] for word in topic_words]
    position = {word: pos for pos, word in enumerate(topic_words)}

    # Column slicing is supported/cheap on CSC; dense arrays have no tocsc().
    if hasattr(bow_matrix, 'tocsc'):
        bow_matrix = bow_matrix.tocsc()
    sub_matrix = bow_matrix[:, columns]

    # '@' is a true matrix product for NumPy arrays and SciPy sparse types
    # alike, which np.dot does not guarantee for sparse inputs.
    co_occurrence_matrix = sub_matrix.T @ sub_matrix
    if hasattr(co_occurrence_matrix, 'toarray'):
        co_occurrence_matrix = co_occurrence_matrix.toarray()
    co_occurrence_matrix = np.asarray(co_occurrence_matrix)

    def topic_coherence(topic):
        """Mean of log(co-occurrence + 1) over every word pair of one topic."""
        pair_scores = [
            np.log(co_occurrence_matrix[position[word1], position[word2]] + 1)  # +1 to avoid log(0)
            for word1, word2 in combinations(topic, 2)
        ]
        if not pair_scores:
            return 0.0
        return float(sum(pair_scores) / len(pair_scores))

    return [topic_coherence(topic) for topic in topics]

# Score every extracted topic, then report one line per topic (1-based numbering)
coherence_scores = compute_coherence_score(
    topics=topics,
    bow_matrix=bow_matrix,
    vocab=vocab,
)
for idx, score in enumerate(coherence_scores):
    print(f"Topic {idx + 1} Coherence Score: {score}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pascal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pascal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Topic 1: oxford, university, one, city, day
Topic 2: tour, oxford, university, walking, guide
Topic 3: see, university, visit, harry, potter
Topic 4: colleges, place, college, visitors, amazing
Topic 5: oxford, university, colleges, buildings, visit
Topic 1 Coherence Score: 5.63719159358808
Topic 2 Coherence Score: 5.388683344093934
Topic 3 Coherence Score: 4.9475448521904655
Topic 4 Coherence Score: 4.076473294769559
Topic 5 Coherence Score: 6.113906365989224
