In [13]:
import re
from functools import lru_cache
from itertools import combinations

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Preprocessing Function
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one document for vectorisation.

    The text is lowercased, stripped of punctuation and digits, tokenised
    with NLTK's ``word_tokenize`` and re-joined with single spaces after
    dropping English stop words.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the float NaN pandas uses for a missing cell, or ``None``) is
            treated as an empty document.

    Returns:
        The cleaned text as a single space-separated string; ``''`` for
        non-string input.
    """
    # Missing values have no .lower(); treat them as empty documents instead
    # of raising AttributeError in the middle of a DataFrame.apply().
    if not isinstance(text, str):
        return ''

    # Lowercase the text
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Remove stopwords (the set is cached, not rebuilt for every document)
    stop_words = _english_stop_words()
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)


# Load Dataset
df = pd.read_csv("university_of_oxford_tripadvisor_reviews.csv")
# read_csv turns empty cells (and, by default, NA-like strings) into NaN,
# which is a float and has no .lower(); substitute an empty document so a
# single missing review cannot abort the whole preprocessing pass.
df['cleaned_text'] = df['text'].fillna('').apply(preprocess_text)

# TF-IDF
# Fit the vocabulary on the cleaned reviews and build the document-term matrix.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

# Function to Calculate Coherence Scores
def compute_coherence_score(topics, matrix, vocab):
    """Score each topic by the mean log co-occurrence of its word pairs.

    For every unordered pair of words in a topic, the pair's co-occurrence is
    the dot product of the two words' columns in ``matrix``. Pairs with a
    positive value contribute ``log(value + 1)``; the topic score is the mean
    over those contributing pairs only (pairs that never co-occur are not
    counted in the denominator).

    Note that the "co-occurrence" is whatever the column dot product yields:
    for a TF-IDF matrix it is a sum of weight products, not a document count.

    Args:
        topics: Iterable of topics, each an iterable of words.
        matrix: Document-term matrix (documents x vocabulary), either a dense
            NumPy array or a SciPy sparse matrix.
        vocab: Sequence of words; position ``i`` names column ``i`` of
            ``matrix``.

    Returns:
        A list with one score per topic, in input order. A topic with no
        co-occurring pair (including empty and single-word topics) scores 0.

    Raises:
        KeyError: If a word taking part in a pair is not in ``vocab``.
    """
    vocab_to_idx = {word: idx for idx, word in enumerate(vocab)}
    topics = [list(topic) for topic in topics]

    # Only columns of words that occur in some topic are ever read, so build
    # the product for that small sub-block instead of a dense vocab x vocab
    # matrix (which grows quadratically with vocabulary size).
    needed = sorted({
        vocab_to_idx[word]
        for topic in topics
        for word in topic
        if word in vocab_to_idx
    })
    if needed:
        sub = matrix[:, needed]
        # ``@`` works for both ndarrays and SciPy sparse matrices, unlike
        # np.dot, which does not multiply sparse operands correctly.
        co_occurrence_matrix = sub.T @ sub
        if hasattr(co_occurrence_matrix, "toarray"):
            co_occurrence_matrix = co_occurrence_matrix.toarray()
        co_occurrence_matrix = np.asarray(co_occurrence_matrix)
    else:
        co_occurrence_matrix = np.zeros((0, 0))
    # Map an original column index to its row/column in the sub-block.
    col_pos = {col: pos for pos, col in enumerate(needed)}

    def topic_coherence(topic):
        coherence = 0
        n_pairs = 0
        for word1, word2 in combinations(topic, 2):
            # Unknown words raise KeyError here, as they always have.
            pos1 = col_pos[vocab_to_idx[word1]]
            pos2 = col_pos[vocab_to_idx[word2]]
            co_occurrence = co_occurrence_matrix[pos1, pos2]
            if co_occurrence > 0:
                # +1 keeps tiny positive values from yielding large negatives
                coherence += np.log(co_occurrence + 1)
                n_pairs += 1
        return coherence / n_pairs if n_pairs > 0 else 0

    return [topic_coherence(topic) for topic in topics]

# Function to Extract Topics
def extract_topics(model, vectorizer, n_top_words=5):
    """Print and return the highest-weighted words of every topic in a model.

    Args:
        model: Fitted object exposing ``components_``, one weight vector per
            topic, each supporting ``argsort()``.
        vectorizer: Object whose ``get_feature_names_out()`` returns the
            vocabulary indexed like the weight vectors.
        n_top_words: How many words to keep per topic.

    Returns:
        A list with one entry per topic; each entry is the list of that
        topic's top words, ordered from highest to lowest weight.
    """
    feature_names = vectorizer.get_feature_names_out()
    all_topics = []
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[i] for i in ranked]
        all_topics.append(words)
        print(f"Topic {number}: {', '.join(words)}")
    return all_topics

# LDA with TF-IDF
# Fit a 5-topic LDA model on the TF-IDF matrix; the fixed seed keeps the
# topics reproducible between runs.
lda_tfidf_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_tfidf_model.fit(tfidf_matrix)

# Extract Topics (also prints the top words of each topic)
lda_tfidf_topics = extract_topics(lda_tfidf_model, tfidf_vectorizer)

# Compute Coherence
# The scorer is handed a dense copy of the matrix plus the matching vocabulary.
tfidf_dense = tfidf_matrix.toarray()
tfidf_vocab = tfidf_vectorizer.get_feature_names_out()
lda_tfidf_coherence = compute_coherence_score(lda_tfidf_topics, tfidf_dense, tfidf_vocab)
for topic_no, score in enumerate(lda_tfidf_coherence, start=1):
    print(f"LDA (TF-IDF) - Topic {topic_no} Coherence Score: {score}")


Topic 1: architecture, oxford, visit, tour, see
Topic 2: oxford, colleges, university, city, history
Topic 3: oxford, university, visit, colleges, place
Topic 4: oxford, university, around, colleges, tour
Topic 5: university, history, oxford, buildings, tour
LDA (TF-IDF) - Topic 1 Coherence Score: 0.734057596642711
LDA (TF-IDF) - Topic 2 Coherence Score: 1.0277233071725544
LDA (TF-IDF) - Topic 3 Coherence Score: 1.0267885819490714
LDA (TF-IDF) - Topic 4 Coherence Score: 1.0090933521405767
LDA (TF-IDF) - Topic 5 Coherence Score: 0.9618031720746231
