In [65]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
import re
import numpy as np
from prettytable import PrettyTable
import matplotlib.pyplot as plt

# Initialize the NLTK WordNet Lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# List of custom words to be excluded from TF-IDF analysis
custom_exclude_words = ['like', 'im', 'dont', 'use', 'skyrim', 'dont', 'ive', 'using', 'need', 'fo4',
                       'mod', 'mods', 'game', 'x200b', 'know', 'want', 'looking', 'make', 'work', 'good',
                       'really', 'way', 'play', 'time', 'tried', 'order', 'modpack', 'trying', 'got', 'playing', 
                       'doesnt', 'vanilla', 'start', 'working', 'wondering', 'look', 'thanks', 'installed',
                       'think', 'file','load', 'thing', 'sure', 'try', 'menu', 'better']

def extract_sentences_with_term(text, term):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    sentences_with_term = [sentence for sentence in sentences if term.lower() in sentence.lower()]
    return sentences_with_term

def lemmatize_text(text):
    return ' '.join([wordnet_lemmatizer.lemmatize(word) for word in text.split()])

with open('combined_data_CD.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract titles, self_text, comments, and replies from each post
documents = []
for post in data:
    # Check if "comments" key exists and is a list
    if "comments" in post and isinstance(post["comments"], list):
        comments_body = [comment["body"] for comment in post["comments"]]
        
        # Check if "replies" key exists in each comment and is a list
        replies_body = [
            reply["body"]
            for comment in post["comments"]
            if "replies" in comment and isinstance(comment["replies"], list)
            for reply in comment["replies"]
        ]
        
        # Combine title, self_text, comments, and replies
        document = post["title"] + " " + post["self_text"] + " ".join(comments_body + replies_body)
        documents.append(lemmatize_text(document))
    else:
        # If "comments" key is missing or not a list, use only title and self_text
        document = post["title"] + " " + post["self_text"]
        documents.append(lemmatize_text(document))

# Get the default English stop words
english_stop_words = TfidfVectorizer(stop_words='english').get_stop_words()

# Combine default English stop words with custom words
exclude_words = list(set(english_stop_words).union(set(custom_exclude_words)))

# Create a TfidfVectorizer with combined stop words
tfidf_vectorizer = TfidfVectorizer(stop_words=exclude_words)

# Fit and transform the data
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Get feature names (words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Optional: Convert the TF-IDF matrix to a dense array for easier handling
dense_tfidf_matrix = tfidf_matrix.toarray()

# Dictionary to store TF-IDF scores for each term
term_tfidf_dict = dict(zip(feature_names, np.sum(dense_tfidf_matrix, axis=0)))

# Sort terms based on their TF-IDF scores in descending order
sorted_terms = sorted(term_tfidf_dict.items(), key=lambda x: x[1], reverse=True)

# Extract top 20 terms
top_n = 20
top_terms = [term[0] for term in sorted_terms[:top_n]]

# Dictionary to store mean sentiment scores for each term
term_sentiments = {}

# Create a table to print TF-IDF terms, TF-IDF scores, and mean sentiment scores
table = PrettyTable()
table.field_names = ["TF-IDF Term", "TF-IDF Score", "Mean Sentiment Score"]

# Lists to store data for scatter plot
tfidf_scores = []
sentiment_scores = []
term_names = []

# Iterate through each term and calculate mean sentiment
for term in top_terms:
    # Use the original term for printing, but use the lemmatized version for sentiment analysis
    original_term = term
    term = lemmatize_text(term)

    sentences_with_term = []
    for post in data:
        combined_text = post["title"] + " " + post["self_text"]
        
        # Include comments and replies for sentiment analysis
        if "comments" in post and isinstance(post["comments"], list):
            combined_text += " ".join(comment["body"] for comment in post["comments"])
            
            for comment in post["comments"]:
                if "replies" in comment and isinstance(comment["replies"], list):
                    combined_text += " ".join(reply["body"] for reply in comment["replies"])

        sentences_with_term.extend(extract_sentences_with_term(combined_text, term))

    sentiment_scores = [TextBlob(sentence).sentiment.polarity for sentence in sentences_with_term]
    mean_sentiment = sum(sentiment_scores) / len(sentiment_scores) if sentiment_scores else None
    tfidf_score = term_tfidf_dict[original_term]

    term_sentiments[original_term] = mean_sentiment
    table.add_row([original_term, tfidf_score, mean_sentiment])
    
    # Append data for scatter plot
    tfidf_scores.append(tfidf_score)
    sentiment_scores.append(mean_sentiment)
    
    
# Print the table
print("Top 20 TF-IDF Terms, Their TF-IDF Scores, and Mean Sentiment Scores:")
print(table)



Top 20 TF-IDF Terms, Their TF-IDF Scores, and Mean Sentiment Scores:
+-------------+--------------------+----------------------+
| TF-IDF Term |    TF-IDF Score    | Mean Sentiment Score |
+-------------+--------------------+----------------------+
|     help    | 70.46715072587402  | 0.08754857725708816  |
|    patch    | 70.26160730078566  | 0.10968910391311423  |
|     new     |  66.0696908496249  | 0.10852102132677166  |
|   version   | 63.09831220287657  |  0.1092685866833221  |
|     add     | 59.927751778157145 |  0.1124964082111929  |
|    armor    |  55.9795217408617  | 0.11610211008925439  |
|    crash    | 53.992563780176035 | 0.07865249317719923  |
|    weapon   | 52.54324854156406  | 0.11505172502433282  |
|    issue    | 51.53961491125254  | 0.08015527137970872  |
|     pack    | 49.99589592024177  | 0.11797027220805398  |
|    quest    | 48.26193384805999  | 0.11497342089290091  |
|   overhaul  | 48.22443928706439  | 0.13206520304654745  |
|     fix     | 46.309097261765

In [None]:
import matplotlib.pyplot as plt

# Create a scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(tfidf_score, sentiment_scores, c='blue', label='Terms')
plt.title('Scatter Plot of TF-IDF Scores vs Mean Sentiment Scores')
plt.xlabel('TF-IDF Scores')
plt.ylabel('Mean Sentiment Scores')

# Annotate each point with the term name
for i, term_name in enumerate(term_names):
    plt.annotate(term_name, (tfidf_scores[i], sentiment_scores[i]), textcoords="offset points", xytext=(0, 5), ha='center')

plt.legend()
plt.grid(True)
plt.show()

In [54]:
from sklearn.decomposition import LatentDirichletAllocation

# Prepare the data for LDA
lda_vectorizer = TfidfVectorizer(stop_words=exclude_words)
lda_matrix = lda_vectorizer.fit_transform(documents)

# Fit LDA model
num_topics = 2  # You can adjust the number of topics
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(lda_matrix)

# Display the top terms for each topic
top_topic_terms = []
for topic_idx, topic in enumerate(lda_model.components_):
    top_terms_idx = topic.argsort()[:-10 - 1:-1]
    top_terms_values = [feature_names[i] for i in top_terms_idx]
    top_topic_terms.append(top_terms_values)

# Print the top terms for each topic
for i, terms in enumerate(top_topic_terms):
    print(f"\nTopic {i + 1}: {', '.join(terms)}")




Topic 1: 254, fe, overhaulesp, sizet, rotaiting, dayz, rtti, zk383, cprogram, electroblobs

Topic 2: help, patch, new, version, add, armor, crash, weapon, issue, pack


In [55]:
import numpy as np



# Extract terms common to both LDA and TF-IDF
common_terms = set(top_terms) & set(term_tfidf_dict.keys())

# Create arrays to store TF-IDF scores and LDA topic weights for common terms
tfidf_scores = np.array([term_tfidf_dict[term] for term in common_terms])
lda_weights = np.zeros((len(common_terms), num_topics))

# Populate the LDA weights array
for i, term in enumerate(common_terms):
    for topic_idx, topic_terms in enumerate(top_topic_terms):
        if term in topic_terms:
            lda_weights[i, topic_idx] = 1  # Assign a weight of 1 if the term is present in the topic

# Calculate covariance matrix
covariance_matrix = np.cov(tfidf_scores, lda_weights, rowvar=False)

# Extract the covariance between TF-IDF scores and each LDA topic
covariance_tfidf_lda = covariance_matrix[0, 1:]

# Print the covariance values
print("\nCovariance between TF-IDF and LDA:")
for topic_idx, cov_value in enumerate(covariance_tfidf_lda):
    print(f"Topic {topic_idx + 1}: {cov_value}")



Covariance between TF-IDF and LDA:
Topic 1: 0.0
Topic 2: 4.2760927777349025


In [57]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

# Convert the documents to a format suitable for Gensim
tokenized_documents = [doc.split() for doc in documents]
id2word = Dictionary(tokenized_documents)
corpus = [id2word.doc2bow(text) for text in tokenized_documents]

# Fit Gensim LDA model
lda_model_gensim = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=42)

# Calculate coherence score
coherence_model_lda = CoherenceModel(model=lda_model_gensim, texts=tokenized_documents, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'\nTopic Coherence Score: {coherence_lda}')



Topic Coherence Score: 0.4735215430424002
