In [None]:
pip install praw

In [None]:
import datetime
import os

import pandas as pd
import praw

# Initialize PRAW with your Reddit API credentials.
# The values are read from environment variables so that secrets never have
# to be typed into (and saved with) the notebook. When a variable is unset the
# original empty-string placeholder is used, so existing behavior is preserved.
reddit = praw.Reddit(client_id=os.environ.get('REDDIT_CLIENT_ID', ''),
                     client_secret=os.environ.get('REDDIT_CLIENT_SECRET', ''),
                     user_agent=os.environ.get('REDDIT_USER_AGENT', ''))

# Let's define a function to scrape the top posts in a subreddit

def scrape_subreddit(subreddit_name, num_posts, keywords=('housing',), time_filter='year', client=None):
    """Collect the top posts of a subreddit, optionally filtered by keyword.

    Parameters
    ----------
    subreddit_name : str
        Name handed to ``client.subreddit`` (e.g. ``"Canada"``).
    num_posts : int
        Value passed as ``limit`` to the subreddit's ``top`` listing.
    keywords : str, iterable of str, or None, optional
        Case-insensitive substrings. A post is kept when at least one of them
        occurs in its title or self-text. Defaults to ``('housing',)``, which
        is only needed for general subreddits like r/Canada. Pass ``None`` or
        an empty iterable to keep every post.
    time_filter : str, optional
        Value passed as ``time_filter`` to the ``top`` listing
        (default ``'year'``).
    client : object, optional
        Object exposing ``subreddit(name)``; defaults to the module-level
        ``reddit`` client.

    Returns
    -------
    list of dict
        One dict per kept post with the keys ``"Title"``, ``"Score"``,
        ``"Content"`` (``None`` when the post has no self-text),
        ``"Number of Comments"``, ``"Post URL"`` and ``"Full URL"``.
    """
    if client is None:
        client = reddit

    # Normalise the keyword argument to a list of lowercase, non-empty needles.
    if keywords is None:
        needles = []
    elif isinstance(keywords, str):
        needles = [keywords.lower()] if keywords else []
    else:
        needles = [keyword.lower() for keyword in keywords if keyword]

    # Fetch the specified subreddit and its top posts for the requested period
    subreddit = client.subreddit(subreddit_name)
    top_posts = subreddit.top(limit=num_posts, time_filter=time_filter)

    data = []
    for post in top_posts:
        title = post.title
        # Link/image posts carry no self-text; treat a missing body as empty.
        body = post.selftext or ""

        if needles:
            title_lower = title.lower()
            body_lower = body.lower()
            if not any(needle in title_lower or needle in body_lower for needle in needles):
                continue

        data.append({
            "Title": title,
            "Score": post.score,
            "Content": body or None,
            "Number of Comments": post.num_comments,
            "Post URL": post.url,
            "Full URL": f"https://www.reddit.com{post.permalink}"
        })

    return data

if __name__ == "__main__":
    subreddit_name = "Canada"
    num_posts = 1000

    print(f"\nScraping top {num_posts} posts from r/{subreddit_name}...\n")
    scraped_data = scrape_subreddit(subreddit_name, num_posts)

    # Create DataFrame from scraped data.
    # The columns are listed explicitly (same keys and order as the scraped
    # dicts) so the frame keeps its schema even when no post matched; without
    # this an empty result has no 'Content' column and later cells fail with
    # a KeyError.
    post_columns = ["Title", "Score", "Content", "Number of Comments", "Post URL", "Full URL"]
    df = pd.DataFrame(scraped_data, columns=post_columns)

    # Print the first 10 rows of the DataFrame
    print(df.head(10))

In [None]:
pip install gensim

In [None]:
!pip install pyLDAvis

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim import corpora
from gensim.models import LdaModel
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Fetch the NLTK data used by the preprocessing step.
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# appear to load the tokenizer tables from it when word_tokenize is called
# (NOTE(review): confirm against the installed NLTK version).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Define stopwords and stemmer
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Define a function to preprocess text
def preprocess_text(text):
    """Turn raw text into a list of stemmed tokens.

    Non-string input (e.g. ``None`` or ``NaN`` for posts without content)
    yields an empty list. For strings, the text is tokenized with
    ``word_tokenize``; tokens that are not purely alphanumeric are dropped,
    the rest are lowercased, filtered against ``stop_words`` and stemmed
    with ``ps``.
    """
    # Anything that is not a string has no text to process.
    if not isinstance(text, str):
        return []

    stems = []
    for raw_token in word_tokenize(text):
        # Discard punctuation and other non-alphanumeric tokens
        if not raw_token.isalnum():
            continue
        lowered = raw_token.lower()
        # Discard stopwords (checked on the lowercased form)
        if lowered in stop_words:
            continue
        stems.append(ps.stem(lowered))
    return stems

# Define the number of topics
num_topics = 10

# Define the number of top words to display for each topic
n_top_words = 7

# Seed for the LDA training so that re-running the notebook reproduces the
# same topics (LDA is initialised randomly).
random_seed = 42

# Apply preprocessing to the 'Content' column
df['preprocessed_text'] = df['Content'].apply(preprocess_text)

# Create a dictionary from the preprocessed text
dictionary = corpora.Dictionary(df['preprocessed_text'])

# Create a bag of words representation of the data (document-term matrix)
corpus = [dictionary.doc2bow(tokens) for tokens in df['preprocessed_text']]

# Run the LDA algorithm
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10,
                     random_state=random_seed)

# Create a PDF to save the plot
with PdfPages('topics_and_top_words.pdf') as pdf:
    # Lay the topics out on a two-column grid whose row count follows
    # num_topics, so the figure still works when the topic count changes
    # (10 topics -> 5 rows and a 10x20 inch figure).
    n_cols = 2
    n_rows = (num_topics + n_cols - 1) // n_cols
    fig = plt.figure(figsize=(10, 4 * n_rows))

    topics = lda_model.show_topics(num_topics=num_topics, num_words=n_top_words, formatted=False)
    for topic_idx, (topic_id, word_probs) in enumerate(topics):
        # Extract the top words and their probabilities
        top_words = [word for word, _ in word_probs]
        top_word_probs = [prob for _, prob in word_probs]

        # Plot the top words for the current topic
        ax = fig.add_subplot(n_rows, n_cols, topic_idx + 1)
        ax.barh(top_words, top_word_probs, color='skyblue')
        ax.invert_yaxis()  # Most probable word at the top
        ax.set_title(f'Topic {topic_idx + 1}', fontsize=12)

    # Add space between subplots
    fig.tight_layout()

    # Save this figure to the PDF
    pdf.savefig(fig)

    # Show the plot
    plt.show()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Get the average topic proportions.
# A topic's proportion is its share of a document's topic distribution, so the
# average is taken over the documents of the corpus (not over the
# probabilities of a topic's top words, which measures something else).
topic_totals = [0.0] * num_topics
n_scored_docs = 0
for bow in corpus:
    # Documents without any tokens (e.g. link/image posts) carry no topic
    # information and are left out of the average.
    if not bow:
        continue
    n_scored_docs += 1
    # minimum_probability=0.0 asks for (nearly) the full distribution rather
    # than only the dominant topics.
    for topic_id, proportion in lda_model.get_document_topics(bow, minimum_probability=0.0):
        topic_totals[topic_id] += float(proportion)

average_topic_proportions = [
    total / n_scored_docs if n_scored_docs else 0.0
    for total in topic_totals
]
topic_labels = [f'Topic {i+1}' for i in range(len(average_topic_proportions))]

# Create a PDF to save the plot
with PdfPages('average_topic_proportions.pdf') as pdf:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(topic_labels, average_topic_proportions, color='skyblue')
    ax.set_title('Average Topic Proportions', fontsize=16)
    ax.set_xlabel('Topics', fontsize=14)
    ax.set_ylabel('Average Proportion', fontsize=14)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12)
    plt.setp(ax.get_yticklabels(), fontsize=12)
    fig.tight_layout()
    pdf.savefig(fig)
    plt.show()

In [None]:
# Define the topic number you want to search for.
# This is compared with the topic ids returned by get_document_topics, which
# presumably start at 0, whereas the bar-chart titles are 1-based
# ("Topic N" there would be id N - 1 here) - verify before interpreting.
topic_number = 3

# Find documents associated with the specified topic
topic_documents = []

# Iterate through each document
for i, doc in enumerate(corpus):
    # Documents without tokens (link/image posts) have no meaningful topic
    # distribution, so they are not considered as matches.
    if not doc:
        continue

    # Get the topic distribution for the current document
    topics_distribution = lda_model.get_document_topics(doc)

    # Check if the specified topic is one of the topics in the document
    for topic, score in topics_distribution:
        if topic == topic_number:
            # Append the document index and topic score to the list
            topic_documents.append((i, score))

# Sort the documents by their topic score
topic_documents.sort(key=lambda x: x[1], reverse=True)

# Print the top 2 documents associated with the specified topic
top_documents = 2  # Number of top documents to print
for i, score in topic_documents[:top_documents]:
    # i is a position in the corpus, so look the row up by position (.iloc)
    print(f"Document {i + 1} (Score: {score}): {df['Full URL'].iloc[i]} \n") # <= Changed from content to URL for r/Canada since they posted images/links/videos

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the visualization data.
# sort_topics=False keeps the model's own topic order; by default pyLDAvis
# re-sorts topics, which would make the topic numbers shown in the map
# disagree with the ones used in the other cells of this notebook.
vis_data = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)

# Display the intertopic distance map
pyLDAvis.display(vis_data)