In [None]:
!pip install praw

In [None]:
import datetime
import os

import pandas as pd
import praw
# Initialize PRAW with your Reddit API credentials.
# The credentials are read from environment variables so that secrets never have
# to be typed into (and accidentally shared with) the notebook itself. Set
# REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET and REDDIT_USER_AGENT before starting
# the kernel; a variable that is not set falls back to an empty placeholder.
reddit = praw.Reddit(
    client_id=os.environ.get('REDDIT_CLIENT_ID', ''),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET', ''),
    user_agent=os.environ.get('REDDIT_USER_AGENT', ''),
)

In [None]:
# Choose the subreddit you want to scrape
subreddit = reddit.subreddit('canadahousing')

# Retrieve the 100 newest posts from the subreddit (the .new listing, not the top-scored one)
newest_posts = subreddit.new(limit=100)

# Collect one flat record (dict) per post; the DataFrame is built once at the end
data = []

for post in newest_posts:
    # Convert the post's created_utc timestamp with an explicit UTC timezone;
    # datetime.utcfromtimestamp() is deprecated since Python 3.12.
    created_at = datetime.datetime.fromtimestamp(post.created_utc, tz=datetime.timezone.utc)
    post_data = {
        "Title": post.title,
        "Score": post.score,
        # An empty self-text is stored as None rather than ''
        "Content": post.selftext if post.selftext else None,
        "Number of Comments": post.num_comments,
        "Post URL": post.url,
        "Full URL": f"https://www.reddit.com{post.permalink}",
        "Date": created_at.strftime('%Y-%m-%d %H:%M:%S'),
    }
    data.append(post_data)

df_new_posts = pd.DataFrame(data)

# Print 10 observations from the DataFrame
print(df_new_posts.head(10))

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud  # Library for generating word clouds
import matplotlib.pyplot as plt  # Library for plotting graphs

# Merge every post title into one space-separated string and render it as a word cloud
text = ' '.join(df_new_posts['Title'])
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(text)

# Draw on an explicit Axes; the axis frame is hidden because only the image matters
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud representing r/canadahousing post titles')

plt.show()

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the 'vader_lexicon' resource through NLTK's downloader.
# NOTE(review): presumably this is the lexicon SentimentIntensityAnalyzer needs - confirm.
nltk.download('vader_lexicon')

In [None]:
# Initialize the VADER sentiment analyzer.
# This single module-level instance is the one classify_sentiment reads, so it
# must exist before that function is called.
analyzer = SentimentIntensityAnalyzer()

# Function to classify sentiment
def classify_sentiment(text, negative_threshold=0.0, positive_threshold=0.3):
    """Label ``text`` as 'Negative', 'Neutral' or 'Positive' from its compound score.

    The score is ``analyzer.polarity_scores(text)['compound']``, where ``analyzer``
    is the module-level sentiment analyzer.

    Parameters
    ----------
    text : str
        Text to score (post titles in this notebook).
    negative_threshold : float, optional
        Scores strictly below this value are labelled 'Negative'. Defaults to 0.0.
    positive_threshold : float, optional
        Scores strictly above this value are labelled 'Positive'. Defaults to 0.3.
        Scores from ``negative_threshold`` up to and including
        ``positive_threshold`` are labelled 'Neutral'.

    Returns
    -------
    str
        One of 'Negative', 'Neutral' or 'Positive'.

    Raises
    ------
    ValueError
        If ``negative_threshold`` is greater than ``positive_threshold``.

    Notes
    -----
    The default cut-offs are asymmetric: any score below zero counts as negative,
    while a score must exceed 0.3 to count as positive.
    """
    if negative_threshold > positive_threshold:
        raise ValueError(
            "negative_threshold must not be greater than positive_threshold"
        )

    compound = analyzer.polarity_scores(text)['compound']
    if compound < negative_threshold:
        return 'Negative'
    if compound <= positive_threshold:
        return 'Neutral'
    return 'Positive'

# Label every post title with its sentiment class and keep the labels in a new column
df_new_posts['Sentiment Classification'] = df_new_posts['Title'].map(classify_sentiment)

# Show the first 10 titles next to their labels
print(df_new_posts.loc[:, ['Title', 'Sentiment Classification']].head(10))

In [None]:
# Marker colour for each sentiment class (also read by the later scatter plot)
colors = dict(Positive='green', Negative='red', Neutral='blue')

# One scatter series per sentiment class so each class gets its own legend entry
fig, ax = plt.subplots(figsize=(8, 6))
for sentiment in colors:
    subset = df_new_posts.loc[df_new_posts['Sentiment Classification'].eq(sentiment)]
    ax.scatter(
        subset['Number of Comments'],
        subset['Score'],
        color=colors[sentiment],
        label=sentiment,
        alpha=0.5,
    )
ax.set_title('Relationship between Number of Comments and Score (Colored by Sentiment)')
ax.set_xlabel('Number of Comments')
ax.set_ylabel('Score')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Second Dataframe
# Define the subreddit
subreddit = reddit.subreddit('canada')

# Retrieve up to 1000 of the newest posts from the subreddit (the .new listing, not the top-scored one)
newest_posts = subreddit.new(limit=1000)

# Collect one flat record (dict) per matching post; the DataFrame is built once at the end
data = []

for post in newest_posts:
    # Keep only posts whose title or self-text mentions the keyword 'housing' (case-insensitive)
    if 'housing' in post.title.lower() or 'housing' in post.selftext.lower():
        # Convert the post's created_utc timestamp with an explicit UTC timezone;
        # datetime.utcfromtimestamp() is deprecated since Python 3.12.
        created_at = datetime.datetime.fromtimestamp(post.created_utc, tz=datetime.timezone.utc)
        post_data = {
            "Title": post.title,
            "Score": post.score,
            # An empty self-text is stored as None rather than ''
            "Content": post.selftext if post.selftext else None,
            "Number of Comments": post.num_comments,
            "Post URL": post.url,
            "Full URL": f"https://www.reddit.com{post.permalink}",
            "Date": created_at.strftime('%Y-%m-%d %H:%M:%S'),
        }
        data.append(post_data)

# Create DataFrame from the filtered data.
# The columns are named explicitly so the frame keeps its schema even when no post
# matched the keyword; later cells can then still select df['Title'] etc.
df = pd.DataFrame(
    data,
    columns=[
        "Title",
        "Score",
        "Content",
        "Number of Comments",
        "Post URL",
        "Full URL",
        "Date",
    ],
)

# Print 10 observations from the DataFrame
print(df.head(10))

In [None]:
# Build the word cloud only when the keyword filter actually matched some posts:
# an empty frame provides no title text, and WordCloud raises an error when it is
# asked to lay out zero words.
if df.empty:
    print('No matching r/canada posts were found; skipping the word cloud.')
else:
    # Merge every post title into one space-separated string
    text = ' '.join(df['Title'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    # Hide the axis frame; only the image matters
    plt.axis('off')

    plt.title('Word Cloud representing r/canada post titles')

    plt.show()

In [None]:
# Label each r/canada post title with its sentiment class
title_column = 'Title'
label_column = 'Sentiment Classification'
df[label_column] = df[title_column].map(classify_sentiment)

# Show the first 10 titles next to their labels
print(df[[title_column, label_column]].head(10))

In [None]:
# One scatter series per sentiment class, drawn on an explicit Axes;
# `colors` is the sentiment -> colour mapping defined for the earlier scatter plot.
fig, ax = plt.subplots(figsize=(8, 6))
sentiment_labels = df['Sentiment Classification']
for sentiment, color in colors.items():
    subset = df.loc[sentiment_labels == sentiment]
    ax.scatter(
        subset['Number of Comments'],
        subset['Score'],
        color=color,
        label=sentiment,
        alpha=0.5,
    )
ax.set(
    title='Relationship between Number of Comments and Score (Colored by Sentiment)',
    xlabel='Number of Comments',
    ylabel='Score',
)
ax.legend()
ax.grid(True)
plt.show()