In [None]:
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from PIL import Image
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Load the preprocessed Reddit and Twitter datasets from CSV files that are
# expected to sit in the notebook's working directory.
REDDIT_POSTS_CSV = 'preprocessed_reddit_posts.csv'
TWITTER_TWEETS_CSV = 'preprocessed_twitter_tweets.csv'

preprocessed_reddit_posts = pd.read_csv(REDDIT_POSTS_CSV)
preprocessed_twitter_tweets = pd.read_csv(TWITTER_TWEETS_CSV)

In [None]:
# Summary statistics for Reddit data.
# With default arguments and a frame of mixed dtypes, describe() summarises the
# numeric columns (count, mean, std, min, quartiles, max). Being the last
# expression in the cell, the resulting table is rendered as the cell output.
preprocessed_reddit_posts.describe()

In [None]:
# Information about Reddit posts.
# info() prints the column names, non-null counts, dtypes and memory usage;
# the non-null counts show which columns contain missing values.
preprocessed_reddit_posts.info()

In [None]:
# Summary statistics for Twitter tweets.
# With default arguments and a frame of mixed dtypes, describe() summarises the
# numeric columns (count, mean, std, min, quartiles, max). Being the last
# expression in the cell, the resulting table is rendered as the cell output.
preprocessed_twitter_tweets.describe()

In [None]:
# Info about Twitter tweets data.
# info() prints the column names, non-null counts, dtypes and memory usage;
# the non-null counts show which columns contain missing values.
preprocessed_twitter_tweets.info()

In [None]:
# Histogram of Reddit post scores (matplotlib's default binning).
fig, ax = plt.subplots()
ax.hist(preprocessed_reddit_posts['score'])
ax.set_title('Distribution of Reddit post scores')
ax.set_xlabel('Score')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Scatter plot relating each Reddit post's score (x) to its comment count (y).
fig, ax = plt.subplots()
reddit_scores = preprocessed_reddit_posts['score']
reddit_comment_counts = preprocessed_reddit_posts['num_comments']
ax.scatter(reddit_scores, reddit_comment_counts)
ax.set_title('Reddit post scores vs. number of comments')
ax.set_xlabel('Score')
ax.set_ylabel('Number of comments')
plt.show()

In [None]:
# Build a single Series of text: "<title> <body>" for every Reddit post,
# followed by the tweet_text of every tweet.
#
# Missing values are replaced with "" before concatenating. In pandas,
# `str + NaN` evaluates to NaN, so a post with a missing body would otherwise
# lose its title as well, and the resulting float NaN would later make
# re.sub() raise a TypeError (it only accepts strings).
reddit_text = (
    preprocessed_reddit_posts["title"].fillna("").astype(str)
    + " "
    + preprocessed_reddit_posts["body"].fillna("").astype(str)
)
twitter_text = preprocessed_twitter_tweets["tweet_text"].fillna("").astype(str)

# NOTE(review): despite its name, merged_df is a Series, and it keeps the
# original row labels of both inputs, so index values repeat across the Reddit
# and Twitter parts. Pass ignore_index=True here if unique labels are needed.
merged_df = pd.concat([reddit_text, twitter_text])

# Define a regular expression pattern to match unwanted characters
# (a single character class of punctuation symbols to strip from the text).
unwanted_pattern = r"[!@&\\.;:,/\|()_{}\"\'\[\]]"

In [None]:
# Count how often each word occurs across all merged Reddit/Twitter texts.

# Words shorter than this many characters are dropped.
MIN_WORD_LENGTH = 4
# Words to drop regardless of length. "amp" is presumably the residue of the
# HTML entity "&amp;" once "&" and ";" are stripped -- TODO confirm. It is
# already excluded by MIN_WORD_LENGTH but kept so the list is easy to extend.
EXCLUDED_WORDS = {"amp"}

word_counter = Counter()
for text in merged_df:
  # Missing values show up as float NaN; re.sub() only accepts strings and
  # would raise a TypeError, so skip anything that is not a string.
  if not isinstance(text, str):
    continue
  # Remove unwanted characters, then split on whitespace into words
  text_words = re.sub(unwanted_pattern, "", text).split()
  # Keep words that are long enough and not explicitly excluded
  word_counter.update(
    w for w in text_words
    if len(w) >= MIN_WORD_LENGTH and w not in EXCLUDED_WORDS
  )

In [None]:
# Render the 300 most frequent words as an 800x400 word cloud and save it as a PNG.
# (WordCloud is already imported in the notebook's first cell.)
top_word_frequencies = dict(word_counter.most_common(300))
cloud = WordCloud(width=800, height=400).generate_from_frequencies(top_word_frequencies)
image = cloud.to_image()
image.save("wordcloud.png")

In [None]:
# Render the 300 most frequent words inside the shape given by elephant.png
# and save the result as wordcloud1.png.
# (WordCloud, Image and np are imported in the notebook's first cell.)

# Load logo image. The context manager closes the underlying file once the
# pixel data has been copied into the NumPy array.
with Image.open("elephant.png") as logo_image:
  logo_mask = np.array(logo_image)

# Create WordCloud object. width/height are intentionally not passed: when a
# mask is supplied, WordCloud ignores them and uses the mask's shape instead.
cloud = WordCloud(mask=logo_mask)

# Generate wordcloud from word frequencies
cloud.generate_from_frequencies(dict(word_counter.most_common(300)))

# Convert to image
image = cloud.to_image()

# Save image
image.save("wordcloud1.png")

In [None]:
# Persist the merged Reddit + Twitter text to merged_df.csv in the working
# directory. With default arguments to_csv() also writes the index as the
# first column.
merged_df.to_csv('merged_df.csv')