In [None]:
import csv

In [None]:
import pandas as pd

In [None]:
# Load the dataset from a CSV file in the working directory into `df`.
# Later cells read these columns from it: 'Year', 'Upvote Count', 'Comment Count',
# 'Upvote Ratio', 'Type', 'Sentiment', 'Post Flair', 'Post Title', 'Post Author', 'Post ID'.
df = pd.read_csv('edge_list.csv')

In [None]:
# Pull in pyplot, the plotting interface used below
import matplotlib.pyplot as plt

# Bucket the rows of df by the value in their 'Year' column
grouped_data = df.groupby('Year')

# Per-year tally of rows (labelled as posts on the y-axis)
posts_per_year = grouped_data['Year'].count()

# Draw on an explicit 10x6 inch figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))

# Connected line with a round marker at every year
ax.plot(posts_per_year, marker='o')

# Axis labels and heading
ax.set_xlabel('Year')
ax.set_ylabel('Number of Posts')
ax.set_title('Number of Reddit Posts Over Time')

# Background grid for easier reading
ax.grid(True)

# Render the figure
plt.show()

In [None]:
# Pull in pyplot, the plotting interface used below
import matplotlib.pyplot as plt

# Bucket the rows of df by the value in their 'Year' column
grouped_data = df.groupby('Year')

# Column to average per year, paired with the legend entry for its line
metric_labels = (
    ('Upvote Count', 'Average Upvote Count'),
    ('Comment Count', 'Average Comment Count'),
    ('Upvote Ratio', 'Average Upvote Ratio'),
)

# Draw on an explicit 10x6 inch figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))

# One line per metric: its yearly mean, with a round marker at every year.
# NOTE(review): all three lines share one y-axis; if 'Upvote Ratio' is a 0-1 fraction
# it will look flat next to the counts - confirm and consider a secondary axis.
for column, legend_label in metric_labels:
    ax.plot(grouped_data[column].mean(), marker='o', label=legend_label)

# Axis labels and heading
ax.set_xlabel('Year')
ax.set_ylabel('Metrics')
ax.set_title('Engagement Metrics Over Time')

# Legend tells the three lines apart
ax.legend()

# Background grid for easier reading
ax.grid(True)

# Render the figure
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# White background with grid lines for every seaborn chart that follows
sns.set(style="whitegrid")

# Metric to summarise per 'Type', paired with the heading of its chart
boxplot_specs = (
    ('Upvote Count', 'Box Plot of Upvote Count by Type'),
    ('Comment Count', 'Box Plot of Comment Count by Type'),
)

# One box plot per metric, each on its own 10x6 inch figure
for metric, heading in boxplot_specs:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='Type', y=metric, data=df)
    plt.title(heading)
    plt.xlabel('Type')
    plt.ylabel(metric)
    plt.show()

In [None]:
# Upvotes against comments, one point per row, coloured by the row's 'Type'
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Upvote Count',
    y='Comment Count',
    hue='Type',
    palette='Set1',
    s=100,
    ax=ax,
)

# Heading and axis labels
ax.set_title('Scatter Plot of Upvote Count vs. Comment Count')
ax.set_xlabel('Upvote Count')
ax.set_ylabel('Comment Count')

# Colour key pinned to the top-right corner
ax.legend(title='Type', loc='upper right')
plt.show()

In [None]:
# Group the data by the 'Year' column
grouped_data = df.groupby('Year')

# Create a new figure for the plot with specified dimensions
plt.figure(figsize=(10, 6))

# Plot the yearly mean of the 'Sentiment' column with circular markers
plt.plot(grouped_data['Sentiment'].mean(), marker='o')

# Set labels for the x-axis and y-axis
plt.xlabel('Year')
plt.ylabel('Sentiment Polarity')

# Set the title of the plot.
# The original title read 'Sentiment of Reddit Posts Time' (missing "Over"); it now
# matches the wording of 'Number of Reddit Posts Over Time' used for the post-count plot.
plt.title('Sentiment of Reddit Posts Over Time')

# Display grid lines on the plot
plt.grid(True)

# Display the plot
plt.show()


In [None]:
# Show the distinct values present in the 'Post Flair' column
df['Post Flair'].unique()

In [None]:
# Step 1: Categorize Posts based on Flairs
# Create a new column 'Category' based on keywords in 'Post Flair'
def categorize_flair(flair):
    """Map a flair value to a coarse category name.

    The match is a case-insensitive substring test, tried in this order:
    'environment' -> 'Environmental', 'energy' -> 'Energy',
    'technology' -> 'Technological'. The first keyword found wins.

    Returns 'Other' when no keyword matches or when `flair` is not a string
    (for example None or NaN).
    """
    if not isinstance(flair, str):
        return 'Other'

    lowered = flair.lower()
    keyword_to_category = (
        ('environment', 'Environmental'),
        ('energy', 'Energy'),
        ('technology', 'Technological'),
    )
    for keyword, category in keyword_to_category:
        if keyword in lowered:
            return category
    return 'Other'

# Label every row with the category derived from its 'Post Flair'
df['Category'] = df['Post Flair'].apply(categorize_flair)

# Step 2: Compare Engagement Metrics across Flairs
# Mean of each engagement column per category, with 'Category' as a regular column
engagement_columns = ['Upvote Count', 'Comment Count', 'Upvote Ratio']
engagement_metrics = (
    df.groupby('Category')[engagement_columns]
    .mean()
    .reset_index()
)

print(engagement_metrics)

# Step 3: Topic Modeling within each Flair Category
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Fit an independent TF-IDF + LDA model on the post titles of every category.
# Note: corpus, vectorizer, tfidf_matrix, lda_model and topics stay bound to the
# LAST category processed once the loop ends.
for category in df['Category'].unique():
    corpus = df[df['Category'] == category]['Post Title'].tolist()

    # TF-IDF vectorization (at most 1000 terms, English stop words removed)
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # LDA Topic Modeling: 5 topics, fixed random_state for repeatable results
    lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
    lda_model.fit(tfidf_matrix)

    # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
    # replaces it. Fetch the vocabulary once rather than once per keyword.
    feature_names = vectorizer.get_feature_names_out()

    # Describe each topic by its 10 highest-weighted terms (ascending weight order)
    topics = [
        ', '.join(feature_names[i] for i in topic.argsort()[-10:])
        for topic in lda_model.components_
    ]

    print(f"Topics for {category} category:")
    for idx, topic in enumerate(topics):
        print(f"Topic {idx + 1}: {topic}")


In [None]:
# Plotting libraries for this chart
import seaborn as sns
import matplotlib.pyplot as plt

# Draw on an explicit 10x6 inch figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))

# One bar per row of engagement_metrics: category on x, its 'Upvote Count' value on y
sns.barplot(x='Category', y='Upvote Count', data=engagement_metrics, ax=ax)

# Axis labels and heading
ax.set_xlabel('Category')
ax.set_ylabel('Average Upvote Count')
ax.set_title('Comparison of Average Upvote Count across Categories')

# Tilt the category names so they do not collide
plt.xticks(rotation=45)

# Background grid for easier reading
ax.grid(True)

# Render the figure
plt.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Fit a TF-IDF + LDA model per category and draw one word cloud per topic.
# Note: corpus, vectorizer, tfidf_matrix, lda_model and topics stay bound to the
# LAST category processed once the loop ends; later cells read them.
for category in df['Category'].unique():
    corpus = df[df['Category'] == category]['Post Title'].tolist()

    # TF-IDF vectorization (at most 1000 terms, English stop words removed)
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # LDA Topic Modeling: 5 topics, fixed random_state for repeatable results
    lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
    lda_model.fit(tfidf_matrix)

    # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
    # replaces it. Fetch the vocabulary once rather than once per keyword.
    feature_names = vectorizer.get_feature_names_out()

    # Describe each topic by its 10 highest-weighted terms (ascending weight order)
    topics = [
        ', '.join(feature_names[i] for i in topic.argsort()[-10:])
        for topic in lda_model.components_
    ]

    print(f"Topics for {category} category:")
    for idx, topic in enumerate(topics):
        print(f"Topic {idx + 1}: {topic}")

        # Generate word cloud from the topic's keyword string
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(topic)

        # Plot the word cloud
        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title(f"Word Cloud for Topic {idx + 1} in {category} Category")
        plt.show()


In [None]:
# Standard-library helpers for tokenising and tallying the title words
import re
from collections import Counter

# NOTE: `topics` and `corpus` are whatever the most recently executed topic-modeling
# cell left behind (after a per-category loop, that is the last category processed).

# Tally every word token across all titles. The original used corpus.count(word),
# which counts titles that are exactly equal to the word, not occurrences of the word.
# Titles are lower-cased and split with the same pattern as TfidfVectorizer's default
# token_pattern so the tokens line up with the topic keywords.
word_totals = Counter(
    token
    for title in corpus
    for token in re.findall(r"(?u)\b\w\w+\b", str(title).lower())
)

# One row per topic, mapping each of its keywords to that keyword's corpus frequency
topic_word_matrix = [
    {word: word_totals[word] for word in topic.split(', ')}
    for topic in topics
]

# Columns are the union of all topics' keywords. The original labelled every row with
# the last topic's keywords only. A keyword that does not belong to a topic gets 0 there.
df_topic_word = (
    pd.DataFrame(
        topic_word_matrix,
        index=[f"Topic {idx + 1}" for idx in range(len(topics))],
    )
    .fillna(0)
    .astype(int)  # fmt='d' below requires integer cells
)

# Create a new figure for the heatmap; widen it when there are many keyword columns
plt.figure(figsize=(max(10, 0.5 * df_topic_word.shape[1]), 6))

# Use Seaborn's heatmap function with the 'YlGnBu' color map and integer annotations
sns.heatmap(df_topic_word, cmap='YlGnBu', annot=True, fmt='d')

# Set the title of the heatmap
plt.title("Topic-Word Heatmap")

# Display the heatmap
plt.show()


In [None]:
# Count how many posts have each topic as their highest-probability topic.
# The original used corpus.count(topic), which counts titles exactly equal to a topic's
# keyword string - practically never the case - so the pie had no data to show.
# NOTE: relies on `lda_model`, `tfidf_matrix`, `topics` and `corpus` all coming from the
# same, most recently executed topic-modeling run (every such cell sets them together).
dominant_topic_idx = lda_model.transform(tfidf_matrix).argmax(axis=1)
topic_counts = (
    pd.Series(dominant_topic_idx)
    .value_counts()
    .reindex(range(len(topics)), fill_value=0)  # keep topics with no posts, in topic order
    .tolist()
)

# Calculate the total number of posts in the corpus
total_posts = len(corpus)

# Create a new figure for the pie chart with specified dimensions
plt.figure(figsize=(10, 6))

# Draw only topics that own at least one post; zero-sized wedges would just add
# overlapping labels
non_empty = [(count, label) for count, label in zip(topic_counts, topics) if count > 0]
plt.pie(
    [count for count, _ in non_empty],
    labels=[label for _, label in non_empty],
    autopct='%1.1f%%',
)

# Set the title of the pie chart
plt.title("Topic Distribution")

# Display the pie chart
plt.show()


In [None]:
# Graph library used to build the keyword network
import networkx as nx

# Start from an undirected graph with no nodes
G = nx.Graph()

# Fully connect the keywords that belong to the same topic
for topic in topics:
    # A topic is a ', '-separated keyword string
    words = topic.split(', ')

    # Every ordered pair of distinct keywords; the graph is undirected, so (a, b) and
    # (b, a) end up as the same single edge
    G.add_edges_from(
        (word1, word2)
        for word1 in words
        for word2 in words
        if word1 != word2
    )

# 10x6 inch canvas for the drawing
plt.figure(figsize=(10, 6))

# Node and label appearance: labelled, large sky-blue nodes, bold 10pt text
draw_options = dict(
    with_labels=True,
    node_size=2000,
    node_color='skyblue',
    font_size=10,
    font_weight='bold',
)
nx.draw(G, **draw_options)

# Heading, then render
plt.title("Word Network for Topics")
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Scatter plot for Upvote Count vs. Sentiment
plt.scatter(df['Sentiment'], df['Upvote Count'])
plt.xlabel('Sentiment')
plt.ylabel('Upvote Count')
plt.title('Upvote Count vs. Sentiment')
plt.show()

# Bar plot for average Comment Count across sentiment categories.
# 'Sentiment' is numeric (other cells average and correlate it), so grouping on the raw
# values gave one bar per distinct score rather than one per category. Bin the scores
# into three categories first.
# NOTE(review): the +/-0.05 neutral band assumes a polarity score centred on 0 (roughly
# -1..1) - confirm against the sentiment scorer that produced this column.
NEUTRAL_BAND = 0.05
sentiment_category = pd.cut(
    df['Sentiment'],
    bins=[float('-inf'), -NEUTRAL_BAND, NEUTRAL_BAND, float('inf')],
    labels=['Negative', 'Neutral', 'Positive'],
)

# Mean comment count per category; observed=False keeps empty categories in the result
sentiment_groups = (
    df.groupby(sentiment_category, observed=False)['Comment Count']
    .mean()
    .reset_index()
)
plt.bar(sentiment_groups['Sentiment'].astype(str), sentiment_groups['Comment Count'])
plt.xlabel('Sentiment')
plt.ylabel('Average Comment Count')
plt.title('Average Comment Count for Different Sentiment Categories')
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Every post title, in the row order of df
corpus = df['Post Title'].tolist()

# TF-IDF vectorization (at most 1000 terms, English stop words removed)
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)

# LDA Topic Modeling: 5 topics, fixed random_state for repeatable results
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_model.fit(tfidf_matrix)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. Fetch the vocabulary once rather than once per keyword.
feature_names = vectorizer.get_feature_names_out()

# Describe each topic by its 10 highest-weighted terms (ascending weight order)
topics = [
    ', '.join(feature_names[i] for i in topic.argsort()[-10:])
    for topic in lda_model.components_
]

# Add the dominant topic as a new column to the DataFrame: for each title, the keyword
# string of the topic with the highest probability
df['Dominant Topic'] = [topics[i] for i in lda_model.transform(tfidf_matrix).argmax(axis=1)]


In [None]:
def _count_topics_by(column):
    """Count rows per (`column` value, 'Dominant Topic') pair.

    Returns a table with one row per `column` value and one column per dominant
    topic; combinations that never occur are filled with 0.
    """
    per_group_counts = df.groupby(column)['Dominant Topic'].value_counts()
    return per_group_counts.unstack(fill_value=0)


# Option 1: how the dominant topics spread over the years
topics_by_year = _count_topics_by('Year')
print(topics_by_year)

# Option 2: how the dominant topics spread over the post flairs
topics_by_flair = _count_topics_by('Post Flair')
print(topics_by_flair)

In [None]:
import matplotlib.pyplot as plt

# Option 1: stacked bars, one bar per year, one colour per dominant topic
ax = topics_by_year.plot.bar(stacked=True, figsize=(10, 6))
ax.set_xlabel('Year')
ax.set_ylabel('Number of Posts')
ax.set_title('Distribution of Topics across Years')
ax.legend(title='Dominant Topic', loc='upper right')
plt.show()

# Option 2: stacked bars, one bar per post flair, one colour per dominant topic
ax = topics_by_flair.plot.bar(stacked=True, figsize=(12, 6))
ax.set_xlabel('Post Flair')
ax.set_ylabel('Number of Posts')
ax.set_title('Distribution of Topics across Post Flairs')
ax.legend(title='Dominant Topic', loc='upper right')

# Slant the flair names and tighten the layout so the labels fit
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Word cloud generator
from wordcloud import WordCloud

# One word cloud per distinct value of the 'Dominant Topic' column
for topic in df['Dominant Topic'].unique():
    # Join the titles of all rows assigned to this topic into one string
    in_topic = df['Dominant Topic'] == topic
    text = " ".join(df.loc[in_topic, 'Post Title'])

    # Build an 800x400 cloud on a white background from that text
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    wordcloud = cloud_builder.generate(text)

    # Show the cloud as an image on a 10x6 inch figure, without axes
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")

    # The heading carries the topic's keyword string
    plt.title(f"Word Cloud for Topic: {topic}")
    plt.show()


In [None]:
import seaborn as sns

# Option 1: year-by-topic counts as a heatmap with integer annotations
plt.figure(figsize=(10, 6))
ax = sns.heatmap(topics_by_year, cmap='YlGnBu', annot=True, fmt='d')
ax.set_xlabel('Dominant Topic')
ax.set_ylabel('Year')
ax.set_title('Topic Distribution Heatmap across Years')
plt.show()

# Option 2: flair-by-topic counts as a heatmap, with a labelled colour bar
plt.figure(figsize=(12, 6))
ax = sns.heatmap(
    topics_by_flair,
    cmap='YlGnBu',
    annot=True,
    fmt='d',
    cbar_kws={'label': 'Number of Posts'},
)
ax.set_xlabel('Dominant Topic')
ax.set_ylabel('Post Flair')
ax.set_title('Topic Distribution Heatmap across Post Flairs')

# Slant the topic labels and tighten the layout so they fit
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Per author: total upvotes and total comments across their rows, plus the sum of the
# two as 'Total Engagement'
user_engagement = (
    df.groupby('Post Author')[['Upvote Count', 'Comment Count']]
    .sum()
    .reset_index()
    .assign(**{
        'Total Engagement': lambda totals: totals['Upvote Count'] + totals['Comment Count'],
    })
)

# Ten authors with the highest 'Total Engagement', highest first.
# Note: the ranking is by upvotes + comments received, not by number of posts.
ranked_users = user_engagement.sort_values(by='Total Engagement', ascending=False)
most_active_users = ranked_users.head(10)
print(most_active_users)


In [None]:
# Bar chart of 'Total Engagement' for each author in most_active_users
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(most_active_users['Post Author'], most_active_users['Total Engagement'])

# Axis labels and heading
ax.set_xlabel('User')
ax.set_ylabel('Total Engagement')
ax.set_title('Top 10 Most Active Users')

# Slant the author names and tighten the layout so they fit
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlation between the two engagement counts and the sentiment score
corr_columns = ['Upvote Count', 'Comment Count', 'Sentiment']
engagement_sentiment_corr = df[corr_columns].corr()

# Show the correlation matrix as an annotated heatmap (two decimals per cell)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(engagement_sentiment_corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation between User Engagement Metrics and Sentiment')
plt.show()


In [None]:
# Keep only the rows whose author appears in most_active_users
is_top_author = df['Post Author'].isin(most_active_users['Post Author'])
most_active_user_posts = df[is_top_author]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Prepare the corpus for topic modeling: titles of the posts written by the top users
corpus = most_active_user_posts['Post Title'].tolist()

# TF-IDF vectorization (at most 1000 terms, English stop words removed)
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)

# LDA Topic Modeling: 10 topics, fixed random_state for repeatable results
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
lda_model.fit(tfidf_matrix)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. Fetch the vocabulary once rather than once per keyword.
feature_names = vectorizer.get_feature_names_out()

# Describe each topic by its 10 highest-weighted terms (ascending weight order)
topics = [
    ', '.join(feature_names[i] for i in topic.argsort()[-10:])
    for topic in lda_model.components_
]

# Attach to every user the topic that dominates most of THEIR posts.
# The original assigned the topic list to the user rows by position, which pairs users
# and topics arbitrarily and raises when there are not exactly as many users as topics.
post_topic_idx = lda_model.transform(tfidf_matrix).argmax(axis=1)
author_topic_idx = (
    pd.Series(post_topic_idx, index=most_active_user_posts['Post Author'].to_numpy())
    .groupby(level=0)
    .agg(lambda idxs: idxs.value_counts().idxmax())  # most frequent topic per author
)

# assign() builds a new frame, so this cell can be re-run and never writes into a slice
most_active_users = most_active_users.assign(**{
    'Dominant Topics': most_active_users['Post Author']
    .map(author_topic_idx)
    .map(dict(enumerate(topics))),
})
print(most_active_users)


In [None]:
# Horizontal bar per user showing their 'Total Engagement'; the legend entry of each bar
# is that user's 'Dominant Topics' keyword string
plt.figure(figsize=(12, 6))

# Use a dedicated loop name for the per-user topic string. The original rebound the
# notebook-level list `topics` to a string here, which broke re-runs of the earlier
# cells that iterate over `topics`.
for author, engagement, user_topic in zip(
    most_active_users['Post Author'],
    most_active_users['Total Engagement'],
    most_active_users['Dominant Topics'],
):
    plt.barh(author, engagement, label=user_topic)

plt.xlabel('Total Engagement')
plt.ylabel('User')
plt.title('Top 10 Most Active Users and their Dominant Topics')
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Bar per category; seaborn aggregates the 'Upvote Count' of the rows in each category
ax = sns.barplot(x='Category', y='Upvote Count', data=df)
ax.set_title('Average Upvote Count across Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Average Upvote Count')
plt.show()

# Number of non-null 'Post ID' values per year, as a two-column frame
# ('Year', 'Post Count')
posts_by_year = df.groupby('Year')['Post ID'].count().reset_index(name='Post Count')

# Line chart of the yearly post count
ax = sns.lineplot(x='Year', y='Post Count', data=posts_by_year)
ax.set_title('Number of Posts over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Posts')
plt.show()


In [None]:
# Correlation between the two engagement counts, shown as an annotated heatmap
count_columns = ['Upvote Count', 'Comment Count']
correlation_matrix = df[count_columns].corr()
ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
ax.set_title('Correlation Heatmap: Upvote Count vs. Comment Count')
plt.show()
