#### What is the dominant sentiment (positive, negative, neutral) expressed in comments on Tally proposal posts?

In [51]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px

# Load datasets
topics_df = pd.read_csv('forum_topics_data.csv')
posts_df = pd.read_csv('forum_posts_data.csv')

# Filter Tally proposal posts
tally_topic_ids = [21332, 20957, 20856, 20223, 20064, 19899, 19467, 16501, 19696, 18557, 
                   19046, 16131, 15615, 15920, 15425, 15426, 14976, 14688, 14790, 13360, 13362]

tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Initialize SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Function to determine sentiment polarity
def get_sentiment(text):
    scores = sid.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'Positive'
    elif scores['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to comments
tally_posts['Sentiment'] = tally_posts['Post Description'].astype(str).apply(get_sentiment)

# Count the occurrences of each sentiment
sentiment_counts = tally_posts['Sentiment'].value_counts().reset_index()
sentiment_counts.columns = ['Sentiment', 'Count']

# Create an interactive pie chart using Plotly
fig = px.pie(sentiment_counts, values='Count', names='Sentiment', 
             title='Distribution of Sentiments in Comments on Tally Proposal Posts')
fig.show()
fig.write_html("Dominant Sentiment in Comments on Tally Proposal Posts.html")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### Are there any trends in sentiment (positive, negative) towards Tally proposals over time (consider daily/weekly averages)?

In [52]:
import pandas as pd
import plotly.express as px
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load datasets
posts_df = pd.read_csv('forum_posts_data.csv')

# Filter posts related to Tally proposal posts
tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Initialize SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Function to determine sentiment polarity
def get_sentiment(text):
    sentiment = sid.polarity_scores(text)
    if sentiment['compound'] > 0.05:
        return 'Positive'
    elif sentiment['compound'] < -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to comments
tally_posts['Sentiment'] = tally_posts['Post Description'].astype(str).apply(get_sentiment)

# Convert 'Post Created At' to datetime
tally_posts['Post Created At'] = pd.to_datetime(tally_posts['Post Created At'])

# Extract date and time components
tally_posts['Date'] = tally_posts['Post Created At'].dt.date

# Group by date and sentiment, and count the number of comments
sentiment_counts = tally_posts.groupby(['Date', 'Sentiment']).size().unstack(fill_value=0)

# Create a heatmap
fig = px.imshow(sentiment_counts.T, 
                labels=dict(x='Date', y='Sentiment', color='Number of Comments'), 
                x=sentiment_counts.index, 
                y=sentiment_counts.columns,
                color_continuous_scale='RdYlGn',
                title='Sentiment of Comments on Tally Proposal Posts Over Time')

# Update layout
fig.update_layout(xaxis_title='Date', yaxis_title='Sentiment', coloraxis_colorbar=dict(title='Number of Comments'))

# Show the plot
fig.show()
fig.write_html("Trends in Sentiment Towards Tally Proposals Over Time.html")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



### Does the sentiment towards Tally proposals differ significantly by other proposals?

In [53]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px
import scipy.stats as stats

# Load datasets
topics_df = pd.read_csv('forum_topics_data.csv')
posts_df = pd.read_csv('forum_posts_data.csv')

# Filter Tally proposal posts
tally_topic_ids = [21332, 20957, 20856, 20223, 20064, 19899, 19467, 16501, 19696, 18557, 
                   19046, 16131, 15615, 15920, 15425, 15426, 14976, 14688, 14790, 13360, 13362]

tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Filter other proposal posts
other_posts = posts_df[~posts_df['Topic ID'].isin(tally_topic_ids)]

# Initialize SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Function to determine sentiment polarity
def get_sentiment(text):
    scores = sid.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'Positive'
    elif scores['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to comments
tally_posts['Sentiment'] = tally_posts['Post Description'].astype(str).apply(get_sentiment)
other_posts['Sentiment'] = other_posts['Post Description'].astype(str).apply(get_sentiment)

# Calculate sentiment distributions
tally_sentiment_counts = tally_posts['Sentiment'].value_counts(normalize=True)
other_sentiment_counts = other_posts['Sentiment'].value_counts(normalize=True)

# Perform statistical test (e.g., Chi-square test) to determine if there's a significant difference
chi2_stat, p_val, _, _ = stats.chi2_contingency([tally_sentiment_counts, other_sentiment_counts])

# Print results
print("Chi-square statistic:", chi2_stat)
print("P-value:", p_val)

# Plot sentiment distributions
fig = px.bar(pd.concat([tally_sentiment_counts, other_sentiment_counts], axis=1, keys=['Tally Proposals', 'Other Proposals']),
             barmode='group', title='Sentiment Distributions for Tally Proposals vs Other Proposals')
fig.show()
fig.write_html("Sentiment Towards Tally Proposals vs Other Proposals.html")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Chi-square statistic: 0.015343046918830032
P-value: 0.9923578275730098




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### Do high "Trust Level" users express a different sentiment compared to regular users for Tally proposals?

In [54]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px
import scipy.stats as stats

# Load datasets
topics_df = pd.read_csv('forum_topics_data.csv')
posts_df = pd.read_csv('forum_posts_data.csv')
users_df = pd.read_csv('forum_users_data.csv')

# Filter Tally proposal posts
tally_topic_ids = [21332, 20957, 20856, 20223, 20064, 19899, 19467, 16501, 19696, 18557, 
                   19046, 16131, 15615, 15920, 15425, 15426, 14976, 14688, 14790, 13360, 13362]

tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Merge with users dataframe to get trust levels
tally_posts = tally_posts.merge(users_df[['Username', 'Trust Level']], how='left', left_on='Username', right_on='Username')

# Initialize SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Function to determine sentiment polarity
def get_sentiment(text):
    scores = sid.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'Positive'
    elif scores['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to comments
tally_posts['Sentiment'] = tally_posts['Post Description'].astype(str).apply(get_sentiment)

# Calculate sentiment distributions for high trust level users and regular users
trust_level_sentiment_counts = tally_posts.groupby('Trust Level')['Sentiment'].value_counts(normalize=True).reset_index(name='Count')

# Plot sentiment distributions
fig = px.bar(trust_level_sentiment_counts, x='Trust Level', y='Count', color='Sentiment',
             barmode='group', title='Sentiment Distributions for Trust Levels on Tally Proposals')
fig.show()
fig.write_html("Sentiment of High Trust Level Users vs Regular Users.html")


#### Does the sentiment towards Tally proposals change as they age on the forum (consider comparing sentiment for newly posted vs. older proposals)?

In [55]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px

# Load datasets
topics_df = pd.read_csv('forum_topics_data.csv')
posts_df = pd.read_csv('forum_posts_data.csv')

# Filter Tally proposal posts
tally_topic_ids = [21332, 20957, 20856, 20223, 20064, 19899, 19467, 16501, 19696, 18557, 
                   19046, 16131, 15615, 15920, 15425, 15426, 14976, 14688, 14790, 13360, 13362]

tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Convert 'Post Created At' to datetime
tally_posts['Post Created At'] = pd.to_datetime(tally_posts['Post Created At'])

# Initialize SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Function to determine sentiment polarity
def get_sentiment(text):
    scores = sid.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'Positive'
    elif scores['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to comments
tally_posts['Sentiment'] = tally_posts['Post Description'].astype(str).apply(get_sentiment)

# Categorize proposals into different age groups (e.g., newly posted vs. older)
# You can adjust the age groups as per your requirement
tally_posts['Age Group'] = pd.cut((pd.Timestamp.now() - tally_posts['Post Created At']).dt.days, bins=[0, 30, 60, 90, 180, 365, float('inf')],
                                  labels=['0-30 days', '31-60 days', '61-90 days', '91-180 days', '181-365 days', '365+ days'])

# Calculate sentiment distributions for each age group
sentiment_distribution = tally_posts.groupby(['Age Group', 'Sentiment']).size().reset_index(name='Count')

# Plot sentiment distribution over time using line plot
fig = px.line(sentiment_distribution, x='Age Group', y='Count', color='Sentiment', 
              title='Sentiment Distribution of Tally Proposals Over Time')
fig.show()
fig.write_html("Change in Sentiment Over Time for Tally Proposals.html")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





#### Is there a correlation between the sentiment expressed in comments and the number of views a Tally proposal receives?

In [64]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px

# Load datasets
topics_df = pd.read_csv('forum_topics_data.csv')
posts_df = pd.read_csv('forum_posts_data.csv')

# Filter Tally proposal posts
tally_topic_ids = [21332, 20957, 20856, 20223, 20064, 19899, 19467, 16501, 19696, 18557, 
                   19046, 16131, 15615, 15920, 15425, 15426, 14976, 14688, 14790, 13360, 13362]

tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Merge with topics dataframe to get 'Views' information
tally_posts = tally_posts.merge(topics_df[['Topic ID', 'Views']], how='left', left_on='Topic ID', right_on='Topic ID')

# Initialize SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Function to determine sentiment polarity
def get_sentiment(text):
    scores = sid.polarity_scores(text)
    return scores['compound']

# Apply sentiment analysis to comments
tally_posts['Sentiment'] = tally_posts['Post Description'].astype(str).apply(get_sentiment)

# Calculate the correlation between sentiment and number of views
correlation = tally_posts[['Sentiment', 'Views']].corr().iloc[0, 1]

# Plot scatter plot with sentiment vs. number of views
fig = px.scatter(tally_posts, x='Sentiment', y='Views', 
                 title=f'Correlation between Sentiment and Number of Views,\nCorrelation Coefficient: {correlation:.2f}')
fig.show()
fig.write_html("Correlation Between Sentiment and Number of Views.html")


#### Is there a correlation between the sentiment expressed in comments and the number of likes a Tally proposal receives?

In [65]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px

# Load datasets
topics_df = pd.read_csv('forum_topics_data.csv')
posts_df = pd.read_csv('forum_posts_data.csv')

# Filter Tally proposal posts
tally_topic_ids = [21332, 20957, 20856, 20223, 20064, 19899, 19467, 16501, 19696, 18557, 
                   19046, 16131, 15615, 15920, 15425, 15426, 14976, 14688, 14790, 13360, 13362]

tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Merge with topics dataframe to get 'Likes' information
tally_posts = tally_posts.merge(topics_df[['Topic ID', 'Like Count']], how='left', left_on='Topic ID', right_on='Topic ID')

# Initialize SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Function to determine sentiment polarity
def get_sentiment(text):
    scores = sid.polarity_scores(text)
    return scores['compound']

# Apply sentiment analysis to comments
tally_posts['Sentiment'] = tally_posts['Post Description'].astype(str).apply(get_sentiment)

# Calculate the correlation between sentiment and number of likes
correlation = tally_posts[['Sentiment', 'Like Count']].corr().iloc[0, 1]

# Plot scatter plot with sentiment vs. number of likes
fig = px.scatter(tally_posts, x='Sentiment', y='Like Count', 
                 title=f'Correlation between Sentiment and Number of Likes,\nCorrelation Coefficient: {correlation:.2f}')
fig.show()
fig.write_html("Correlation Between Sentiment and Number of Likes.html")


#### For proposals with the highest number of comments, what is the overall sentiment expressed?    

In [67]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px

# Load datasets
topics_df = pd.read_csv('forum_topics_data.csv')
posts_df = pd.read_csv('forum_posts_data.csv')

# Filter Tally proposal posts
tally_topic_ids = [21332, 20957, 20856, 20223, 20064, 19899, 19467, 16501, 19696, 18557, 
                   19046, 16131, 15615, 15920, 15425, 15426, 14976, 14688, 14790, 13360, 13362]

tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Group by Topic ID and count number of comments
comment_counts = tally_posts.groupby('Topic ID').size().reset_index(name='Comment Count')

# Get Topic IDs with the highest number of comments
max_comments_topics = comment_counts[comment_counts['Comment Count'] == comment_counts['Comment Count'].max()]['Topic ID']

# Filter posts for proposals with the highest number of comments
max_comments_posts = tally_posts[tally_posts['Topic ID'].isin(max_comments_topics)]

# Initialize SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Function to determine sentiment polarity
def get_sentiment(text):
    scores = sid.polarity_scores(text)
    return scores['compound']

# Apply sentiment analysis to comments
max_comments_posts['Sentiment'] = max_comments_posts['Post Description'].astype(str).apply(get_sentiment)

# Visualize sentiment distribution using histogram
fig1 = px.histogram(max_comments_posts, x='Sentiment', 
                    title='Distribution of Sentiment Scores for Proposals with the Highest Number of Comments',
                    labels={'Sentiment': 'Sentiment Score', 'count': 'Number of Comments'})
fig.show()

# Visualize sentiment distribution using box plot
fig = px.box(max_comments_posts, y='Sentiment', 
             title='Distribution of Sentiment Scores for Proposals with the Highest Number of Comments',
             labels={'Sentiment': 'Sentiment Score'})
fig.show()
fig1.write_html("Overall Sentiment for Proposals with the Highest Number of Comments 1.html")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#### What is the average number of comments received by Tally proposal posts?

In [59]:
import pandas as pd
import plotly.express as px

# Load datasets
posts_df = pd.read_csv('forum_posts_data.csv')

# Filter Tally proposal posts
tally_topic_ids = [21332, 20957, 20856, 20223, 20064, 19899, 19467, 16501, 19696, 18557, 
                   19046, 16131, 15615, 15920, 15425, 15426, 14976, 14688, 14790, 13360, 13362]

tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Calculate the average number of comments received by Tally proposal posts
average_comments = tally_posts.groupby('Topic ID').size().mean()

# Plot the average number of comments received by Tally proposal posts
fig = px.bar(x=['Average Comments'], y=[average_comments], title='Average Number of Comments per Tally Proposal Post')
fig.show()
fig.write_html("Average Number of Comments Received by Tally Proposal Posts.html")


####  Is there a statistically significant correlation between the number of comments and the number of likes for Tally proposals?

In [60]:
import pandas as pd
from scipy.stats import pearsonr
import plotly.express as px

# Load datasets
topics_df = pd.read_csv('forum_topics_data.csv')

# Filter Tally proposal topics
tally_topic_ids = [21332, 20957, 20856, 20223, 20064, 19899, 19467, 16501, 19696, 18557, 
                   19046, 16131, 15615, 15920, 15425, 15426, 14976, 14688, 14790, 13360, 13362]

tally_topics = topics_df[topics_df['Topic ID'].isin(tally_topic_ids)]

# Calculate Pearson correlation coefficient between 'Posts Count' and 'Like Count'
correlation_coefficient, p_value = pearsonr(tally_topics['Posts Count'], tally_topics['Like Count'])

# Check for statistical significance
alpha = 0.05
if p_value < alpha:
    significance = 'statistically significant'
else:
    significance = 'not statistically significant'

print(f"Pearson Correlation Coefficient: {correlation_coefficient:.2f}")
print(f"P-value: {p_value:.2f}")
print(f"The correlation between number of comments and number of likes for Tally proposals is {significance}.")

# Create scatter plot
fig = px.scatter(tally_topics, x='Posts Count', y='Like Count', 
                 title='Correlation between Number of Comments and Number of Likes for Tally Proposals',
                 labels={'Posts Count': 'Number of Comments', 'Like Count': 'Number of Likes'})

# Show the plot
fig.show()
fig.write_html("Correlation Between Number of Comments and Number of Likes.html")


Pearson Correlation Coefficient: 0.94
P-value: 0.00
The correlation between number of comments and number of likes for Tally proposals is statistically significant.


#### Do proposals with a higher number of views tend to have a significantly higher or lower number of comments compared to proposals with fewer views?

In [61]:
import pandas as pd
from scipy.stats import ttest_ind
import plotly.express as px

# Load datasets
topics_df = pd.read_csv('forum_topics_data.csv')

# Calculate the median number of views
median_views = topics_df['Views'].median()

# Split the proposals into two groups based on views
high_views_proposals = topics_df[topics_df['Views'] > median_views]
low_views_proposals = topics_df[topics_df['Views'] <= median_views]

# Perform independent samples t-test
t_statistic, p_value = ttest_ind(high_views_proposals['Posts Count'], low_views_proposals['Posts Count'])

# Determine the significance level
alpha = 0.05
if p_value < alpha:
    significance = 'statistically significant'
else:
    significance = 'not statistically significant'

# Create a violin plot
data = pd.concat([high_views_proposals.assign(Views='High Views'), 
                  low_views_proposals.assign(Views='Low Views')])

fig = px.violin(data, x='Views', y='Posts Count', 
                title='Distribution of Comments for Proposals with High and Low Views',
                labels={'Posts Count': 'Number of Comments', 'Views': 'View Group'},
                box=True, points="all")

# Add significance information to the plot
fig.add_annotation(text=f"P-value: {p_value:.2f}\n{significance} difference", xref='paper', yref='paper',
                   x=0.95, y=0.95, showarrow=False, align='right')

# Show the plot
fig.show()
fig.write_html("Relationship Between Views and Comments for Tally Proposals.html")


#### Who are the top 10 most active commenters on Tally proposal posts (based on the number of comments)?

In [62]:
import pandas as pd
import plotly.express as px

# Load dataset
posts_df = pd.read_csv('forum_posts_data.csv')

# Filter posts related to Tally proposal posts
tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Group by username and count the number of comments
comment_counts = tally_posts['Username'].value_counts()

# Get the top 10 most active commenters
top_10_commenters = comment_counts.head(10)

# Create a dataframe for the top 10 commenters
top_10_df = pd.DataFrame({'Username': top_10_commenters.index, 'Number of Comments': top_10_commenters.values})

# Print the top 10 most active commenters with their comment counts
print("Top 10 Most Active Commenters on Tally Proposal Posts:")
print(top_10_df)

# Create an interactive bar plot using Plotly
fig = px.bar(top_10_df, x='Username', y='Number of Comments', 
             title='Top 10 Most Active Commenters on Tally Proposal Posts',
             labels={'Username': 'Usernames', 'Number of Comments': 'Number of Comments'},
             color='Username')

# Customize layout
fig.update_layout(xaxis={'categoryorder':'total descending'}, 
                  xaxis_title='Usernames', yaxis_title='Number of Comments',
                  showlegend=False)

# Show the plot
fig.show()
fig.write_html("Top 10 Most Active Commenters on Tally Proposal Posts.html")


Top 10 Most Active Commenters on Tally Proposal Posts:
          Username  Number of Comments
0    DisruptionJoe                  43
1             krst                  38
2          Saurabh                  28
3  Immutablelawyer                  28
4            tnorm                  27
5             fred                  24
6           cattin                  24
7        Bob-Rossi                  18
8             cp0x                  15
9     stonecoldpat                  14


#### Do users with high "Trust Level" tend to contribute a significantly higher or lower number of comments on average compared to regular users for Tally proposals?

In [63]:
import pandas as pd
import plotly.express as px

# Load datasets
users_df = pd.read_csv('forum_users_data.csv')
posts_df = pd.read_csv('forum_posts_data.csv')

# Filter posts related to Tally proposal posts
tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Merge posts with user trust levels
merged_df = tally_posts.merge(users_df[['Username', 'Trust Level']], 
                              how='left', left_on='Username', right_on='Username')

# Group by trust level and calculate average number of comments
avg_comments_by_trust_level = merged_df.groupby('Trust Level')['Username'].count() / merged_df['Trust Level'].value_counts()

# Create a pie chart to visualize average comments by trust level
fig = px.pie(values=avg_comments_by_trust_level.values, 
             names=avg_comments_by_trust_level.index,
             title='Average Number of Comments by Trust Level for Tally Proposals',
             color_discrete_sequence=px.colors.qualitative.Set3)

# Customize layout
fig.update_traces(textinfo='percent+label', pull=[0.05] * len(avg_comments_by_trust_level))

# Show the plot
fig.show()
fig.write_html("Contribution of High Trust Level Users vs Regular Users.html")


#### Relationship between the length of Tally proposal titles and the sentiment expressed in the comments?

In [50]:
import pandas as pd
import plotly.express as px
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Load datasets
topics_df = pd.read_csv('forum_topics_data.csv')
posts_df = pd.read_csv('forum_posts_data.csv')

# Filter posts related to Tally proposal posts
tally_posts = posts_df[posts_df['Topic ID'].isin(tally_topic_ids)]

# Initialize SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Function to determine sentiment polarity
def get_sentiment(text):
    sentiment = sid.polarity_scores(text)
    if sentiment['compound'] > 0.05:
        return 'Positive'
    elif sentiment['compound'] < -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to comments
tally_posts['Sentiment'] = tally_posts['Post Description'].astype(str).apply(get_sentiment)

# Merge with topics dataframe to get proposal titles
tally_posts = tally_posts.merge(topics_df[['Topic ID', 'Title']], on='Topic ID')

# Calculate the length of proposal titles
tally_posts['Title Length'] = tally_posts['Title'].apply(len)

# Create a scatter plot
fig = px.scatter(tally_posts, x='Title Length', y='Sentiment', 
                 title='Relationship Between Title Length and Comment Sentiment for Tally Proposals',
                 color='Sentiment', hover_data=['Title'],
                 category_orders={'Sentiment': ['Positive', 'Neutral', 'Negative']},
                 labels={'Title Length': 'Title Length', 'Sentiment': 'Sentiment of Comments'})

# Customize layout
fig.update_traces(marker=dict(size=10))

# Show the plot
fig.show()
fig.write_html("")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

