In [None]:
#pip install pandas
from bs4 import BeautifulSoup
from textblob import TextBlob  # for sentiment analysis

## Data Cleaning

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the forum-messages CSV (presumably on a Colab-mounted Google
# Drive, judging by the /content/drive prefix — adjust for other environments).
path = "/content/drive/MyDrive/Kaggle/ForumMessages.csv"

# Load the full file into a DataFrame and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=10)

In [None]:
# Seed NumPy's global random generator so any np.random draws repeat across runs.
np.random.seed(seed=42)

In [None]:
# Keep only the two columns the sentiment analysis reads.
columns_of_interest = ['PostDate', 'Message']
select_df_1 = df[columns_of_interest]

# The full selection is analysed as-is. For a quicker run, swap this assignment
# for select_df_1.sample(n=20000, random_state=42) or select_df[:1000].
select_df = select_df_1

# Preview the first hundred selected rows.
select_df.head(n=100)


In [None]:
# Empty accumulators for the sentiment analysis: one list for polarity scores,
# one for the matching PostDate values.
sentiments, post_dates = [], []

In [None]:
def _message_polarity(message):
    """Return the TextBlob polarity score for one forum message.

    Args:
        message: Raw 'Message' cell value. Anything that is not a ``str``
            is converted with ``str()`` before further processing.

    Returns:
        The ``sentiment.polarity`` value TextBlob reports for the message
        text after its HTML markup has been stripped with BeautifulSoup.
    """
    text = message if isinstance(message, str) else str(message)
    # Strip HTML tags so only the visible text is scored.
    clean_text = BeautifulSoup(text, "html.parser").get_text()
    return TextBlob(clean_text).sentiment.polarity


# Build both result lists from scratch inside this cell. Previously the cell
# appended to lists created in a different cell, so running it a second time
# doubled their length and the DataFrame construction below failed with a
# length mismatch against select_df['Message'].
post_dates = select_df['PostDate'].tolist()
sentiments = [_message_polarity(message) for message in select_df['Message']]

# NOTE(review): a missing message presumably arrives as a float NaN, is
# stringified to "nan" and scored like ordinary text — confirm whether such
# rows should be dropped before the yearly averages are computed.

# Create a DataFrame to store the results. The 'Message' column keeps the
# original (uncleaned) text; the two lists line up with it row for row.
sentiment_df = pd.DataFrame({'PostDate': post_dates, 'Message': select_df['Message'], 'Sentiment': sentiments})

# Display the first 100 rows of the DataFrame with the Sentiment column
print(sentiment_df.head(100))

In [None]:
# Parse the 'PostDate' strings (month/day/year hour:minute:second) into
# datetimes. The explicit format makes values in any other layout raise
# instead of being guessed at.
sentiment_df['PostDate'] = pd.to_datetime(sentiment_df['PostDate'], format='%m/%d/%Y %H:%M:%S')

# Order the rows chronologically; sort_values returns a new DataFrame, so
# sentiment_df itself keeps its original row order.
sentiment_df_sorted = sentiment_df.sort_values(by='PostDate')

# Display the first 100 rows of the sorted DataFrame. The code used to print
# the whole frame, contradicting this description and the preview of
# sentiment_df in the previous cell.
print(sentiment_df_sorted.head(100))

In [None]:
import matplotlib.pyplot as plt

# Line chart of every message's polarity against its post date.
# Expects sentiment_df_sorted to provide the 'PostDate' and 'Sentiment' columns.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    sentiment_df_sorted['PostDate'],
    sentiment_df_sorted['Sentiment'],
    marker='o',
    linestyle='-',
)
ax.set_title('Sentiment Analysis Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Sentiment Polarity')
plt.xticks(rotation=45)  # applies to the current axes, i.e. `ax`
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Per-message polarity over time in two colours: values >= 0 in green,
# values < 0 in red. Expects sentiment_df_sorted to provide the 'PostDate'
# and 'Sentiment' columns.
non_negative_rows = sentiment_df_sorted['Sentiment'] >= 0
negative_rows = sentiment_df_sorted['Sentiment'] < 0

fig, ax = plt.subplots(figsize=(10, 6))

# Each subset is drawn as its own connected line, so a segment may span dates
# whose points belong to the other colour.
ax.plot(
    sentiment_df_sorted.loc[non_negative_rows, 'PostDate'],
    sentiment_df_sorted.loc[non_negative_rows, 'Sentiment'],
    marker='o', linestyle='-', color='green', label='Non-negative Sentiment',
)
ax.plot(
    sentiment_df_sorted.loc[negative_rows, 'PostDate'],
    sentiment_df_sorted.loc[negative_rows, 'Sentiment'],
    marker='o', linestyle='-', color='red', label='Negative Sentiment',
)

ax.set_title('Sentiment Analysis Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Sentiment Polarity')
plt.xticks(rotation=45)  # applies to the current axes, i.e. `ax`
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Yearly mean polarity, drawn as two lines split at a fixed cut-off.
# Expects sentiment_df_sorted to provide a datetime 'PostDate' column and a
# numeric 'Sentiment' column.

# Cut-off applied to the yearly means (not to individual messages). Naming it
# keeps the split and the legend text in agreement.
# NOTE(review): the reason for choosing 0.20 is not stated in this notebook — confirm.
LINE_SPLIT_THRESHOLD = 0.20

# Extract year from the 'PostDate' column
sentiment_df_sorted['Year'] = sentiment_df_sorted['PostDate'].dt.year

# Group by year and calculate the average sentiment polarity
average_sentiment = sentiment_df_sorted.groupby('Year')['Sentiment'].mean()

# Plot the average sentiment analysis
plt.figure(figsize=(10, 6))

# Years whose mean is at or above the cut-off, in green. The legend now states
# the actual criterion: the old 'Positive'/'Negative' labels described yearly
# means below 0.20 as negative even when the polarity itself was above zero.
positive_sentiment = average_sentiment[average_sentiment >= LINE_SPLIT_THRESHOLD]
plt.plot(positive_sentiment.index, positive_sentiment.values, marker='o', linestyle='-', color='green',
         label=f'Yearly mean >= {LINE_SPLIT_THRESHOLD:.2f}')

# Years whose mean is below the cut-off, in red.
negative_sentiment = average_sentiment[average_sentiment < LINE_SPLIT_THRESHOLD]
plt.plot(negative_sentiment.index, negative_sentiment.values, marker='o', linestyle='-', color='red',
         label=f'Yearly mean < {LINE_SPLIT_THRESHOLD:.2f}')

plt.title('Average Sentiment Analysis Over Time')
plt.xlabel('Year')
plt.ylabel('Average Sentiment Polarity')
plt.xticks(average_sentiment.index)  # one tick per year present in the data
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Yearly mean polarity computed separately for non-negative (>= 0) and
# negative (< 0) messages. Expects sentiment_df_sorted to provide a datetime
# 'PostDate' column and a numeric 'Sentiment' column.

# Calendar year of each post.
sentiment_df_sorted['Year'] = sentiment_df_sorted['PostDate'].dt.year

# Split the messages by sign, then average each part per year.
non_negative_messages = sentiment_df_sorted[sentiment_df_sorted['Sentiment'] >= 0]
negative_messages = sentiment_df_sorted[sentiment_df_sorted['Sentiment'] < 0]
average_positive_sentiment = non_negative_messages.groupby('Year')['Sentiment'].mean()
average_negative_sentiment = negative_messages.groupby('Year')['Sentiment'].mean()

fig, ax = plt.subplots(figsize=(10, 6))

# Green line for the non-negative means, red line for the negative means.
for yearly_mean, colour, legend_text in (
    (average_positive_sentiment, 'green', 'Average Positive Sentiment'),
    (average_negative_sentiment, 'red', 'Average Negative Sentiment'),
):
    ax.plot(yearly_mean.index, yearly_mean.values, marker='o', linestyle='-', color=colour, label=legend_text)

ax.set_title('Average Sentiment Analysis Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Average Sentiment Polarity')
# Ticks are placed at the years present in the non-negative series.
ax.set_xticks(average_positive_sentiment.index)
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Bar chart of the yearly mean polarity: non-negative means in green, negative
# means in red. Reads average_positive_sentiment and average_negative_sentiment,
# which must already exist when this cell runs.
fig, ax = plt.subplots(figsize=(10, 6))

for yearly_mean, colour, legend_text in (
    (average_positive_sentiment, 'green', 'Positive Sentiment'),
    (average_negative_sentiment, 'red', 'Negative Sentiment'),
):
    ax.bar(yearly_mean.index, yearly_mean.values, color=colour, label=legend_text)

ax.set_title('Sentiment Distribution Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Average Sentiment Polarity')
# Ticks are placed at the years present in the non-negative series.
ax.set_xticks(average_positive_sentiment.index)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Box plots of per-message polarity for each year, with non-negative and
# negative messages drawn as separate boxes at the same year position.
# Reads sentiment_df_sorted (with its 'Year' column) and the two yearly-mean
# series, all of which must already exist when this cell runs.

# One list of polarity values per year, for each sign.
non_negative_by_year = (
    sentiment_df_sorted[sentiment_df_sorted['Sentiment'] >= 0]
    .groupby('Year')['Sentiment']
    .apply(list)
)
negative_by_year = (
    sentiment_df_sorted[sentiment_df_sorted['Sentiment'] < 0]
    .groupby('Year')['Sentiment']
    .apply(list)
)

fig, ax = plt.subplots(figsize=(10, 6))

# Non-negative values: light green boxes positioned at their year.
ax.boxplot(
    non_negative_by_year,
    positions=average_positive_sentiment.index,
    patch_artist=True,
    boxprops={'facecolor': 'lightgreen'},
    showmeans=True,
)

# Negative values: light coral boxes positioned at their year.
ax.boxplot(
    negative_by_year,
    positions=average_negative_sentiment.index,
    patch_artist=True,
    boxprops={'facecolor': 'lightcoral'},
    showmeans=True,
)

ax.set_title('Sentiment Variation Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Sentiment Polarity')
# Ticks are placed at the years present in the non-negative series.
ax.set_xticks(average_positive_sentiment.index)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Same yearly box plots as the previous chart, but with outlier markers
# hidden (showfliers=False). Reads sentiment_df_sorted (with its 'Year'
# column) and the two yearly-mean series, which must already exist.

# One list of polarity values per year, for each sign.
non_negative_by_year = (
    sentiment_df_sorted[sentiment_df_sorted['Sentiment'] >= 0]
    .groupby('Year')['Sentiment']
    .apply(list)
)
negative_by_year = (
    sentiment_df_sorted[sentiment_df_sorted['Sentiment'] < 0]
    .groupby('Year')['Sentiment']
    .apply(list)
)

fig, ax = plt.subplots(figsize=(10, 6))

# Non-negative values (without outliers): light green boxes.
ax.boxplot(
    non_negative_by_year,
    positions=average_positive_sentiment.index,
    patch_artist=True,
    boxprops={'facecolor': 'lightgreen'},
    showmeans=True,
    showfliers=False,
)

# Negative values (without outliers): light coral boxes.
ax.boxplot(
    negative_by_year,
    positions=average_negative_sentiment.index,
    patch_artist=True,
    boxprops={'facecolor': 'lightcoral'},
    showmeans=True,
    showfliers=False,
)

ax.set_title('Sentiment Variation Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Sentiment Polarity')
# Ticks are placed at the years present in the non-negative series.
ax.set_xticks(average_positive_sentiment.index)
ax.grid(True)
fig.tight_layout()
plt.show()


## Plot


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Yearly mean polarity as a single line, with each year's point highlighted
# according to a fixed cut-off. Expects sentiment_df_sorted to provide a
# datetime 'PostDate' column and a numeric 'Sentiment' column.

# Cut-off applied to the yearly means (not to individual messages). Naming it
# keeps the highlighting and the legend text in agreement.
# NOTE(review): the reason for choosing 0.25 is not stated in this notebook — confirm.
HIGHLIGHT_THRESHOLD = 0.25

# Extract year from the 'PostDate' column
sentiment_df_sorted['Year'] = sentiment_df_sorted['PostDate'].dt.year

# Group by year and calculate the average sentiment polarity
average_sentiment = sentiment_df_sorted.groupby('Year')['Sentiment'].mean()

# Plot the average sentiment analysis
plt.figure(figsize=(10, 6))
plt.plot(average_sentiment.index, average_sentiment.values, marker='o', linestyle='-')
plt.title('Average Sentiment Analysis Over Time')
plt.xlabel('Year')
plt.ylabel('Average Sentiment Polarity')
plt.xticks(average_sentiment.index)  # one tick per year present in the data
plt.grid(True)

# Highlight years at or above the cut-off in green and years below it in red.
# The legend now states the actual criterion: the old 'Positive'/'Negative'
# labels described yearly means below 0.25 as negative even when the polarity
# itself was above zero.
positive_sentiment = average_sentiment[average_sentiment >= HIGHLIGHT_THRESHOLD]
negative_sentiment = average_sentiment[average_sentiment < HIGHLIGHT_THRESHOLD]

plt.scatter(positive_sentiment.index, positive_sentiment.values, color='green',
            label=f'Yearly mean >= {HIGHLIGHT_THRESHOLD:.2f}')
plt.scatter(negative_sentiment.index, negative_sentiment.values, color='red',
            label=f'Yearly mean < {HIGHLIGHT_THRESHOLD:.2f}')
plt.legend()

plt.tight_layout()
plt.show()
