# ============================
# 📊 Reddit Sentiment Analysis for ChatGPT
# ============================
**Second Project of the Data Analytics Training Program**
*In partnership with Brainwave Matrix Solutions - 2025*
##### Project Goal: To understand public sentiment towards ChatGPT across Reddit.
##### Tools Used: Python, Libraries: pandas, requests, praw, nltk, matplotlib, seaborn, wordcloud, TextBlob

---------------------------------------
# SECTION 1: Import libraries & Environment Preparation
---------------------------------------

# 1.1 Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk.sentiment import SentimentIntensityAnalyzer

In [None]:
!pip install praw python-dotenv

# 1.2 API Setup and Authentication

### - Environment Variables Configuration

In [None]:
import os
from dotenv import load_dotenv

# Load the variables from the .env file into the process environment
load_dotenv()

# Reddit API settings, keyed by the environment variable that supplies each one
credentials = {
    name: os.getenv(name)
    for name in ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT")
}
client_id = credentials["REDDIT_CLIENT_ID"]
client_secret = credentials["REDDIT_CLIENT_SECRET"]
user_agent = credentials["REDDIT_USER_AGENT"]

# Validate credentials. The user agent is checked as well because it is handed
# to praw.Reddit together with the id and the secret; the error names the
# variables that are unset so the .env file can be fixed directly.
missing = [name for name, value in credentials.items() if not value]
if missing:
    raise ValueError(
        "❌ Missing Reddit API credentials. Please check your .env file "
        f"(missing: {', '.join(missing)})"
    )
print("✅ Reddit API credentials loaded successfully")

### - Initialize Reddit instance

In [None]:
import praw

# Build the Reddit client from the credentials read from the environment
reddit_settings = dict(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)
reddit = praw.Reddit(**reddit_settings)

In [None]:
import logging

# Only CRITICAL records from the "praw" logger are let through
praw_logger = logging.getLogger("praw")
praw_logger.setLevel(logging.CRITICAL)

# Connection smoke test: fetch one subreddit and show its name and member count
sub = reddit.subreddit('technology')
display_name, member_count = sub.display_name, sub.subscribers
print(f"Subreddit: {display_name}, Members: {member_count}")

---------------------------------------
# SECTION 2: Data Collection
---------------------------------------

### - Definition of search criteria

In [None]:
# Search settings consumed by the collection cells
subreddit_name = "technology"  # subreddit that is searched
query = "ChatGPT"              # keyword handed to the subreddit search
limit_posts = 500              # maximum number of search results requested

## 2.1 Retrieving Posts from Reddit (Submissions)

In [None]:
# One dict per submission that passes the score filter
posts_list = []

# Search the target subreddit for the keyword
search_results = reddit.subreddit(subreddit_name).search(query, limit=limit_posts)
for submission in search_results:
    # Keep only submissions whose score is strictly greater than 10
    if submission.score <= 10:
        continue

    record = dict(
        post_id=submission.id,                  # unique post ID
        title=submission.title,                 # post title
        selftext=submission.selftext,           # main content/text
        created_utc=submission.created_utc,     # creation time (Unix timestamp)
        num_comments=submission.num_comments,   # number of comments
        score=submission.score,                 # net upvotes/downvotes
        url=submission.url,                     # post link
    )
    posts_list.append(record)



In [None]:
# Build the posts table with a fixed column layout, so the frame has the same
# columns (including 'date') even when the search returned no posts.
post_columns = ['post_id', 'title', 'selftext', 'created_utc',
                'num_comments', 'score', 'url']
df_posts = pd.DataFrame(posts_list, columns=post_columns)

# Convert the Unix timestamp (seconds) to a readable datetime
df_posts['date'] = pd.to_datetime(df_posts['created_utc'], unit='s')

if df_posts.empty:
    print("No posts matched the search criteria.")
else:
    print("First 5 collected posts:")
    print(df_posts.head())  # Preview initial data
df_posts.to_csv('processed_posts.csv', index=False)

## 2.2 Retrieving Comments from these Posts

In [None]:
# One dict per collected comment
comments_list = []
import time

# Prototype run: only the first 5 collected posts are visited
for _, post_row in df_posts.head(5).iterrows():
    parent_id = post_row['post_id']
    submission = reddit.submission(id=parent_id)
    # NOTE(review): per the PRAW docs, limit=0 is expected to drop the
    # "load more comments" placeholders rather than expand them - confirm
    submission.comments.replace_more(limit=0)

    # Record every comment of the flattened comment tree
    comments_list.extend(
        {
            'post_id': parent_id,                # parent post ID
            'comment_id': comment.id,            # unique comment ID
            'body': comment.body,                # comment text content
            'created_utc': comment.created_utc,  # timestamp (Unix format)
            'score': comment.score,              # upvotes/downvotes
        }
        for comment in submission.comments.list()
    )
    # Pause between posts to avoid putting pressure on the API
    time.sleep(2)

In [None]:
# Turn the collected comment records into a table
comment_columns = ['post_id', 'comment_id', 'body', 'created_utc', 'score', 'date']

if not comments_list:
    # Nothing was collected: keep an empty frame with the expected columns
    df_comments = pd.DataFrame(columns=comment_columns)
    print("No comments found in collected posts.")
else:
    df_comments = pd.DataFrame(comments_list)

    # Unix seconds -> datetime
    df_comments['date'] = pd.to_datetime(df_comments['created_utc'], unit='s')

    # Show a small sample of what was collected
    print("First 5 collected comments:")
    print(df_comments.head())

df_comments.to_csv('processed_comments.csv', index=False)

---------------------------------------
# SECTION 3: Data Exploration
---------------------------------------

## 3.1 Overview

### - Overview of the posts

In [None]:
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
df_posts.info()

In [None]:
# Summary statistics for every column of the posts table
posts_summary = df_posts.describe(include='all')
print(posts_summary)

### - Overview of the comments

In [None]:
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
df_comments.info()

In [None]:
# Summary statistics for every column of the comments table
comments_summary = df_comments.describe(include='all')
print(comments_summary)

## 3.2 Sample Preview

In [None]:
# Show up to 3 random posts. The sample size is capped at the number of rows
# because DataFrame.sample raises when asked for more rows than exist.
if df_posts.empty:
    print("No posts to preview.")
else:
    print(df_posts[['title','selftext','date','score']].sample(min(3, len(df_posts))))

In [None]:
# Show up to 3 random comments. The sample size is capped at the number of rows
# because DataFrame.sample raises when asked for more rows than exist.
if df_comments.empty:
    print("No comments to preview.")
else:
    print(df_comments[['body','date','score']].sample(min(3, len(df_comments))))

---------------------------------------
# SECTION 4: Data Cleaning (Preprocessing)
---------------------------------------

## 4.1 Preparing a list of common words

In [None]:
import nltk
from nltk.corpus import stopwords
from wordcloud import STOPWORDS

try:
    # Use the stopwords corpus if it is already installed
    stop_words = set(stopwords.words('english'))
    print("✅ Stopwords loaded successfully from cache")

except LookupError:
    print("⚠️ Stopwords not found. Downloading...")
    # No download_dir is given, so NLTK chooses its own data directory.
    # A hard-coded system path such as '/usr/share/nltk_data' usually needs
    # elevated rights to write to and does not exist on every platform.
    nltk.download('stopwords')

    # Retry now that the corpus has been fetched
    stop_words = set(stopwords.words('english'))
    print("✅ Stopwords downloaded and loaded successfully")

# Merge the NLTK list with WordCloud's built-in stop words
custom_stopwords = set(STOPWORDS).union(stop_words)
print(f"🚀 Created custom stopwords with {len(custom_stopwords)} terms")

## 4.2 Definition of the advanced cleaning function

In [None]:
# Patterns are compiled once because clean_text runs on every post and comment.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_DIGIT_PATTERN = re.compile(r'\d+')
# A word is a run of letters that may contain inner apostrophes ("don't").
_WORD_PATTERN = re.compile(r"[^\W\d_]+(?:'[^\W\d_]+)*")


def clean_text(text, stopword_set=None):
    """Normalise free text into a space-separated string of content words.

    Steps: lowercase, drop URLs, drop digits, split into words (punctuation
    and underscores act as separators), drop stop words, drop words of two
    characters or fewer, and remove apostrophes from the words that remain.

    Args:
        text: The raw text. Anything that is not a ``str`` yields ``""``.
        stopword_set: Optional collection of lowercase stop words. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text; an empty string when nothing is left.
    """
    # Missing values (None, NaN, numbers) carry no text
    if not isinstance(text, str):
        return ""

    blocked = stop_words if stopword_set is None else stopword_set

    text = text.lower()
    # Only real links are removed, not every word that starts with "http"/"www"
    text = _URL_PATTERN.sub(' ', text)
    # Typographic apostrophes are unified so "don’t" is treated like "don't"
    text = text.replace('\u2019', "'")
    text = _DIGIT_PATTERN.sub('', text)

    kept = []
    for word in _WORD_PATTERN.findall(text):
        bare = word.replace("'", '')
        # The word is checked with and without its apostrophe, so stop words
        # such as "don't" are caught before the apostrophe is stripped.
        if word in blocked or bare in blocked:
            continue
        if len(bare) > 2:
            kept.append(bare)

    return ' '.join(kept)

## 4.3 Applying the cleaning function

In [None]:
# 1. Posts: clean the title and the body, then join them into one text field
if not df_posts.empty:
    for raw_col in ('title', 'selftext'):
        # Missing values become empty strings before cleaning
        df_posts[raw_col] = df_posts[raw_col].fillna('')
        df_posts[f'clean_{raw_col}'] = df_posts[raw_col].apply(clean_text)
    # Cleaned title and cleaned body, separated by one space
    df_posts['full_text'] = df_posts['clean_title'] + " " + df_posts['clean_selftext']

In [None]:
# 2. Comments: fill missing bodies, then clean them
if not df_comments.empty:
    comment_bodies = df_comments['body'].fillna('')
    df_comments['body'] = comment_bodies
    df_comments['clean_body'] = comment_bodies.apply(clean_text)

---------------------------------------
# SECTION 5: Sentiment Analysis
---------------------------------------

## 5.1 Sentiment analysis function

In [None]:
from textblob import TextBlob

def analyze_sentiment(text, neutral_threshold=0.15):
    """Score *text* with TextBlob and bucket the polarity into a label.

    Args:
        text: The text to score.
        neutral_threshold: Polarity above this value is labelled 'Positive',
            polarity below its negative is labelled 'Negative', and anything
            in between is 'Neutral'.

    Returns:
        A ``(polarity, label)`` tuple. Falsy input, input shorter than three
        characters once stripped, and any error raised while scoring all give
        ``(0.0, 'Neutral')``; errors are additionally printed.
    """
    neutral_result = (0.0, 'Neutral')

    try:
        # Nothing meaningful to score
        if not text or len(text.strip()) < 3:
            return neutral_result

        polarity = TextBlob(text).sentiment.polarity

        if polarity > neutral_threshold:
            return polarity, 'Positive'
        if polarity < -neutral_threshold:
            return polarity, 'Negative'
        return polarity, 'Neutral'

    except Exception as e:
        print(f"Error analyzing sentiment: {str(e)}")
        return neutral_result

## 5.2 Analysis application

In [None]:
# 1. Analysis application on posts
if not df_posts.empty:
    # Score the original wording rather than 'full_text'. The cleaning step
    # strips stop words, and the NLTK English list contains negations such as
    # "not" and "no", so "not good" would otherwise be scored as "good".
    raw_post_text = df_posts['title'].fillna('') + " " + df_posts['selftext'].fillna('')

    # One (polarity, label) tuple per post, split into the two result columns
    post_scores = [analyze_sentiment(text) for text in raw_post_text]
    df_posts['post_polarity'] = [polarity for polarity, _ in post_scores]
    df_posts['post_sentiment'] = [label for _, label in post_scores]

In [None]:
# 2. Analysis application on comments
if not df_comments.empty:
    # Score the original comment body rather than 'clean_body'. The cleaning
    # step strips stop words, and the NLTK English list contains negations such
    # as "not" and "no", so "not good" would otherwise be scored as "good".
    raw_comment_text = df_comments['body'].fillna('')

    # One (polarity, label) tuple per comment, split into the two result columns
    comment_scores = [analyze_sentiment(text) for text in raw_comment_text]
    df_comments['comment_polarity'] = [polarity for polarity, _ in comment_scores]
    df_comments['comment_sentiment'] = [label for _, label in comment_scores]

## 5.3 Preview the results

In [None]:
# Preview up to 3 random rows of each table. The sample size is capped at the
# table size so that a small collection does not make DataFrame.sample raise.
if not df_posts.empty:
    print(df_posts[['full_text','post_polarity','post_sentiment']].sample(min(3, len(df_posts))))
if not df_comments.empty:
    print(df_comments[['clean_body','comment_polarity','comment_sentiment']].sample(min(3, len(df_comments))))


In [None]:
# 1. Random samples of posts (at most 3; fewer when the table is smaller)
print("="*50)
print("A random sample of sentiment analysis of posts (3 posts):")
if df_posts.empty:
    # Without rows there are no sentiment columns to show
    print("No posts available.")
else:
    print(df_posts[['title', 'full_text', 'post_polarity', 'post_sentiment']].sample(min(3, len(df_posts))))
print("="*50)

In [None]:
# 2. Random samples of comments (at most 5; fewer when the table is smaller)
print("="*50)
print("A random sample of sentiment analysis of comments (5 comments):")
if df_comments.empty:
    # Without rows there are no sentiment columns to show
    print("No comments available.")
else:
    print(df_comments[['body', 'clean_body', 'comment_polarity', 'comment_sentiment']].sample(min(5, len(df_comments))))
print("="*50)

---------------------------------------
# SECTION 6: Statistical Analysis
---------------------------------------

## 6.1 Basic Statistics

In [None]:
# Statistical Summary of Emotions

print("\n" + "="*50)
print("Statistical Summary of Emotions:")
print("="*50)

# The same report is printed for each table: size, mean polarity and the
# count/share of every sentiment label. Empty tables are skipped.
summary_targets = [
    ("Posts", df_posts, 'post_sentiment', 'post_polarity'),
    ("Comments", df_comments, 'comment_sentiment', 'comment_polarity'),
]

for heading, frame, label_col, polarity_col in summary_targets:
    if frame.empty:
        continue

    label_counts = frame[label_col].value_counts()
    n_rows = len(frame)

    print(f"\n[{heading}]")
    print("-"*40)
    print(f"Total Number: {n_rows}")
    print(f"Average Polarity: {frame[polarity_col].mean():.2f}")
    print("\nDistribution:")
    for sentiment in ['Positive', 'Neutral', 'Negative']:
        # A label that never occurs is reported with a count of 0
        count = label_counts.get(sentiment, 0)
        pct = count/n_rows*100
        print(f"  {sentiment}: {count} ({pct:.1f}%)")

print("="*50)

## 6.2 Analysis of Extreme Cases

### - Most positive/negative post

In [None]:
# 1. Posts: the rows with the highest and the lowest polarity
if not df_posts.empty:
    post_polarity = df_posts['post_polarity']
    post_extremes = (
        ("\nMost positive posts:", post_polarity.idxmax),
        ("\nMost negative posts:", post_polarity.idxmin),
    )
    for heading, locate in post_extremes:
        print(heading)
        extreme_post = df_posts.loc[locate()]
        print(f"Evaluation: {extreme_post['post_polarity']:.2f}")
        print(f"Link: {extreme_post['url']}")
        # Only the first 150 characters of the text are shown
        print(f"Text: {extreme_post['full_text'][:150]}...")

### - Most positive/negative comment

In [None]:
# 2. Comments: the rows with the lowest and the highest polarity
if not df_comments.empty:
    comment_polarity = df_comments['comment_polarity']
    comment_extremes = (
        ("\nMost negative comments:", comment_polarity.idxmin),
        ("\nMost positive comments:", comment_polarity.idxmax),
    )
    for heading, locate in comment_extremes:
        print(heading)
        extreme_comment = df_comments.loc[locate()]
        print(f"Evaluation: {extreme_comment['comment_polarity']:.2f}")
        # Only the first 150 characters of the cleaned text are shown
        print(f"Text: {extreme_comment['clean_body'][:150]}...")

print("="*50)

---------------------------------------
# SECTION 7: Basic Data Visualization
---------------------------------------

## 7.1 Sentiment distribution visualization

### - Distribution of posts (Barplot)

In [None]:
# Bar chart of how many posts fall into each sentiment label
if not df_posts.empty:
    plt.figure(figsize=(8,6))
    bar_ax = sns.countplot(
        data=df_posts,
        x='post_sentiment',
        hue='post_sentiment',
        order=['Positive','Neutral','Negative'],
        palette='viridis',
    )
    bar_ax.set_title('Distribution of posts Sentiment')
    bar_ax.set_xlabel('Sentiment')
    bar_ax.set_ylabel('Count')
    plt.savefig('posts_sentiment_dist.png', dpi=300)
    plt.show()

### - Distribution of comments (pie chart)

In [None]:
if not df_comments.empty:
    plt.figure(figsize=(8, 8))

    # Count each label in a fixed order. fill_value=0 keeps a label that never
    # occurs from turning into NaN, and empty slices are then dropped so they
    # do not add a zero-width wedge with an overlapping label.
    sentiment_counts = (
        df_comments['comment_sentiment']
        .value_counts()
        .reindex(['Positive', 'Neutral', 'Negative'], fill_value=0)
    )
    sentiment_counts = sentiment_counts[sentiment_counts > 0]
    plotted_total = int(sentiment_counts.sum())

    # Color coding according to classification
    palette = {'Positive': '#F7CAC9', 'Neutral': '#D8BFD8', 'Negative': '#B8D8D8'}

    def _slice_label(pct):
        """Format a wedge label as 'percent' plus the absolute count."""
        # The count is recovered from a floating-point percentage, so it is
        # rounded; truncating with int() could report one comment too few.
        count = int(round(float(pct) / 100 * plotted_total))
        return f'{pct:.1f}%\n({count})'

    # Donut-style pie with shadows
    plt.pie(sentiment_counts,
            labels=sentiment_counts.index,
            autopct=_slice_label,
            colors=[palette[x] for x in sentiment_counts.index],
            startangle=90,
            wedgeprops={'edgecolor': 'w', 'linewidth': 2},
            shadow=True)

    plt.title('Distribution of comments Sentiment', fontsize=16, pad=20)
    plt.gca().add_artist(plt.Circle((0,0), 0.7, fc='white'))  # A white circle in the middle
    plt.savefig('comments_sentiment_dist.png', dpi=300, bbox_inches='tight')
    plt.show()

### - Comparing the distribution of sentiments between posts and comments

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# One count plot per table, side by side, with the labels in the same order
sentiment_order = ['Positive', 'Neutral', 'Negative']
panels = [
    (ax[0], df_posts, 'post_sentiment', 'viridis',
     'Sentiment Distribution in Posts', 'Number of Posts'),
    (ax[1], df_comments, 'comment_sentiment', 'coolwarm',
     'Sentiment Distribution in Comments', 'Number of Comments'),
]

for axis, frame, column, colors, panel_title, y_label in panels:
    sns.countplot(data=frame, x=column, ax=axis, hue=column,
                  palette=colors, order=sentiment_order)
    axis.set_title(panel_title, fontsize=14)
    axis.set_xlabel('Sentiment')
    axis.set_ylabel(y_label)

plt.tight_layout()
plt.savefig('sentiment_comparison.png', dpi=300)
plt.show()

In [None]:
# Label counts of both tables as plain text
distribution_targets = (
    ("\nDistribution of posts Sentiment:", df_posts, 'post_sentiment'),
    ("\nDistribution of comments Sentiment", df_comments, 'comment_sentiment'),
)
for heading, frame, column in distribution_targets:
    print(heading)
    print(frame[column].value_counts())

## 7.2 The relationship between sentiment and interaction (Engagement)

In [None]:
plt.figure(figsize=(12, 6))
sentiment_order = ['Positive', 'Neutral', 'Negative']

# 1. Posts
plt.subplot(1, 2, 1)
sns.boxplot(data=df_posts, x='post_sentiment', y='score', hue='post_sentiment',
            palette='viridis', order=sentiment_order)
plt.title('Emotions vs Votes (Posts)', fontsize=12)
plt.xlabel('Sentiment')
plt.ylabel('Score (log scale)')
# Collected posts all have a score above 10, so a plain log axis is safe here
plt.yscale('log')

# 2. Comments
plt.subplot(1, 2, 2)
sns.boxplot(data=df_comments, x='comment_sentiment', y='score', hue='comment_sentiment',
            palette='coolwarm', order=sentiment_order)
plt.title('Emotions vs Votes (Comments)', fontsize=12)
plt.xlabel('Sentiment')
# Comment scores are not filtered and can be zero or negative, which a plain
# log axis cannot display; symlog stays linear around zero and keeps them visible.
plt.ylabel('Score (symlog scale)')
plt.yscale('symlog', linthresh=1)

plt.tight_layout()
plt.savefig('sentiment_vs_engagement.png', dpi=300)
plt.show()

### 🔍 Data Reveals:
- Posts thrive on positivity ☀️

- Comments ignite through controversy 🔥

- Neutral content drowns in silence 🔇

### Scientific Insights:

1.   Why Positivity Wins:
 - Positive posts drive sharing ("See how this changed my life!")
 - Negative content sparks empathy but rarely drives engagement


2.   The Controversy Paradox:

 - High engagement comes from "anger responses" not agreement
 - Algorithms favor contentious content


3.  Outliers Decoded:

 *   Exceptional positive posts: Innovative solutions, success stories
 *   Viral negative posts: Collective scandals, public crises


### Strategic Recommendations:
Evidence-based roadmap:


1.   Invest 70% in hope-driven content (build your engagement empire).
2.   Allocate 25% to calculated controversy (but beware: controversy is a double-edged sword — negative engagement burns bright but consumes its creator).
3.   Avoid neutrality except when strategically essential (safe content collects digital dust).

---------------------------------------
# SECTION 8: Temporal Analysis of Sentiment
---------------------------------------

## 8.1 The Monthly Trend of Posts Sentiments

In [None]:
# Add the monthly period column unless it is already present
if 'month' not in df_posts.columns:
    df_posts['month'] = df_posts['date'].dt.to_period('M')

# Mean post polarity per month
monthly_sentiment = df_posts.groupby('month')['post_polarity'].mean()

# Line chart with a dashed reference line at zero polarity
trend_ax = monthly_sentiment.plot(kind='line', marker='o', color='blue')
trend_ax.axhline(y=0, color='red', linestyle='--', alpha=0.7)
trend_ax.set_title('Monthly trend of average post sentiment')
trend_ax.set_ylabel('Average Polarity')
trend_ax.set_xlabel('Month')
trend_ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('Monthly trend of average post sentiment.png', dpi=300)
plt.show()



### Key Findings:
1. Launch Peak (Jan 2023):

  - Record high sentiment (0.35) during product launch

2. Reality Drop (May 2023):

  - 85% decline as users experienced limitations

3. Trust Crisis (Aug 2024 - Present):

  - Historic low (-0.15) in August 2024

  - Consistent negative trend since late 2024

### Action Required:

- Immediate forensic analysis of August 2024 events

- Investigation into persistent post-Nov 2024 decline

- Cross-reference with product usage metrics

## 8.2 Daily Sentiment Trend (Posts + Comments)

In [None]:
plt.figure(figsize=(14, 8))

# Daily mean polarity per content type. Each table is only processed when it
# has rows, so the emptiness check actually protects the resample step.
if not df_posts.empty:
    df_posts['date_only'] = df_posts['date'].dt.date
    daily_posts = df_posts.resample('D', on='date')['post_polarity'].mean()
    plt.plot(daily_posts, marker='o', markersize=5, color='green', label='posts', linewidth=1.5, alpha=0.8)

if not df_comments.empty:
    df_comments['date_only'] = df_comments['date'].dt.date
    daily_comments = df_comments.resample('D', on='date')['comment_polarity'].mean()
    plt.plot(daily_comments, marker='s', markersize=4, color='orange', label='comments', linewidth=1.5, alpha=0.8)

# Add explanatory elements
plt.axhline(y=0, color='r', linestyle='--', alpha=0.7)
plt.title('Time trend of daily sentiment', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Average Polarity', fontsize=12)
plt.legend(title='Content type')
plt.grid(alpha=0.2)
plt.xticks(rotation=45)

# Save the result
plt.savefig('sentiment_trend.png', dpi=300, bbox_inches='tight')
plt.show()

## 8.3 The temporal trend of the comments (percentages)

In [None]:
if not df_comments.empty:
    # Work on a copy so the original table is left untouched
    temp_df = df_comments.copy()

    # Calendar date of each comment
    temp_df['date_only'] = temp_df['date'].dt.date

    sentiment_order = ['Positive', 'Neutral', 'Negative']

    # 1. Comments per day and label. The columns are reindexed to the full
    #    label list so a label that never occurs becomes a column of zeros
    #    instead of a missing column that fails in the plotting loop.
    sentiment_counts = (
        temp_df.groupby(['date_only', 'comment_sentiment'])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=sentiment_order, fill_value=0)
    )

    # 2. Share of each label within its day, in percent
    sentiment_pct = sentiment_counts.div(sentiment_counts.sum(axis=1), axis=0) * 100

    # 3. One line per label
    palette = {'Positive':'green', 'Neutral':'gray', 'Negative':'red'}
    plt.figure(figsize=(12,6))
    for sentiment in sentiment_order:
        sns.lineplot(x=sentiment_pct.index, y=sentiment_pct[sentiment],
                     label=sentiment, marker='o', markersize=4, color=palette[sentiment])

    plt.title('Daily comment sentiment evolution (%)', fontsize=14)
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Percentage', fontsize=12)
    plt.legend(title='Type of feelings')
    plt.grid(alpha=0.2)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('Daily_comment_sentiment_evolution.png', dpi=300, bbox_inches='tight')
    plt.show()

#### **Note**: The analysis is based on a sample of only the first 5 posts - the numbers may change with complete data but the general patterns remain significant.

### Analytical Conclusion: Three Critical Shifts

1. The April Shock:

 - Sharp drop in positivity (60% → 40%) and surge in negativity (20% → 45%) between Apr 24-26

 - Indicates an exceptional event that damaged user trust

2. The May Silence:

 - Neutral sentiment peaked at 75% of interactions

 - Warning sign of disengagement or unexpressed disappointment

3. The Fragile June Recovery:

 - Positivity rebounded to 50% primarily at neutral's expense

 - Negativity decreased only 5% - Imbalanced improvement

### Action Recommendations:

- 🔍 Investigate root causes of Apr 25 shock (updates? external events?)

- 📊 Study neutral sentiment surge on May 15 (lost engagement?)

- 💡 Assess sustainability of Jun 5 recovery (permanent or temporary?)
"Sudden data anomalies are alarms demanding attention"

---------------------------------------
# SECTION 9: Advanced Text Analysis
---------------------------------------

## 9.1 Wordcloud by category of Sentiment

In [None]:
# 1. Stop words for the word clouds: WordCloud's built-in list plus a few
#    extra function words and ChatGPT-related terms
extra_stopwords = ['the', 'and', 'to', 'of', 'a', 'in', 'is', 'it', 'that', 'this', 'was',
                   'chatgpt', 'ai', 'model', 'prompt', 'gpt']
custom_stopwords = set(STOPWORDS) | set(extra_stopwords)

def create_wordcloud(texts, title, sentiment_type, source):
    """Draw, save and show a word cloud for one group of texts.

    Args:
        texts: pandas Series of strings; non-string entries are ignored.
        title: Chart title, also used in the "nothing to plot" messages.
        sentiment_type: 'positive', 'negative' or 'neutral' (any casing).
            Selects the colormap and is used verbatim in the file name.
        source: Prefix of the output file name (e.g. 'Posts', 'Comments').

    Returns:
        None. The image is written to ``{source}_{sentiment_type}_wordcloud.png``
        in the working directory and shown. Nothing is written when there is
        no text to plot.
    """
    # Checking for data availability
    if texts.empty:
        print(f"No texts available for {title}")
        return

    def _strip_social_tokens(text):
        """Drop links, @mentions, #hashtags and the 'RT' marker from one text."""
        if not isinstance(text, str):
            return ""
        kept = [word for word in text.split()
                if not word.startswith(('http', '@', '#')) and word != 'RT']
        return " ".join(kept)

    # Text cleaning and merging
    combined_text = " ".join(texts.apply(_strip_social_tokens))

    # Rows that are blank, or that only held filtered tokens, leave nothing to
    # draw; stop here rather than letting the word cloud generation fail.
    if not combined_text.strip():
        print(f"No texts available for {title}")
        return

    # Color selection (an unknown sentiment type falls back to 'viridis')
    color_map = {
        'positive': 'Greens',
        'negative': 'Reds',
        'neutral': 'Oranges'
    }
    colormap = color_map.get(str(sentiment_type).lower(), 'viridis')

    # Creating a word cloud
    try:
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap=colormap,
            stopwords=custom_stopwords,
            max_words=100
        ).generate(combined_text)
    except ValueError as err:
        # NOTE(review): WordCloud is expected to raise ValueError when every
        # word was removed as a stop word - confirm against the library docs.
        print(f"No words left to plot for {title}: {err}")
        return

    # Show the chart
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16)
    plt.tight_layout()

    # Save the image in the working directory before displaying it
    filename = f"{source}_{sentiment_type}_wordcloud.png"
    plt.savefig(filename, dpi=300, bbox_inches='tight')

    plt.show()


## 9.2 Application on data (posts)

### - WordCloud of Positive Post

In [None]:
# Word cloud of the posts labelled 'Positive'
if not df_posts.empty:
    is_positive_post = df_posts['post_sentiment'] == 'Positive'
    positive_posts = df_posts.loc[is_positive_post, 'full_text']
    create_wordcloud(positive_posts, "Top Words in Positive Posts", 'positive', 'Posts')

### -  WordCloud for Negative Posts

In [None]:
# Word cloud of the posts labelled 'Negative'
if not df_posts.empty:
    is_negative_post = df_posts['post_sentiment'] == 'Negative'
    negative_posts = df_posts.loc[is_negative_post, 'full_text']
    create_wordcloud(negative_posts, "Top Words in Negative Posts", 'negative', 'Posts')

## 9.3 Application on data (Comments)

### - WordCloud of Positive Comment

In [None]:
# Word cloud of the comments labelled 'Positive'
if not df_comments.empty:
    is_positive_comment = df_comments['comment_sentiment'] == 'Positive'
    positive_comments = df_comments.loc[is_positive_comment, 'clean_body']
    create_wordcloud(positive_comments, "Top Words in Positive Comments", 'positive', 'Comments')

### - WordCloud of Negative Comment

In [None]:
# Word cloud of the comments labelled 'Negative'
if not df_comments.empty:
    is_negative_comment = df_comments['comment_sentiment'] == 'Negative'
    negative_comments = df_comments.loc[is_negative_comment, 'clean_body']
    create_wordcloud(negative_comments, "Top Words in Negative Comments", 'negative', 'Comments')

---------------------------------------
# SECTION 10: Conclusions & Recommendations
---------------------------------------


### 🔍 Conclusions  

- **Posts sentiment**: Predominantly positive (~60%) with neutral as secondary (~35%).  
- **Comments sentiment**: Dominated by negativity (57.7%) despite post optimism.  
- **Critical fluctuations**: Sharp sentiment shifts coincided with major updates (e.g., 20% drop in April).  
- **Key themes**:  
  - ✅ Praise: "accuracy," "speed," "creativity"  
  - ❌ Criticism: "errors," "limitations," "privacy concerns"  

### 🚀 Recommendations  

1. **Feature enhancement**  
   - Prioritize "accuracy" and "speed" improvements  
   - Address: privacy issues, academic limitations, response errors  

2. **Sentiment monitoring**  
   - Track sentiment pre-/post-updates  
   - Alert team for >15% fluctuations  

3. **Neutral-to-positive conversion**  
   - Launch prompts: *"Share a creative use case you loved!"*  
   - Target neutrals with topic-based content (e.g., "research tips")  

4. **Data integrity**  
   - Expand analysis to 50+ posts  
   - Use stratified sampling  

> ⚠️ **Methodological Note**:  
> Analysis based on prototype sample (first 5 posts' comments).  
> Patterns are indicative; absolute values may shift with full data.  

---------------------------------------
# SECTION 11: Save the Results & Outputs
---------------------------------------


In [None]:
import os
import shutil

# The project folder is taken to be the parent of the current working directory
# (the notebook is expected to run from a sub-folder of the project).
project_folder = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Destination folders
processed_data_path = os.path.join(project_folder, 'processed_data')
outputs_path = os.path.join(project_folder, 'outputs')

# Create the destination folders when they do not exist yet
for path in [processed_data_path, outputs_path]:
    os.makedirs(path, exist_ok=True)

# CSV files go to processed_data
csv_files = ['processed_posts.csv', 'processed_comments.csv']

# Images go to outputs
image_files = [
    'posts_sentiment_dist.png',
    'comments_sentiment_dist.png',
    'sentiment_comparison.png',
    'sentiment_vs_engagement.png',
    'Monthly trend of average post sentiment.png',
    'sentiment_trend.png',
    'Daily_comment_sentiment_evolution.png',
    # Word clouds
    'Posts_positive_wordcloud.png',
    'Posts_negative_wordcloud.png',
    'Comments_positive_wordcloud.png',
    'Comments_negative_wordcloud.png',
]

# Move whatever exists and keep track of what was missing, so the final
# message reflects what actually happened.
moved, missing = [], []
for target_dir, filenames in [(processed_data_path, csv_files), (outputs_path, image_files)]:
    for filename in filenames:
        if os.path.exists(filename):
            shutil.move(filename, os.path.join(target_dir, filename))
            moved.append(filename)
        else:
            missing.append(filename)

if missing:
    print(f"⚠️ {len(missing)} expected file(s) were not found and were skipped: {', '.join(missing)}")
if moved:
    print("✅ CSV files and images moved successfully to the main project folder.")
else:
    print("⚠️ No files were moved.")



### ===============================================================================================