In [2]:
import csv
import os
import random
from datetime import datetime, timedelta

# Create input directory if it doesn't exist
os.makedirs("input", exist_ok=True)

# Sample users
user_data = []
usernames = [
    "@techie42", "@critic99", "@daily_vibes", "@designer_dan", "@rage_user",
    "@meme_lord", "@social_queen", "@calm_mind", "@pixel_pusher", "@stream_bot"
]
age_groups = ["Teen", "Adult", "Senior"]
countries = ["US", "UK", "Canada", "India", "Germany", "Brazil"]
verified_status = [True, False]

for user_id in range(1, 9):
    user = {
        "UserID": user_id,
        "Username": usernames[user_id - 1],
        "AgeGroup": random.choice(age_groups),
        "Country": random.choice(countries),
        "Verified": random.choice(verified_status)
    }
    user_data.append(user)

# Write users.csv
with open("input/users.csv", mode="w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=user_data[0].keys())
    writer.writeheader()
    writer.writerows(user_data)

# Sample posts
hashtags_pool = ["#tech", "#fail", "#design", "#UX", "#cleanUI", "#mood", "#bug", "#love", "#social", "#AI"]
contents = [
    "Loving the new update!",
    "This app keeps crashing. So annoying.",
    "Just another day...",
    "Absolutely love the UX!",
    "Worst experience ever.",
    "Such a smooth interface!",
    "Great performance on mobile.",
    "Can’t stop using it!",
    "Needs dark mode ASAP!",
    "I’m impressed with the speed."
]

posts_data = []
base_time = datetime.now()

for post_id in range(101, 201):
    uid = random.randint(1, 10)
    timestamp = (base_time - timedelta(hours=random.randint(0, 240))).strftime("%Y-%m-%d %H:%M:%S")
    content = random.choice(contents)
    likes = random.randint(0, 150)
    retweets = random.randint(0, 50)
    sentiment = round(random.uniform(-1, 1), 2)
    hashtags = ",".join(random.sample(hashtags_pool, random.randint(1, 3)))

    post = {
        "PostID": post_id,
        "UserID": uid,
        "Content": content,
        "Timestamp": timestamp,
        "Likes": likes,
        "Retweets": retweets,
        "Hashtags": hashtags,
        "SentimentScore": sentiment
    }
    posts_data.append(post)

# Write posts.csv
with open("input/posts.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=posts_data[0].keys())
    writer.writeheader()
    writer.writerows(posts_data)

print("✅ Dataset generation complete: 'users.csv' and 'posts.csv' created in /input/")

✅ Dataset generation complete: 'users.csv' and 'posts.csv' created in /input/


In [3]:
import pandas as pd

# Load datasets
posts_df = pd.read_csv("input/posts.csv")

# Split Hashtags and expand into individual tags
hashtags = posts_df['Hashtags'].str.split(',', expand=True).stack()

# Count frequency of each hashtag
hashtag_count = hashtags.value_counts().reset_index()
hashtag_count.columns = ['Hashtag', 'Count']

# Get the top 10 most used hashtags
top_hashtags = hashtag_count.head(10)
print(top_hashtags)


    Hashtag  Count
0   #social     26
1       #UX     24
2  #cleanUI     23
3     #mood     23
4      #bug     20
5     #tech     20
6     #love     20
7       #AI     18
8   #design     16
9     #fail     12


In [4]:
# Load datasets
posts_df = pd.read_csv("input/posts.csv")
users_df = pd.read_csv("input/users.csv")

# Merge posts with users on UserID
merged_df = pd.merge(posts_df, users_df, on="UserID")

# Group by AgeGroup and calculate average likes and retweets
age_group_engagement = merged_df.groupby('AgeGroup').agg(
    avg_likes=('Likes', 'mean'),
    avg_retweets=('Retweets', 'mean')
).reset_index()

# Sort by average engagement (likes + retweets)
age_group_engagement['total_engagement'] = age_group_engagement['avg_likes'] + age_group_engagement['avg_retweets']
age_group_engagement = age_group_engagement.sort_values('total_engagement', ascending=False)

print(age_group_engagement[['AgeGroup', 'avg_likes', 'avg_retweets']])


  AgeGroup  avg_likes  avg_retweets
1   Senior  75.093023     29.255814
2     Teen  67.958333     26.250000
0    Adult  66.000000     25.333333


In [5]:
# Load datasets
posts_df = pd.read_csv("input/posts.csv")

# Categorize Sentiment
def categorize_sentiment(score):
    if score > 0.2:
        return 'Positive'
    elif score < -0.2:
        return 'Negative'
    else:
        return 'Neutral'

posts_df['SentimentCategory'] = posts_df['SentimentScore'].apply(categorize_sentiment)

# Group by sentiment category and calculate average likes and retweets
sentiment_engagement = posts_df.groupby('SentimentCategory').agg(
    avg_likes=('Likes', 'mean'),
    avg_retweets=('Retweets', 'mean')
).reset_index()

print(sentiment_engagement)


  SentimentCategory  avg_likes  avg_retweets
0          Negative  64.617647     30.705882
1           Neutral  76.187500     23.500000
2          Positive  71.140000     28.520000


In [7]:
# Load datasets
posts_df = pd.read_csv("input/posts.csv")
users_df = pd.read_csv("input/users.csv")

# Merge posts with users on UserID
merged_df = pd.merge(posts_df, users_df, on="UserID")

# Filter verified users
verified_users = merged_df[merged_df['Verified'] == True].copy()  # Create a copy to avoid SettingWithCopyWarning

# Calculate total reach as Likes + Retweets using .loc[]
verified_users.loc[:, 'TotalReach'] = verified_users['Likes'] + verified_users['Retweets']

# Group by Username and sum the total reach
user_reach = verified_users.groupby('Username').agg(total_reach=('TotalReach', 'sum')).reset_index()

# Get top 5 users by total reach
top_verified_users = user_reach.sort_values('total_reach', ascending=False).head(5)

print(top_verified_users)


        Username  total_reach
2  @social_queen          845
3      @techie42          822
1     @rage_user          577
0  @designer_dan          566
