In [16]:
import pandas as pd
# Load the cleaned sentiment dataset into `df` and preview the first rows.
CLEANED_CSV_PATH = "../data/sentimentdataset_cleaned.csv"
df = pd.read_csv(CLEANED_CSV_PATH)
df.head()

Unnamed: 0,Id,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,...,Day,Hour,text_len,word_count,hashtag_count,emoji_count,SentimentSimple,Sentiment_norm,TextLength,LengthGroup
0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,Usa,...,15,12,52,7,2,1,Positive,Positive,52,30-59
1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,...,15,8,52,5,2,1,Negative,Negative,52,30-59
2,2,Just finished an amazing workout! ðŸ’ª ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,Usa,...,15,15,51,6,2,2,Positive,Positive,51,30-59
3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,United Kingdom,...,15,18,52,6,2,1,Positive,Positive,52,30-59
4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,...,15,19,52,8,2,1,Neutral,Neutral,52,30-59


$$\text{debate\_score} = z(\text{mean\_rt\_like\_ratio}) + z(\text{mean\_word\_count}) + z(\text{sentiment\_std})$$

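We built a debate index for each platform from three metrics taken from our earlier options: the average 
retweet-to-like ratio, the average word count per post, and the standard deviation of sentiment (with sentiment coded 
as -1 = Negative, 0 = Neutral, +1 = Positive). Each metric is standardized across platforms (converted to a z-score) 
and the three z-scores are summed. The platform with the highest debate index is classified as the debate platform, 
and the other two as show-and-tell platforms. Because the z-scores are computed across only three platforms, the index 
expresses how the platforms rank relative to one another, not how large the underlying differences are.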


In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Debate-index platform classification
# ------------------------------------
# Builds a per-platform "debate score" as the sum of three standardized
# metrics (mean rt_like_ratio, mean word_count, std of a numeric sentiment
# score), labels the top-scoring platform "Debate" and every other platform
# "Show-tell", then writes that label back onto each post in `df`.
#
# Reads:  df["SentimentSimple"], df["Platform"], df["rt_like_ratio"],
#         df["word_count"], df["Id"]
# Writes: df["sentiment_score"], df["platform_type_debateIndex"],
#         df["is_debate_platform_debateIndex"]
#
# The cell is safe to re-run: every output column is assigned directly, so a
# second execution overwrites the columns instead of duplicating them.
# NOTE(review): `rt_like_ratio` is not created in this cell; it is assumed to
# already be a column of `df` — confirm where it is computed.

# 0. Map SentimentSimple to numeric sentiment_score  (-1,0,1)
sentiment_map = {
    "Negative": -1,
    "Neutral": 0,
    "Positive": 1
}
df["sentiment_score"] = df["SentimentSimple"].map(sentiment_map)

# dropna=False keeps a NaN row in the counts, so any label that is missing
# from sentiment_map shows up here instead of being silently ignored.
print("Sentiment_score value counts:")
print(df["sentiment_score"].value_counts(dropna=False))
print()

# 1. Aggregate 3 metrics by platform
by_platform = (
    df.groupby("Platform")
      .agg(
          mean_rt_like_ratio=("rt_like_ratio", "mean"),   # Option 1
          mean_word_count=("word_count", "mean"),         # Option 2
          sentiment_std=("sentiment_score", "std")        # Option 3
      )
)

print("Raw platform stats (before standardization):")
print(by_platform, "\n")

# 2. Standardize the three metrics
# Each metric column is standardized across the platform rows, so the scores
# are only meaningful relative to the other platforms in this table.
metric_cols = ["mean_rt_like_ratio", "mean_word_count", "sentiment_std"]
z_cols = ["z_rt", "z_wc", "z_sentstd"]

scaler = StandardScaler()
by_platform[z_cols] = scaler.fit_transform(by_platform[metric_cols])

# 3. Debate index = sum of z-scores
by_platform["debate_score"] = by_platform[z_cols].sum(axis=1)

# 4. Classify: highest debate_score = Debate, others = Show-tell
debate_platform = by_platform["debate_score"].idxmax()
print(f"Platform with highest debate_score: {debate_platform}\n")

by_platform["platform_type_rule"] = np.where(
    by_platform.index == debate_platform,
    "Debate",
    "Show-tell"
)

print("Final platform classification based on debate_score:")
print(by_platform[metric_cols + ["debate_score", "platform_type_rule"]], "\n")

# 5. Attach the platform type to every post.
# Looking the label up by platform name (instead of merge + rename) keeps the
# row index of `df` and overwrites the column on a re-run; the previous
# merge/rename produced a duplicate "platform_type_debateIndex" column the
# second time the cell was executed.
df["platform_type_debateIndex"] = df["Platform"].map(
    by_platform["platform_type_rule"]
)
df["is_debate_platform_debateIndex"] = (
    df["platform_type_debateIndex"] == "Debate"
).astype(int)

# 6. Print final result
print("Platform type (one row per platform):")
print(df[["Platform", "platform_type_debateIndex"]].drop_duplicates(), "\n")

print("Head of df with new platform type column:")
print(df[["Id", "Platform", "rt_like_ratio",
          "platform_type_debateIndex", "is_debate_platform_debateIndex"]].head())


Sentiment_score value counts:
sentiment_score
 1    336
-1    336
 0     60
Name: count, dtype: int64

Raw platform stats (before standardization):
           mean_rt_like_ratio  mean_word_count  sentiment_std
Platform                                                     
Facebook             0.487776        13.008658       0.956144
Instagram            0.488736        13.325581       0.958242
Twitter              0.488900        13.267490       0.963169 

Platform with highest debate_score: Twitter

Final platform classification based on debate_score:
           mean_rt_like_ratio  mean_word_count  sentiment_std  debate_score  \
Platform                                                                      
Facebook             0.487776        13.008658       0.956144     -3.827127   
Instagram            0.488736        13.325581       0.958242      1.121942   
Twitter              0.488900        13.267490       0.963169      2.705184   

          platform_type_rule  
Platform       