# 05 Risk Scoring





In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the processed texts-with-narratives CSV and preview its first rows.
narratives_csv = "../data/processed/text_with_narratives.csv"
df = pd.read_csv(narratives_csv)
df.head()


Unnamed: 0,text,clean_text,word_count,topic,narrative
0,Vaccines cause serious side effects according ...,vaccine cause serious side effect accord report,7,0,"cause, cause effect, vaccine cause"
1,5G towers are spreading harmful radiation.,g tower spread harmful radiation,5,0,"cause, cause effect, vaccine cause"
2,Government confirms vaccine safety after trials.,government confirm vaccine safety trial,5,1,"vaccine, medium claim, confirm vaccine"
3,Social media claims about microchips in vaccin...,social medium claim microchip vaccine spread r...,7,1,"vaccine, medium claim, confirm vaccine"


In [3]:
# Fail fast if either column used by the scoring cells below is absent.
assert {"topic", "clean_text"}.issubset(df.columns)


In [4]:
# Count the rows belonging to each topic, then attach that count to every row
# so each text carries the size of the narrative it falls under.
topic_counts = df["topic"].value_counts()
narrative_size = topic_counts.to_dict()
df["narrative_size"] = df["topic"].map(narrative_size)
df.head()


Unnamed: 0,text,clean_text,word_count,topic,narrative,narrative_size
0,Vaccines cause serious side effects according ...,vaccine cause serious side effect accord report,7,0,"cause, cause effect, vaccine cause",2
1,5G towers are spreading harmful radiation.,g tower spread harmful radiation,5,0,"cause, cause effect, vaccine cause",2
2,Government confirms vaccine safety after trials.,government confirm vaccine safety trial,5,1,"vaccine, medium claim, confirm vaccine",2
3,Social media claims about microchips in vaccin...,social medium claim microchip vaccine spread r...,7,1,"vaccine, medium claim, confirm vaccine",2


In [13]:
# Seed NumPy's global RNG so the draws below repeat on every run.
np.random.seed(42)
# Store one uniform draw from [0.4, 1.0) per row as the misinformation probability.
# NOTE(review): these values are random and do not depend on the text --
# presumably a placeholder for a real classifier score; confirm before
# interpreting the downstream risk_score.
df["misinformation_prob"] = np.random.uniform(0.4, 1.0, size=len(df))


In [14]:
# Min-max scale narrative_size into [0, 1]. When every narrative has the same
# size there is no range to scale by, so each row gets the neutral value 0.5.
sizes = df["narrative_size"]
smallest, largest = sizes.min(), sizes.max()

if smallest != largest:
    normalized = (sizes - smallest) / (largest - smallest)
else:
    normalized = 0.5

df["norm_narrative_size"] = normalized


In [15]:
# Weighted blend of the two signals: 60% misinformation probability,
# 40% normalised narrative size.
PROB_WEIGHT = 0.6
SPREAD_WEIGHT = 0.4

df["risk_score"] = df["misinformation_prob"].mul(PROB_WEIGHT).add(
    df["norm_narrative_size"].mul(SPREAD_WEIGHT)
)


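Risk Score =
  (0.6 × misinformation probability)
+ (0.4 × narrative spread)

where "narrative spread" is the min–max normalized narrative size (`norm_narrative_size`), fixed at 0.5 when all narratives have the same size.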


In [16]:
def risk_level(score, high=0.75, medium=0.5):
    """Map a numeric risk score to a categorical risk label.

    Parameters
    ----------
    score : float
        Risk score to classify.
    high : float, default 0.75
        Inclusive lower bound for "High Risk".
    medium : float, default 0.5
        Inclusive lower bound for "Medium Risk"; must not exceed ``high``.

    Returns
    -------
    str
        "High Risk" if ``score >= high``, "Medium Risk" if
        ``medium <= score < high``, otherwise "Low Risk".

    Raises
    ------
    ValueError
        If ``medium`` is greater than ``high`` (the medium band would be
        unreachable).

    Notes
    -----
    A NaN score fails both comparisons and is therefore labelled "Low Risk".
    """
    if medium > high:
        raise ValueError("medium threshold must not exceed high threshold")

    if score >= high:
        return "High Risk"
    if score >= medium:
        return "Medium Risk"
    return "Low Risk"

# Label every row by passing its risk score through risk_level, element by element.
df["risk_level"] = df["risk_score"].map(risk_level)


In [17]:
# Preview the rows labelled "High Risk", showing only the columns of interest.
is_high_risk = df["risk_level"] == "High Risk"
preview_columns = ["clean_text", "topic", "risk_score"]
df.loc[is_high_risk, preview_columns].head()


Unnamed: 0,clean_text,topic,risk_score
1,g tower spread harmful radiation,0,0.782257


In [18]:
# Summarise risk per narrative (topic): mean risk score and number of rows,
# ordered from the riskiest narrative down.
# "size" counts every row in the group, whereas "count" on clean_text would
# skip rows whose clean_text is missing and so could disagree with the
# per-row narrative_size column derived from the topic counts.
narrative_risk = (
    df.groupby("topic")
      .agg(
          avg_risk_score=("risk_score", "mean"),
          narrative_size=("risk_score", "size"),
      )
      .reset_index()
      .sort_values(by="avg_risk_score", ascending=False)
)

narrative_risk.head()


Unnamed: 0,topic,avg_risk_score,narrative_size
1,1,0.679517,2
0,0,0.678546,2


In [19]:
# Write the row-level scores and the per-narrative summary to CSV (without the
# index), then confirm on stdout.
row_scores_csv = "../data/processed/text_with_risk_scores.csv"
narrative_summary_csv = "../data/processed/narrative_risk_summary.csv"

df.to_csv(row_scores_csv, index=False)
narrative_risk.to_csv(narrative_summary_csv, index=False)

print("Risk scoring results saved successfully.")


Risk scoring results saved successfully.
