# Dimension Validation

Goal: For each dimension, take words that align with or oppose the dimension. Collect all the subreddits that use those words in their comments and take the 30 subreddits that use those words the most (as a proportion of their comments). Hand-label each subreddit as pro, against, or unclear for that dimension and project it onto the dimension. See if the political dimension picks up on the differences.

* Ideology Words:
    * Left Words: progressive, socialism
    * Right Words: conservative, libtard (gotta pick up those alt right ones), based
* Religious: 
    * Religious: bible, jesus, pray, church, mosque
    * Secular: atheism
* Birth Control Words: abortion, feminism, feminazi, "birth control"
* Gun Control Words: 
    * More Regulation: "gun control", "gun nuts", "gun regulation"
    * Less Regulation: "gun rights", "second amendment", "2 amendment"
* Age Words: 
    * Older: career, family, retirement, mortgage
    * Younger: "high school", homework, internship, crush
* Globalism Words: "free trade", "globalism", neoliberal, Friedman
* War Words: war, troops, veterans, 9/11, "drone strike", "war on terror"
* Gender Words: mom, dad, "make up"


In [1]:
import findspark
findspark.init("/h/224/cameron/spark-3.0.0-preview2-bin-hadoop2.7")
from utils import load_embedding, parse_tup, cos_sim, cos_dist
from kaleido.scopes.plotly import PlotlyScope
from pyspark.sql import SparkSession
from tqdm.auto import tqdm
from datetime import datetime
import plotly.express as px
import plotly.express as go
import pandas as pd
import numpy as np
# Kaleido scope used further down to render Plotly figures to PDF bytes.
scope = PlotlyScope()

# Candidate subreddits, each paired (in order) with a colour from Plotly's
# default qualitative palette.
left_candidates = [
    "JoeBiden",
    "SandersForPresident",
    "BaemyKlobaechar",
    "ElizabethWarren",
    "Pete_Buttigieg",
    "YangForPresidentHQ",
]
mapping = dict(zip(left_candidates, px.colors.qualitative.Plotly))

In [2]:
# Start (or reuse) the Spark session and drop anything cached by earlier runs.
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.getConf().getAll()
spark.catalog.clearCache()

# Load the Parquet data
raw_comments = spark.read.load("/comments_2019.parquet").fillna("")
subreddit_table = spark.read.load("dataframes/subreddits.parquet")

# Left-semi join: keep only comments whose subreddit appears in the subreddit
# table, without pulling any of that table's columns into `comments`.
cols = ['author','subreddit','body','created_utc']
comments = (
    raw_comments
    .join(subreddit_table, ['subreddit'], 'leftsemi')
    .select(*cols)
)

# Expose the subreddit table's `count` column as `totalCount` so it does not
# clash with the per-dimension `count` computed later.
subreddits = subreddit_table.selectExpr("subreddit as subreddit","count as totalCount")
comments.printSchema()

root
 |-- author: string (nullable = false)
 |-- subreddit: string (nullable = false)
 |-- body: string (nullable = false)
 |-- created_utc: integer (nullable = true)



## Find Top 30

For each dimension, the 30 subreddits in which the highest proportion of comments use the representative words.

In [33]:
import pyspark.sql.functions as f

# CSV that caches the per-dimension results and holds the hand-entered labels.
VALIDATION_CSV = "external_datasets/validation/dimension_validation_2.csv"
VALIDATION_COLUMNS = ["subreddit","count","proportion","dim","label","alignment"]
# Number of subreddits kept per dimension.
TOP_N = 30

dim_names = ["Ideology","Religiosity","Birth Control","Gun Control","Age","Trade","War","Gender"]
word_pairs = [["progressive","conservative","libtard","socialism","capitalism","based"],
              ["bible", "jesus", "atheism", "pray", "church", "mosque"],
              ["abortion", "feminism", "feminazi", "birth control"],
              ["gun control", "gun rights", "second amendment", "2 amendment"],
              ["career", "family", "retirement", "mortgage", "high school", "homework", "internship","crush"],
              ["free trade", "globalism","neoliberal","friedman"],
              ["war", "troops", "veterans", "9/11", "drone strike","war on terror"],
              ["mom", "dad", "make up"]]

try:
    counts_df = pd.read_csv(VALIDATION_CSV, index_col=0)
except FileNotFoundError as e:
    # Only a *missing* cache may be re-initialised. Any other read failure
    # (parse error, permissions, ...) must propagate: writing an empty frame
    # over the file here would destroy the hand-entered labels.
    print(e)
    counts_df = pd.DataFrame(columns=VALIDATION_COLUMNS)
    counts_df.to_csv(VALIDATION_CSV)

# Dimensions already present in the cache are skipped, so re-running the cell
# only computes what is missing and never touches labelled rows.
done_dims = set(counts_df["dim"])

for name, words in tqdm(list(zip(dim_names, word_pairs))):
    if name in done_dims:
        continue
    print(name, words)
    # NOTE(review): rlike performs a case-sensitive regex search anywhere in
    # the body, so short words also match inside longer ones (e.g. "war" in
    # "dwarf") and "friedman" does not match "Friedman". Left unchanged so new
    # rows stay comparable with the rows already labelled — confirm intent.
    pattern = '|'.join(words)
    matches = comments[comments['body'].rlike(pattern)].groupBy("subreddit").count()
    matches = matches.join(subreddits, on=['subreddit'], how='left')
    matches = matches.withColumn("proportion", (f.col("count") / f.col("totalCount")))
    top_df = (
        matches
        .orderBy('proportion', ascending=False)
        .limit(TOP_N)
        .select("subreddit", "count", "proportion")
        .toPandas()
    )
    top_df = top_df.assign(dim=name, label="NA", alignment="NA")
    # The cache file was written with an index column, so materialise the
    # 0..N-1 index as the first column before appending header-less rows.
    top_df = top_df.reset_index()
    top_df.to_csv(VALIDATION_CSV, index=False, mode='a', header=False)

counts_df = pd.read_csv(VALIDATION_CSV, index_col=0)
counts_df


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




Unnamed: 0,subreddit,count,proportion,dim,label,alignment
0,AquaSama,16607,0.540153,Religiosity,,
1,PrayerRequests,4812,0.301202,Religiosity,Religious,Aligned
2,Anglicanism,6392,0.223169,Religiosity,Religious,Aligned
3,mormon,26108,0.215051,Religiosity,Religious,Aligned
4,RPChristians,2151,0.206311,Religiosity,Religious,Aligned
...,...,...,...,...,...,...
25,BrasilSimulator,12000,0.169456,Gender,,
26,portugal,88447,0.167543,Gender,,
27,brasilivre,94184,0.159148,Gender,,
28,rwbyRP,4085,0.158260,Gender,,


## Label Dimensions

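Done by hand in VS Code: the `label` and `alignment` columns of the validation CSV written above were filled in manually.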

## Dimension Validation

In [4]:
# Load the trained subreddit embedding from disk via utils.load_embedding.
# NOTE(review): later cells index it with `embedding.loc[<subreddit>]`, so it is
# presumably a pandas DataFrame keyed by subreddit name — confirm against utils.
# NOTE(review): absolute, user-specific path; consider a configurable base directory.
embedding = load_embedding("/h/224/cameron/Political-Subreddit-Embedding/trained_embeddings/vecs_0.0028_15.0.txt",split=False)


In [5]:
from dataclasses import dataclass

@dataclass
class Dimension:
    """A named dimension defined by pairs of contrasting subreddits."""
    # Display name; used as the y-axis category in the validation plot and
    # matched against the `dim` column of the validation table.
    name: str
    # List of (a, ap) subreddit pairs; calculate_dim turns each into a - ap.
    subreddits: list
    # (label for the `a` pole, label for the `ap` pole); get_projection maps a
    # row label equal to tags[0] to "Aligned" and tags[1] to "Opposed".
    tags: tuple
        
def calculate_dim(dimensions,e,return_stats=False):
    """
    Compute the direction defined by one or more subreddit pairs.

    Each pair (a, ap) contributes the difference vector ``e.loc[a] - e.loc[ap]``,
    i.e. the transformation from ap -> a. For example, with AskMen (a) and
    AskWomen (ap) the "masculine" transformation is a - ap.

    Parameters
    ----------
    dimensions : list of (a, ap) tuples, or a single (a, ap) tuple
        Pairs of row labels of ``e``. A bare pair is treated as a one-element list.
    e : object supporting ``e.loc[label]``
        Embedding lookup returning a numeric vector per label (e.g. a pandas
        DataFrame indexed by subreddit name).
    return_stats : bool, default False
        When True, also return agreement statistics between the individual
        transformations.

    Returns
    -------
    transformation : numpy.ndarray
        The single difference vector, or the element-wise mean of all of them.
    (transformation, (mean, std)) : tuple, only when ``return_stats`` is True
        Mean and standard deviation of ``cos_sim`` over every ordered pair of
        distinct transformations; ``(nan, nan)`` when fewer than two pairs
        were supplied.
    """
    pairs = dimensions if isinstance(dimensions, list) else [dimensions]
    transformations = np.array([e.loc[a] - e.loc[ap] for (a, ap) in pairs])
    if transformations.shape[0] == 1:
        transformation = transformations.flatten()
    else:
        transformation = np.mean(transformations, axis=0)
    if not return_stats:
        return transformation

    n = len(transformations)
    if n < 2:
        # No pair of transformations to compare; avoid np.mean([]) and its warning.
        return transformation, (np.nan, np.nan)
    # Skip self-comparisons by position rather than by value, so two different
    # subreddit pairs that happen to yield the same vector are still counted.
    # Both orderings (i, j) and (j, i) are kept to match the original statistics.
    sims = [cos_sim(transformations[i], transformations[j])
            for i in range(n) for j in range(n) if i != j]
    return transformation, (np.mean(sims), np.std(sims))

def project_dim(dim, sub, e):
    """Score subreddit `sub` against the direction `dim` via `cos_sim(dim, e.loc[sub])`."""
    sub_vec = e.loc[sub]
    return cos_sim(dim, sub_vec)

## Validate Dimensions

In [6]:
# Seed pairs for every dimension. Each tuple is (a, ap): the first subreddit
# sits at the pole named by tags[0], the second at the pole named by tags[1].
religious_subs = [
    ("Christianity", "exchristian"),
    ("TraditionalCatholics", "excatholic"),
    ("lds", "exmormon"),
    ("mormon", "exmormon"),
    ("islam", "exmuslim"),
]

ideological_subs = [
    ("Conservative", "progressive"),
    ("Republican", "democrats"),
    ("conservatives", "SocialDemocracy"),
    ("TheNewRight", "WeAreNotAsking"),
    ("neoliberal", "dsa"),
    ("Capitalism", "capitalism_in_decay"),
    ("walkaway", "LateStageImperialism"),
]

bc_subs = [
    ("prolife", "prochoice"),
    ("prolife", "birthcontrol"),
]

gc_subs = [
    ("Firearms", "GunsAreCool"),
    ("progun", "GunsAreCool"),
]

age_subs = [
    ("RedditForGrownups", "teenagers"),
    ("RedditForGrownups", "teenagersnew"),
]

war_subs = [
    ("CredibleDefense", "EndlessWar"),
    ("WarCollege", "EndlessWar"),
    ("Intelligence", "EndlessWar"),
]

glob_subs = [
    ("neoliberal", "EnoughLibertarianSpam"),
    ("Economics", "capitalism_in_decay"),
    ("Libertarian", "EnoughLibertarianSpam"),
]

gender_subs = [
    ("daddit", "Mommit"),
    ("AskMen", "AskWomen"),
]

religious_dim = Dimension(name="Religiosity", subreddits=religious_subs, tags=("Religious", "Secular"))
ideological_dim = Dimension(name="Ideology", subreddits=ideological_subs, tags=("Conservative", "Progressive"))
bc_dim = Dimension(name="Birth Control", subreddits=bc_subs, tags=("Pro-Life", "Pro-Choice"))
gc_dim = Dimension(name="Gun Control", subreddits=gc_subs, tags=("Less Regulation", "More Regulation"))
age_dim = Dimension(name="Age", subreddits=age_subs, tags=("Older", "Younger"))
war_dim = Dimension(name="War", subreddits=war_subs, tags=("Hawkish", "Pacifistic"))
glob_dim = Dimension(name="Trade", subreddits=glob_subs, tags=("Pro-Trade", "Anti-Trade"))
gender_dim = Dimension(name="Gender", subreddits=gender_subs, tags=("Masculine", "Feminine"))

# Lookup by dimension name; insertion order fixes the order used when plotting.
dim_dict = {
    d.name: d
    for d in (
        ideological_dim,
        religious_dim,
        bc_dim,
        gc_dim,
        age_dim,
        glob_dim,
        war_dim,
        gender_dim,
    )
}

In [36]:
def get_projection(**kwargs):
    """
    Project one validation row onto its dimension.

    Reads the ``subreddit``, ``dim`` and ``label`` keys from ``kwargs`` (other
    keys are ignored) and returns ``(projection, aligned)``: the subreddit's
    projection onto the dimension named by ``dim``, and "Aligned" / "Opposed" /
    "NA" depending on whether ``label`` equals the dimension's first tag, its
    second tag, or neither.
    """
    subreddit_name = kwargs["subreddit"]
    dimension = dim_dict[kwargs["dim"]]
    row_label = kwargs["label"]

    if row_label == dimension.tags[0]:
        aligned = "Aligned"
    elif row_label == dimension.tags[1]:
        aligned = "Opposed"
    else:
        aligned = "NA"

    direction = calculate_dim(dimension.subreddits, embedding)
    return project_dim(direction, subreddit_name, embedding), aligned
# Project every row onto its dimension; each row's fields are handed to
# get_projection as keyword arguments and the returned pair is split in two columns.
projected = counts_df.apply(lambda row: get_projection(**row), axis=1, result_type="expand")
counts_df[["projection","aligned"]] = projected

# Replace missing labels with the literal "NA", order by dimension name
# (descending) and add a constant marker size for the scatter plot.
counts_df = (
    counts_df
    .fillna("NA")
    .sort_values("dim", ascending=False)
    .assign(size=10)
)
counts_df.head()

Unnamed: 0,subreddit,count,proportion,dim,label,Alignment,projection,aligned,size,symbol
7,unknownvideos,4219,0.310243,War,,,-0.111036,,10,square
14,resumes,17206,0.254448,War,,,0.112801,,10,square
26,dwarffortress,21998,0.219831,War,,,0.094139,,10,square
25,PublishProtocol,3380,0.220325,War,,,-0.078089,,10,square
24,WarCollege,6204,0.22948,War,Hawkish,Aligned,0.411829,Aligned,10,square


In [50]:
# Capitalise the hand-entered alignment column so the legend title reads
# "Alignment". Renaming a column that is already renamed is a no-op.
counts_df = counts_df.rename(columns={"alignment": "Alignment"})

# One row of points per dimension: x is the projection, colour is the hand label.
fig = px.scatter(counts_df,
                 x="projection",
                 color="Alignment",
                 size="size",
                 hover_data=["subreddit","count","label"],
                 y="dim",
                 labels=dict(dim="",projection="Projection"))

# Pole labels sit symmetrically at +/-x, 20% beyond the largest absolute projection.
x = counts_df["projection"].abs().max() * 1.2


def _pole_annotation(x_pos, dim_name, tag, ay):
    """Build the bold text annotation naming one pole of a dimension's row."""
    return dict(
        x=x_pos,
        y=dim_name,
        xref="x",
        yref="y",
        text=f"<b>{tag}</b>",
        showarrow=False,
        font=dict(size=16),
        ax=-40,
        ay=ay,
    )


annotations = []
for dim_info in dim_dict.values():
    print(dim_info.name)
    # tags[0] labels the positive end of the axis, tags[1] the negative end.
    annotations.append(_pole_annotation(x, dim_info.name, dim_info.tags[0], ay=-30))
    annotations.append(_pole_annotation(-x, dim_info.name, dim_info.tags[1], ay=-50))

args = {
    "font":{"size": 23},
    "height": 700,
    "width": 1000,
    "template":"simple_white",
    "yaxis_type": 'category',
    "yaxis":{"showline": False},
    "annotations":annotations,
    "legend": {"orientation":"h",
               "yanchor":"bottom",
               "y":1.07,
               "xanchor":"right",
               "x":1}
}

fig.update_layout(**args)
# The handle is deliberately not called `f`: that name is the notebook's alias
# for pyspark.sql.functions and must not be rebound to a closed file.
with open("visualizations/political_dimensions/dimension_validation.pdf", "wb") as pdf_file:
    pdf_file.write(scope.transform(fig, format="pdf"))
fig.write_html("visualizations/political_dimensions/dimension_validation.html")
fig.show()


Ideology
Religiosity
Birth Control
Gun Control
Age
Trade
War
Gender


In [57]:
# Mean projection per (dimension, alignment) group, one row per dimension and
# one column per alignment value.
# The alignment column is "alignment" as loaded from the CSV and "Alignment"
# once the plotting cell has renamed it, so accept either spelling instead of
# mixing the two (which raises KeyError).
alignment_col = "Alignment" if "Alignment" in counts_df.columns else "alignment"

# Select `projection` before aggregating: the frame also holds string columns
# (subreddit, label, ...) which cannot be averaged.
summary = (
    counts_df
    .groupby(["dim", alignment_col])["projection"]
    .mean()
    .unstack(alignment_col)
)
summary.loc["Mean"] = summary.mean()


def format_x(x):
    """Format a number with three decimal places for the LaTeX table."""
    return f"{x:.3f}"


# One formatter per column, however many alignment categories are present.
print(summary.to_latex(caption="Dimension Validation Against Subreddits with Representative Words.",
                       label="table:dimension_validation",
                       formatters=[format_x] * len(summary.columns)
                       ))


\begin{table}
\centering
\caption{Dimension Validation Against Subreddits with Representative Words.}
\label{table:dimension_validation}
\begin{tabular}{lrrr}
\toprule
alignment & Aligned &     NA & Opposing \\
dim           &         &        &          \\
\midrule
Age           &   0.255 &  0.029 &   -0.211 \\
Birth Control &   0.380 &  0.068 &   -0.150 \\
Gender        &   0.124 & -0.151 &   -0.305 \\
Gun Control   &   0.125 & -0.098 &   -0.485 \\
Ideology      &   0.295 &  0.015 &   -0.325 \\
Religiosity   &   0.199 &  0.064 &   -0.186 \\
Trade         &   0.058 & -0.189 &   -0.200 \\
War           &   0.380 &  0.003 &   -0.391 \\
Mean          &   0.227 & -0.032 &   -0.282 \\
\bottomrule
\end{tabular}
\end{table}

