In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Corpus sample to analyse; the path is relative to the notebook's working directory.
testpath = "test-data/GloWBe-and-NOW-corpus-sample.json"


from utils import read_prevert, count_variants, get_lexicon, counts_to_category

lex = get_lexicon()

df = pd.read_json(testpath)

# Run the detector once per document and split its result afterwards, instead
# of calling it twice per row to pick element 0 and then element 1.
# NOTE(review): assumes count_variants is deterministic and side-effect free,
# so a single call per text is equivalent to the former two calls -- confirm.
detector_output = df.text.apply(lambda s: count_variants(s, lex))
df["variant_detector_count"] = detector_output.map(lambda result: result[0])
df["variant_detector_breakdown"] = detector_output.map(lambda result: result[1])

# Whitespace-token count of each text.
# NOTE(review): the loaded frame appears to already carry a "words" column
# (it sits between "id" and "country" in df.columns); this overwrites it.
df["words"] = df.text.apply(lambda t: len(t.split()))

# Signed difference between the "A" and "B" counts (a missing key counts as 0),
# and the same difference scaled by the document's token count.
df["A_B"] = df.variant_detector_count.apply(lambda d: d.get("A", 0) - d.get("B", 0))
df["A_B_normalized"] = df.A_B / df.words

# Per-document category label derived from the count dict.
df["variant"] = df.variant_detector_count.apply(counts_to_category)

In [3]:
# Peek at the per-document count dicts produced by the detector.
df["variant_detector_count"]

0         {'B': 3, 'A': 1}
1                 {'A': 3}
2         {'B': 8, 'A': 3}
3       {'A': 18, 'B': 10}
4         {'A': 1, 'B': 2}
               ...        
1440                    {}
1441                    {}
1442              {'A': 1}
1443      {'A': 4, 'B': 1}
1444              {'A': 6}
Name: variant_detector_count, Length: 1445, dtype: object

In [2]:
# List the frame's columns to check that the derived columns
# (variant_detector_count, variant_detector_breakdown, A_B, A_B_normalized,
# variant) sit alongside the ones loaded from the JSON file.
df.columns

Index(['id', 'words', 'country', 'website', 'title', 'text', 'lower_text',
       'corpus', 'variant_detector_count', 'variant_detector_breakdown', 'A_B',
       'A_B_normalized', 'variant'],
      dtype='object')

In [14]:
# Widen text columns so the breakdown dicts and the start of each document stay
# readable. The full option key is used rather than the bare 'max_colwidth'
# pattern, which only works while it matches exactly one option.
pd.set_option("display.max_colwidth", 400)

# Columns shown when inspecting individual documents.
inspect_columns = ["country", "variant", "variant_detector_breakdown", "text"]


def collect_breakdown_words(breakdowns):
    """Flatten per-document breakdown dicts into one list of words.

    Parameters
    ----------
    breakdowns : iterable of dict
        Each dict maps a detected word to an inner dict that has a "count" key.

    Returns
    -------
    list
        Every word repeated "count" times, in iteration order. Words are not
        filtered by the inner "variant" label, so the list holds every detected
        word of the given documents.
    """
    words = []
    for breakdown in breakdowns:
        for word, info in breakdown.items():
            words.extend([word] * info["count"])
    return words


# Documents whose detected variant is "A" although the country is GB, and
# documents whose detected variant is "B" although the country is US.
c1 = (df.variant == "A") & (df.country == "GB")
c2 = (df.variant == "B") & (df.country == "US")

all_A_GB = collect_breakdown_words(df.loc[c1, "variant_detector_breakdown"])
all_B_US = collect_breakdown_words(df.loc[c2, "variant_detector_breakdown"])

# Keep only documents where the detector found at least one word. The test is
# applied per element instead of comparing the whole Series against a dict.
has_detections = df.variant_detector_breakdown.map(lambda breakdown: breakdown != {})
subset = df.loc[has_detections, inspect_columns]

# Fixed seed so the displayed rows are the same on every run; the size is
# capped so that a frame with fewer than five rows does not raise.
subset.sample(n=min(5, len(subset)), random_state=0)

Unnamed: 0,country,variant,variant_detector_breakdown,text
1432,US,MIX,"{'favor': {'variant': 'A', 'count': 1}, 'film': {'variant': 'B', 'count': 1}}","<p> All of this and nothing Kerry Tribe . Installation view at the Hammer Museum , January 30-April 24 , 2011 . Photo by Brian Forrest . <p> All of this and nothing Fernando Ortega . Installation view at the Hammer Museum , January 30-April 24 , 2011 . Photo by Brian Forrest . <p> All of this and nothing is the sixth in the Hammer Museum 's biennial invitational exhibition series , which highl..."
916,GB,B,"{'oedema': {'variant': 'B', 'count': 8}, 'hospitalisation': {'variant': 'B', 'count': 1}, 'litre': {'variant': 'B', 'count': 1}, 'uraemia': {'variant': 'B', 'count': 1}, 'amongst': {'variant': 'B', 'count': 1}}","<h> Diuretics <p> Diuretics increase urine excretion and are commonly called "" water tablets "" . In general , they inhibit electrolyte reabsorption from the lumen of the nephron , increasing osmolarity and enhancing water excretion . <p> Diuretics have different clinical uses depending on their sites and mechanisms of action . The sub-classes of diuretics : <p> Thiazides ( bendroflumethiazide ..."
1013,GB,B,"{'centre': {'variant': 'B', 'count': 2}, 'fertiliser': {'variant': 'B', 'count': 2}, 'liberalisation': {'variant': 'B', 'count': 2}, 'mechanised': {'variant': 'B', 'count': 1}, 'favour': {'variant': 'B', 'count': 1}, 'organisation': {'variant': 'B', 'count': 1}, 'organise': {'variant': 'B', 'count': 1}, 'meagre': {'variant': 'B', 'count': 1}, 'equalling': {'variant': 'B', 'count': 1}, 'corn': ...","<p> The road from Cairo to Alexandria is lined with mega-farms growing strawberries , mangos and citrus fruit bound for foreign shores . In what was desert just decades ago , luxury produce now sprouts from the ground . <p> The greening of this arid landscape was part of ambitious government plans to sustainably increase Egypt ' s cultivated territory by turning swaths of sand into fertile ear..."
886,GB,B,"{'car': {'variant': 'B', 'count': 1}}","<p> Why am I listening to this ? He 's no Garvey , Malcolm . SMH To you youngers , DO NOT try to be like these guys . They spend most their lives living out teen fantasies and looking for the attention of every . <p> theory tests 100 and practicals 350 this is a process of getting your license payments taken after test is done and everyone is happy , many have been done and proof is available ..."
329,GB,B,"{'defence': {'variant': 'B', 'count': 1}, 'angry': {'variant': 'B', 'count': 1}, 'attorney': {'variant': 'A', 'count': 1}, 'organisation': {'variant': 'B', 'count': 1}}","<p> On Thursday , the same court sent him to Rawalpindi 's Adiala jail for a further two weeks , the defence lawyer Rizwan Abbasi told AFP . <p> The Mumbai attacks left 166 people dead and were blamed on the banned Pakistani militant group Lashkar-e-Taiba ( LeT ) . Pakistan 's failure either to hand over or prosecute those accused over the attack has angered India and damaged relations between..."


In [9]:
# Number of non-null texts per (country, variant) pair, printed as a markdown table.
texts_per_group = subset.groupby(["country", "variant"])["text"].count()
print(texts_per_group.to_markdown())

|               |   text |
|:--------------|-------:|
| ('GB', 'A')   |     51 |
| ('GB', 'B')   |    471 |
| ('GB', 'MIX') |     78 |
| ('US', 'A')   |    349 |
| ('US', 'B')   |     78 |
| ('US', 'MIX') |    136 |


In [13]:
from collections import Counter

# Twenty most frequent detected words in the GB documents labelled "A".
# all_A_GB holds every detected word of those documents, whatever the word's
# own variant label is.
word_freq_A_GB = Counter(all_A_GB)
word_freq_A_GB.most_common(20)

[('gas', 59),
 ('hood', 31),
 ('movie', 23),
 ('store', 19),
 ('center', 15),
 ('attorney', 9),
 ('centers', 7),
 ('trailer', 7),
 ('favorite', 6),
 ('subway', 5),
 ('armour', 5),
 ('diaper', 5),
 ('film', 4),
 ('stores', 4),
 ('theater', 3),
 ('theatre', 3),
 ('crazy', 3),
 ('car', 3),
 ('defense', 3),
 ('colors', 3)]

In [16]:
from collections import Counter

# Twenty most frequent detected words in the US documents labelled "B".
# all_B_US holds every detected word of those documents, whatever the word's
# own variant label is.
word_freq_B_US = Counter(all_B_US)
word_freq_B_US.most_common(20)

[('car', 33),
 ('film', 19),
 ('shop', 14),
 ('holiday', 11),
 ('mobile', 11),
 ('films', 10),
 ('grey', 9),
 ('centre', 6),
 ('organisation', 6),
 ('behaviour', 6),
 ('shops', 6),
 ('travelled', 6),
 ('somewhere', 5),
 ('colour', 5),
 ('movie', 4),
 ('travellers', 4),
 ('organisations', 4),
 ('parlour', 4),
 ('cafe', 3),
 ('amongst', 3)]