In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from utils import read_prevert, count_variants, get_lexicon, counts_to_category

# Corpus sample to analyse; the path is relative to the notebook's working directory.
testpath = "test-data/GloWBe-and-NOW-corpus-sample.json"

# Lexicon handed to count_variants for every document.
lex = get_lexicon()

df = pd.read_json(testpath)

# Run the detector once per document and reuse the result for both columns
# (previously it was invoked twice per text, once for each tuple element).
# NOTE(review): assumes count_variants gives the same result for the same
# (text, lexicon) pair on repeated calls - confirm against utils.
detector_results = df.text.apply(lambda s: count_variants(s, lex))
df["variant_detector_count"] = detector_results.apply(lambda result: result[0])
df["variant_detector_breakdown"] = detector_results.apply(lambda result: result[1])

# Whitespace-delimited token count of each text.
df["words"] = df.text.apply(lambda t: len(t.split()))

# Signed difference between the "A" and "B" counts; a missing key counts as 0.
df["A_B"] = df.variant_detector_count.apply(lambda d: d.get("A", 0) - d.get("B", 0))

# Difference scaled by document length. A text with zero tokens produces
# NaN/inf here instead of raising.
df["A_B_normalized"] = df.A_B / df.words

# Collapse each count mapping into a single category label.
df["variant"] = df.variant_detector_count.apply(counts_to_category)

In [2]:
# Use the fully qualified option name; the bare 'max_colwidth' only works
# through pandas' partial-name matching of option keys.
pd.set_option('display.max_colwidth', 400)


def _expand_breakdown(breakdowns):
    """Flatten detector breakdowns into a list of words.

    Each element of ``breakdowns`` is a mapping ``word -> {"count": n, ...}``;
    every word is emitted ``n`` times, in iteration order.
    """
    return [
        word
        for breakdown in breakdowns
        for word, info in breakdown.items()
        for _ in range(info["count"])
    ]


display_columns = ["country", "variant", "variant_detector_breakdown", "text"]

# Documents whose detected variant is "A" although the country is GB, and
# documents whose detected variant is "B" although the country is US.
c1 = (df.variant == "A") & (df.country == "GB")
c2 = (df.variant == "B") & (df.country == "US")

all_A_GB = _expand_breakdown(df.loc[c1, "variant_detector_breakdown"])
all_B_US = _expand_breakdown(df.loc[c2, "variant_detector_breakdown"])

# Keep only documents in which the detector found at least one lexicon word.
subset = df.loc[df.variant_detector_breakdown != {}, display_columns]

# Fixed seed so the displayed sample is the same on every run.
subset.sample(5, random_state=0)

Unnamed: 0,country,variant,variant_detector_breakdown,text
740,GB,B,"{'labour': {'variant': 'B', 'count': 2}, 'minimise': {'variant': 'B', 'count': 1}}","<p> The billionaire hoping to become Britain 's biggest fracker has said banning shale gas would cement the decline of UK manufacturing , as he brushed off environmental concerns about the hotly disputed energy source . <p> Asked about the impact fracking could have , Ratcliffe said : "" I 'm from the north and there are parts of the north that are not happy places . "" He added that some towns ..."
1115,US,A,"{'revelers': {'variant': 'A', 'count': 1}, 'favorite': {'variant': 'A', 'count': 1}}","<h> The Lone Writer <h> A Bit of Seasonal Promotion <p> Halloween is approaching , and I shudder to think of any reader being without a fistful of eerie stories to see them through that creepiest of nights . I want to do my humble bit to provide a shiver or two , and so I offer this promotion of my book , Wilderness : A Collection of Dark Tales , to encourage those in need to fortify themselve..."
357,GB,B,"{'organisation': {'variant': 'B', 'count': 4}, 'personalised': {'variant': 'B', 'count': 1}}",<h> Tech City UK was given the power to endorse up to 200 overseas individuals each year with the Exceptional Talent Visa but figures obtained by Techworld reveal that it 's endorsed just seven since last April . <p> Sam Shead joined Techworld as a reporter in July 2013 . He studied Geography with Science Communication at Royal Holloway University before completing a postgraduate diploma in jo...
662,GB,B,"{'publicise': {'variant': 'B', 'count': 1}}","<h> Sir Cliff Richard is to face no further action following the South Yorkshire Police investigation into allegations of historical sexual abuse , it has been announced . <p> * August 14 - Sir Cliff 's property in Sunningdale , Berkshire , is raided by South Yorkshire Police following an allegation of a sex crime involving a young boy in the 1980s . The police 's handling of the raid attracts..."
222,US,A,"{'specialty': {'variant': 'A', 'count': 2}, 'honor': {'variant': 'A', 'count': 1}, 'specialties': {'variant': 'A', 'count': 2}}","<h> The Leading Physicians of the World and International Association of Chiropractors Will Place Their Top Doctors in this Year 's Edition <h> Share Article <h> Doctors from the International Association of Chiropractors will be featured in this year 's edition of The Leading Physicians of the World . With a long track record of selecting the most esteemed , and up and coming physicians from ..."


In [7]:
# All rows again, restricted to the columns of interest.
subset = df.loc[:, ["country", "variant", "variant_detector_breakdown", "text"]]

# Number of non-null texts per (country, variant) pair, printed as a Markdown table.
text_counts = subset.groupby(["country", "variant"])["text"].count()
print(text_counts.to_markdown())

|               |   text |
|:--------------|-------:|
| ('GB', 'A')   |     50 |
| ('GB', 'B')   |    459 |
| ('GB', 'MIX') |     32 |
| ('GB', 'UNK') |    183 |
| ('US', 'A')   |    404 |
| ('US', 'B')   |     43 |
| ('US', 'MIX') |     25 |
| ('US', 'UNK') |    249 |


In [4]:
from collections import Counter
# Ten most frequent detected words across the GB documents categorised as variant "A".
a_gb_word_counts = Counter(all_A_GB)
a_gb_word_counts.most_common(10)

[('movie', 38),
 ('center', 28),
 ('favorite', 9),
 ('centers', 8),
 ('movies', 6),
 ('diaper', 5),
 ('harbor', 5),
 ('colors', 4),
 ('theater', 3),
 ('theatre', 3)]

In [5]:
from collections import Counter
# Ten most frequent detected words across the US documents categorised as variant "B".
b_us_word_counts = Counter(all_B_US)
b_us_word_counts.most_common(10)

[('grey', 9),
 ('centre', 7),
 ('travelled', 7),
 ('organisation', 6),
 ('behaviour', 6),
 ('amongst', 5),
 ('colour', 5),
 ('travellers', 4),
 ('organisations', 4),
 ('travelling', 4)]

In [9]:
# Spot-check a single lexicon entry: display the value stored under "labelled".
lex["labelled"]

'B'