In [1]:
import pandas as pd
import matplotlib.pyplot as plt

testpath = "test-data/GloWBe-and-NOW-corpus-sample.json"


from utils import read_prevert, count_variants, get_lexicon, counts_to_category

# Lexicon handed to count_variants for every document.
lex = get_lexicon()

df = pd.read_json(testpath)

# Run the detector ONCE per document and reuse the result for both columns
# (previously count_variants was called twice per row, doubling the work).
# Element 0 of the result is stored as the per-variant counts, element 1 as
# the per-word breakdown.
detections = [count_variants(text, lex) for text in df.text]
df["variant_detector_count"] = [detection[0] for detection in detections]
df["variant_detector_breakdown"] = [detection[1] for detection in detections]

# Document length in whitespace-separated tokens.
df["words"] = df.text.apply(lambda t: len(t.split()))

# Count stored under "A" minus count stored under "B"; a missing key counts as 0.
df["A_B"] = df.variant_detector_count.apply(lambda d: d.get("A", 0) - d.get("B", 0))

# Length-normalised difference.
# NOTE(review): a document with zero tokens would give NaN/inf here — confirm
# that such rows cannot occur or filter them before using this column.
df["A_B_normalized"] = df.A_B / df.words

# Category label derived from the counts by utils.counts_to_category
# (the recorded outputs of this notebook show the labels "A", "B" and "MIX").
df["variant"] = df.variant_detector_count.apply(counts_to_category)

In [2]:
# Show long cell contents (texts, breakdown dicts) with less truncation.
# The full option name is used; abbreviated names can become ambiguous.
pd.set_option('display.max_colwidth', 400)


def _expand_breakdown_words(breakdowns):
    """Flatten detector breakdowns into a list of words.

    Parameters
    ----------
    breakdowns : iterable of dict
        One mapping per document, ``word -> {"count": n, ...}``.

    Returns
    -------
    list
        Every word repeated ``count`` times, in document order and then in
        the mapping's own iteration order.
    """
    words = []
    for breakdown in breakdowns:
        for word, info in breakdown.items():
            words.extend([word] * info["count"])
    return words


# Row masks for the two "mismatch" groups: documents labelled variant "A"
# that come from GB, and documents labelled variant "B" that come from US.
c1 = (df.variant == "A") & (df.country == "GB")
c2 = (df.variant == "B") & (df.country == "US")

# Detected words (with multiplicity) in each mismatch group.
all_A_GB = _expand_breakdown_words(df.loc[c1, "variant_detector_breakdown"])
all_B_US = _expand_breakdown_words(df.loc[c2, "variant_detector_breakdown"])

# Documents with a non-empty detector breakdown; later cells read `subset`.
subset = df.loc[
    df.variant_detector_breakdown != {},
    ["country", "variant", "variant_detector_breakdown", "text"],
]

# Fixed seed so the displayed sample is the same on every run.
subset.sample(5, random_state=0)

Unnamed: 0,country,variant,variant_detector_breakdown,text
954,GB,B,"{'colours': {'variant': 'B', 'count': 1}, 'autumn': {'variant': 'B', 'count': 1}, 'installment': {'variant': 'A', 'count': 1}, 'film': {'variant': 'B', 'count': 2}}","<p> Returning in 2012 with his fourth artist album , "" The Agony &; The Ecstasy "" , High Contrast is set to reinstate his reputation at the top table with this superb twelve-track long player . <p> High Contrast himself describes The Agony &; The Ecstasy as more personal than any other album he s made , so prepare to be moved with some heartfelt melodies as well as the uplifting anthems which ..."
384,US,A,"{'maneuver': {'variant': 'A', 'count': 1}, 'traveled': {'variant': 'A', 'count': 2}, 'kilometers': {'variant': 'A', 'count': 1}, 'defense': {'variant': 'A', 'count': 1}}","<h> Ask Congress <p> A forthcoming report by the congressional U.S.-China Economic and Security Review Commission provides new details of China 's space-weapons programs , dubbed counterspace arms , that are aimed at destroying or jamming U.S. satellites and limiting American combat operations around the world . <p> Two direct-ascent missiles capable of hitting satellites in both lower and hig..."
466,US,A,"{'center': {'variant': 'A', 'count': 1}, 'behavioral': {'variant': 'A', 'count': 2}}","<h> Feds pay for study of gay men 's penis sizes <p> Kajsa Westman , second from right , of Stockholm , Sweden , and Victor Ng , right , of Seattle , tie balloons to a banner in preparation for the annual Gay Pride parade , Sunday , June 26 , 2011 in New York . One of the world 's oldest and largest gay pride parades was expected to become a victory celebration Sunday after New York 's histori..."
1337,US,A,"{'judgment': {'variant': 'A', 'count': 6}, 'judgments': {'variant': 'A', 'count': 4}, 'truck': {'variant': 'A', 'count': 1}, 'mitre': {'variant': 'B', 'count': 1}, 'favorite': {'variant': 'A', 'count': 1}, 'honourable': {'variant': 'B', 'count': 3}, 'honour': {'variant': 'B', 'count': 3}, 'honorable': {'variant': 'A', 'count': 1}, 'honor': {'variant': 'A', 'count': 2}, 'movie': {'variant': 'A'...","<h> How to Keep the Sabbath Day Holy <p> Sabbath-keeping reflects our relationship with God . How can we recapture positive principles and grow in this area ? <p> Exodus 19 and 20 talk about how God , when He gave the Ten Commandments , caused the earth to tremble and shake -- the manifestations were absolutely awesome . God said He came to test them "" that His fear may be before you , so that..."
883,GB,B,"{'labour': {'variant': 'B', 'count': 7}, 'analysed': {'variant': 'B', 'count': 1}, 'sizeable': {'variant': 'B', 'count': 1}, 'jeopardising': {'variant': 'B', 'count': 1}, 'decentralise': {'variant': 'B', 'count': 1}, 'decentralised': {'variant': 'B', 'count': 1}, 'favour': {'variant': 'B', 'count': 1}, 'shop': {'variant': 'B', 'count': 1}, 'modelling': {'variant': 'B', 'count': 1}, 'labelled':...","<p> A bad day for the Lib Dems , but not unexpectedly so . Call it sanguine , call it resigned ... <h> Lib Dems Corby pressed <p> The party expected to get squeezed in Corby , and we were . I suspect we lost some ' none of the above ' voters to Ukip and some left-leaning liberals to Labour ( and many others who just did n't vote ) . To forfeit our deposit by barely more than a dozen votes adde..."


In [3]:
# Number of rows of `subset` with a non-null text per (country, variant) pair,
# printed as a Markdown table so it can be pasted into a report.
docs_per_group = subset.groupby(["country", "variant"])["text"].count()
print(docs_per_group.to_markdown())

|               |   text |
|:--------------|-------:|
| ('GB', 'A')   |     51 |
| ('GB', 'B')   |    471 |
| ('GB', 'MIX') |     78 |
| ('US', 'A')   |    349 |
| ('US', 'B')   |     78 |
| ('US', 'MIX') |    136 |


In [6]:
from collections import Counter
# Ten most frequent detected words in GB documents labelled variant "A".
all_A_GB_counts = Counter(all_A_GB)
all_A_GB_counts.most_common(10)

[('gas', 59),
 ('hood', 31),
 ('movie', 23),
 ('store', 19),
 ('center', 15),
 ('attorney', 9),
 ('centers', 7),
 ('trailer', 7),
 ('favorite', 6),
 ('subway', 5)]

In [7]:
from collections import Counter
# Ten most frequent detected words in US documents labelled variant "B".
all_B_US_counts = Counter(all_B_US)
all_B_US_counts.most_common(10)

[('car', 33),
 ('film', 19),
 ('shop', 14),
 ('holiday', 11),
 ('mobile', 11),
 ('films', 10),
 ('grey', 9),
 ('centre', 6),
 ('organisation', 6),
 ('behaviour', 6)]