In [38]:
import ast
import json
import pandas as pd

In [39]:
df_category = pd.read_csv("../data/df_category_sim.csv").drop(columns=['Unnamed: 0']).drop_duplicates(subset='Title')
df_links = pd.read_csv("../data/df_links_sim.csv").drop(columns=['Unnamed: 0']).drop_duplicates(subset='Title')
df_lda = pd.read_csv("../data/df_lda_sims.csv").drop(columns=['Unnamed: 0']).drop_duplicates(subset='Title')

In [40]:
df_category.head()

Unnamed: 0,Title,Path,Jaccard
0,"2,5-Dimethylfuran","['Global_warming', 'Climate change', 'Politics...","[1.0, 1.0, 1.0, 0.7142857142857143, 0.14285714..."
1,2-Methylfuran,"['Global_warming', 'Climate change', 'Politics...","[1.0, 1.0, 1.0, 0.7142857142857143, 0.14285714..."
2,2007–08 world food price crisis,"['Global_warming', 'Climate change', 'Politics...","[1.0, 1.0, 1.0, 0.7142857142857143, 0.14285714..."
3,ASEAN Wildlife Enforcement Network,"['Global_warming', 'Climate change', 'Politics...","[0.7142857142857143, 0.7142857142857143, 0.714..."
4,Accuracy in Media,"['Global_warming', 'Climate change', 'Climate ...","[0.14285714285714285, 0.14285714285714285, 0.1..."


In [41]:
df_links.head()

Unnamed: 0,Title,Links,Jaccard
0,"2,5-Dimethylfuran","['2,5-Dimethylfuran (data page)', 'Aqueous sol...","[1.0, 0.24719101123595505, 0.01578947368421052..."
1,2-Methylfuran,"['2-Methylfuran (data page)', 'Alternative fue...","[0.24719101123595505, 1.0, 0.00295857988165680..."
2,2007–08 world food price crisis,"['2000s commodities boom', '2000s energy crisi...","[0.015789473684210527, 0.0029585798816568047, ..."
3,ASEAN Wildlife Enforcement Network,"['ASEAN', 'ASEAN Center for Biodiversity', 'As...","[0.0, 0.0, 0.007894736842105263, 1.0, 0.014150..."
4,Accuracy in Media,"['2012 Benghazi attack', 'Advocacy journalism'...","[0.009259259259259259, 0.0058823529411764705, ..."


In [42]:
df_lda.head()

Unnamed: 0,Title,Text_processed,Dominant Topic,Sims
0,"2,5-Dimethylfuran","2,5-dimethylfuran is a heterocyclic compound w...",40,"[1.0000000000000002, 0.6208810150204145, 0.029..."
1,2-Methylfuran,"2-methylfuran, also known with the older name ...",20,"[0.6208810150204145, 1.0000000000000024, 0.001..."
2,2007–08 world food price crisis,world food prices increased dramatically in 20...,39,"[0.02914683177495991, 0.0016481418417907804, 0..."
3,ASEAN Wildlife Enforcement Network,the asean wildlife enforcement network (asean-...,3,"[0.00047928602761282814, 0.002529903057432023,..."
4,Accuracy in Media,accuracy in media (aim) is an american non-pro...,21,"[0.0005204638786978396, 0.0024441338682554127,..."


In [43]:
df_category['Jaccard'] = df_category['Jaccard'].apply(lambda x: ast.literal_eval(x))
df_links['Jaccard'] = df_links['Jaccard'].apply(lambda x: ast.literal_eval(x))
df_lda['Sims'] = df_lda['Sims'].apply(lambda x: ast.literal_eval(x))

In [44]:
data_stories = ['Methane', 'Global methane initiative', 'Methane emissions', 'Fugitive gas emissions',
                'Hydraulic fracturing', 'Environmental impact of hydraulic fracturing', 'Renewable natural gas', 
                'Biogas', 'Biofuel', 'Climate change denial', 'Fossil fuels lobby', 'Politics of global warming', 
                'Wood gas', 'Afforestation', 'Reforestation', 'Reducing emissions from deforestation and forest degradation',
                'Pollution prevention', 'Politics of global warming', 'Green economy', 'Environmental economics', 'Charcoal', 
                'Groundwater recharge', 'Green New Deal']

In [45]:
causes = ['Methane emissions', 'Hydraulic fracturing', 'Media coverage of global warming', 'Fossil fuels lobby']
consequences = ['Environmental impact of hydraulic fracturing', 'Environmental impact of hydraulic fracturing',
                'Climate change denial', 'Climate change denial']
problems = ['Methane emissions', 'Methane emissions', 'Methane emissions', 'Climate change denial']
solutions = ['Renewable natural gas', 'Biogas', 'Biofuel', 'Scientific consensus on climate change']

In [46]:
cc_relations = [(causes[i], consequences[i]) for i in range(len(causes))]
# cc_relations.extend([(consequences[i], causes[i]) for i in range(len(causes))])
ps_relations = [(problems[i], solutions[i]) for i in range(len(problems))]
# ps_relations.extend([(problems[i], solutions[i]) for i in range(len(problems))])

In [19]:
data = {'nodes': [], 'links': []}

In [20]:
for i in range(len(df_lda)):
    title = df_lda['Title'].iloc[i]
    data['nodes'].append({'id': str(i), 'label': title})
    
    lda = df_lda['Sims'].iloc[i]
    cat = df_category['Jaccard'].iloc[i]
    links = df_links['Jaccard'].iloc[i]
    
    for j in range(len(df_lda)):
        if i != j and j > i:
            title2 = df_lda['Title'].iloc[j]
            
            if (title, title2) in cc_relations or (title2, title) in cc_relations:
                link_type = 'cc'
            elif (title, title2) in ps_relations or (title2, title) in ps_relations:
                link_type = 'ps'
            else:
                link_type = 'na'
            
            data['links'].append({
                'source': str(i),
                'target': str(j),
                'type': link_type,
                'w_lda': lda[j],
                'w_cat': cat[j],
                'w_links': links[j]
            })

In [28]:
with open("../data/data_viz.json", 'w+', encoding='utf-8') as f:
    json.dump(data, f, indent='\t')

## Reduced dataset

In [63]:
cc_ps_set = set(causes + consequences + problems + solutions)

In [64]:
to_keep = df_category[df_category['Title'].isin(cc_ps_set)].drop_duplicates(subset=['Title'])

In [65]:
sampled = df_category.sample(90, random_state=10)

In [66]:
sampled_category = pd.concat([to_keep, sampled]).sort_index()

In [67]:
sampled_category.head(2)

Unnamed: 0,Title,Path,Jaccard
1,2-Methylfuran,"['Global_warming', 'Climate change', 'Politics...","[1.0, 1.0, 1.0, 0.7142857142857143, 0.14285714..."
16,Alex Epstein (American writer),"['Global_warming', 'Climate change', 'Climate ...","[0.14285714285714285, 0.14285714285714285, 0.1..."


In [68]:
indexes_to_keep = list(sampled_category.index)

In [69]:
sampled_lda = df_lda[df_lda.index.isin(indexes_to_keep)]
sampled_links = df_links[df_links.index.isin(indexes_to_keep)]

In [70]:
len(sampled_category)

100

In [71]:
len(sampled_links)

100

In [72]:
sampled_lda['Sims'] = sampled_lda['Sims'].apply(lambda l: [item for i, item in enumerate(l) if i in indexes_to_keep])
sampled_category['Jaccard'] = sampled_category['Jaccard'].apply(lambda l: [item for i, item in enumerate(l) if i in indexes_to_keep])
sampled_links['Jaccard'] = sampled_links['Jaccard'].apply(lambda l: [item for i, item in enumerate(l) if i in indexes_to_keep])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [73]:
sampled_lda = sampled_lda.reset_index()
sampled_category = sampled_category.reset_index()
sampled_links = sampled_links.reset_index()

In [74]:
sampled_lda.head(2)

Unnamed: 0,index,Title,Text_processed,Dominant Topic,Sims
0,1,2-Methylfuran,"2-methylfuran, also known with the older name ...",20,"[1.0000000000000024, 0.001803452618977931, 0.0..."
1,16,Alex Epstein (American writer),alexander joseph epstein () is an american aut...,21,"[0.001803452618977931, 1.0000000000000027, 0.3..."


In [75]:
sampled_links.head(2)

Unnamed: 0,index,Title,Links,Jaccard
0,1,2-Methylfuran,"['2-Methylfuran (data page)', 'Alternative fue...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.006944444444444444..."
1,16,Alex Epstein (American writer),"['Alex Epstein (disambiguation)', 'Atlas Shrug...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.012345679012345678..."


In [76]:
data_reduced = {'nodes': [], 'links': []}

In [77]:
for i in range(len(sampled_lda)):
    title = sampled_lda['Title'].iloc[i]
    data_reduced['nodes'].append({'id': str(i), 'label': title})
    
    lda = sampled_lda['Sims'].iloc[i]
    cat = sampled_category['Jaccard'].iloc[i]
    links = sampled_links['Jaccard'].iloc[i]
    
    for j in range(len(sampled_lda)):
        if i != j and j > i:
            title2 = sampled_lda['Title'].iloc[j]
            
            if (title, title2) in cc_relations or (title2, title) in cc_relations:
                link_type = 'cc'
            elif (title, title2) in ps_relations or (title2, title) in ps_relations:
                link_type = 'ps'
            else:
                link_type = 'na'
            
            data_reduced['links'].append({
                        'source': str(i),
                        'target': str(j),
                        'type': link_type,
                        'w_lda': lda[j],
                        'w_cat': cat[j],
                        'w_links': links[j]
                    })

In [78]:
data_reduced['nodes']

[{'id': '0', 'label': '2-Methylfuran'},
 {'id': '1', 'label': 'Alex Epstein (American writer)'},
 {'id': '2', 'label': 'American Farm Bureau Federation'},
 {'id': '3', 'label': 'An Inconvenient Truth...Or Convenient Fiction?'},
 {'id': '4', 'label': 'AquaSalina'},
 {'id': '5', 'label': 'Association of American Physicians and Surgeons'},
 {'id': '6', 'label': 'Ball clay'},
 {'id': '7', 'label': 'Beneficio'},
 {'id': '8', 'label': 'Biofuel'},
 {'id': '9', 'label': 'Biogas'},
 {'id': '10', 'label': 'Biohydrogen'},
 {'id': '11', 'label': 'CASBEE'},
 {'id': '12', 'label': 'CITES'},
 {'id': '13', 'label': 'Climate change denial'},
 {'id': '14', 'label': 'Climate change denialism'},
 {'id': '15', 'label': 'Compost'},
 {'id': '16', 'label': 'DPSIR'},
 {'id': '17', 'label': 'Dancing Rabbit Ecovillage'},
 {'id': '18', 'label': 'Darrell Issa'},
 {'id': '19', 'label': 'David Deming'},
 {'id': '20', 'label': 'EU Project Renew'},
 {'id': '21', 'label': 'Earth systems engineering and management'},
 {

In [79]:
len(data_reduced['links'])

4950

## Reduced panel data

In [80]:
import json
import wikipediaapi
import pandas as pd

wiki = wikipediaapi.Wikipedia('en')

In [81]:
titles_to_keep = sampled_lda.Title.tolist()

In [82]:
df = pd.read_pickle("../data/df_subset.pkl").sort_values(by='Title')
df = df[df.Title.isin(titles_to_keep)].reset_index().drop_duplicates(subset='Title')
df_lda_topics = pd.read_csv("../data/df_lda_topics.csv").drop(columns="Unnamed: 0")

In [83]:
len(df)

100

In [84]:
data_panel = []
for i in range(len(df)):
    title = df['Title'].iloc[i]
    page = wiki.page(title=title)
    link = page.fullurl
    topic = sampled_lda['Dominant Topic'].iloc[i]
    keywords = df_lda_topics.iloc[int(topic)].tolist()
    category = df['Category'].iloc[i]
    summary = page.summary
    data_panel.append({
        'title': title,
        'link': link,
        'topic': int(topic),
        'topic_keywords': keywords,
        'category': category,
        'summary': summary,
        'relations': {
            'causes': [],
            'consequences': [],
            'solutions': [],
            'problems': []
        }
    })

In [85]:
data_panel

[{'title': '2-Methylfuran',
  'link': 'https://en.wikipedia.org/wiki/2-Methylfuran',
  'topic': 20,
  'topic_keywords': ['use',
   'product',
   'material',
   'form',
   'make',
   'include',
   'water',
   'waste',
   'process',
   'mineral',
   'high',
   'produce',
   'light',
   'type',
   'large'],
  'category': 'Biofuels',
  'summary': '2-Methylfuran, also known with the older name of sylvane, is a flammable, water-insoluble liquid with a chocolate odor, found naturally in Myrtle and Dutch Lavender\nused as a FEMA GRAS flavoring substance, with the potential for use in alternative fuels.',
  'relations': {'causes': [],
   'consequences': [],
   'solutions': [],
   'problems': []}},
 {'title': 'Alex Epstein (American writer)',
  'link': 'https://en.wikipedia.org/wiki/Alex_Epstein_(American_writer)',
  'topic': 21,
  'topic_keywords': ['climate',
   'science',
   'say',
   'scientist',
   'report',
   'public',
   'scientific',
   'policy',
   'include',
   'state',
   'research',

In [88]:
data_reduced['panel'] = data_panel

## Save reduced data

In [89]:
with open("../data/data_viz_reduced.json", 'w+', encoding='utf-8') as f:
    json.dump(data_reduced, f, indent='\t')