In [None]:
import sqlite3
import pandas as pd
import nltk
import itertools
import operator
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from collections import Counter
#%pylab inline

# Initial Data Processing

In [None]:
# Open the SQLite archive that holds the `crec` table read in the next cell.
db_path = '../archives/crec.db'
conn = sqlite3.connect(db_path)

## Move data from database to pandas dataframe

In [None]:
# Load every row of the `crec` table, using the UTC column as the row index.
crec_df = pd.read_sql(
    sql="Select * from crec",
    con=conn,
    index_col='UTC',
)

In [None]:
# Replace newline characters in the text column with single spaces.
flattened_html = crec_df["html_data"].str.replace('\n', ' ')
crec_df["html_data"] = flattened_html

In [None]:
# Re-index on a DatetimeIndex built from the current index labels; later cells
# group rows by the `.year` of each index label.
# To control the date span of the frame, slice the result, e.g. `.loc[:'2016-12-31']`.
utc_index = pd.DatetimeIndex(crec_df.index)
crec_df = crec_df.set_index(utc_index)

## Reduce by keyword

In [None]:
# Keep only the entries that mention either key phrase.
# - IGNORECASE keeps this filter consistent with the case-insensitive counts run
#   on `reduced_df` in later cells; a case-sensitive filter drops entries that
#   only contain e.g. "Climate Change".
# - na=False treats a missing html_data value as "no match" instead of producing
#   a NaN in the mask, which cannot be used for boolean indexing.
keyword_mask = crec_df.html_data.str.contains(
    r'climate\schange|global\swarming', flags=re.IGNORECASE, na=False
)
reduced_df = crec_df[keyword_mask]

In [None]:
# Number of entries that survived the keyword filter.
len(reduced_df)

In [None]:
# Index the keyword-limited frame by a DatetimeIndex built from its current labels.
reduced_index = pd.DatetimeIndex(reduced_df.index)
reduced_df = reduced_df.set_index(reduced_index)

## Tokenize data for natural language analysis

In [None]:
# English stopword list used to filter the bigrams further down.
# The corpus reader is looked up through `nltk.corpus` rather than through the
# imported `stopwords` name: this assignment rebinds `stopwords` to a plain list,
# so the original `stopwords.words(...)` raised AttributeError whenever the cell
# was executed a second time in the same kernel.
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# Plain Python list of the text of every keyword-limited entry.
html_column = reduced_df["html_data"]
raw_reduced = html_column.tolist()

In [None]:
# Tokenize each document on its own and concatenate the token streams.
# Tokenizing str(raw_reduced) would process the *repr* of the Python list, which
# injects the list brackets, the quote/comma separators between entries and
# backslash escape sequences into the token stream as if they were corpus text.
tokens = list(itertools.chain.from_iterable(
    nltk.word_tokenize(document) for document in raw_reduced
))

In [None]:
# Wrap the token list in an nltk.Text so concordance/bigram helpers can be used on it.
tokenized_reduced = nltk.Text(tokens=tokens)

# Descriptive Analysis

In [None]:
# Size of the full dataset loaded from the database.
len(crec_df)  # total number of entries in dataset

In [None]:
# Size of the subset that mentions one of the key phrases.
len(reduced_df)  # total number of entries in keyword-limited dataset

In [None]:
# Length of the token sequence built from the keyword-limited entries.
# NOTE(review): this counts tokenizer output, which presumably includes
# punctuation tokens — confirm before reporting it as a word count.
len(tokenized_reduced)  # total tokens

In [None]:
# Total occurrences of the key phrases across the keyword-limited entries.
# Matches are counted per document instead of on str(raw_reduced): the list repr
# rewrites characters such as tabs or non-breaking spaces as backslash escape
# sequences, which \s can no longer match, so those occurrences were missed.
key_phrase = re.compile(r'(climate\schange|global\swarming)', re.IGNORECASE)
sum(len(key_phrase.findall(document)) for document in raw_reduced)  # total phrases matching query

# Bigrams

### Basic NLTK Descriptives

In [None]:
# Print up to 25 concordance lines showing "climate" in context.
tokenized_reduced.concordance(word="climate", lines=25)

### Bigram-based Descriptives

In [None]:
# Collect the bigrams that contain a key term and no stopword.
# Comparisons are made on lower-cased tokens: NLTK's English stopword list is
# lower-case, so the original case-sensitive test let capitalized stopwords
# ("The", "And") through and skipped capitalized key terms ("Climate").
# The stored bigram keeps the tokens' original casing.
stopword_set = set(stopwords)  # constant-time membership tests inside the loop
key_terms = {'climate', 'warming'}
bigram_list = []
for bg in nltk.bigrams(tokenized_reduced):
    first, second = bg[0].lower(), bg[1].lower()
    if first not in key_terms and second not in key_terms:
        continue
    if first in stopword_set or second in stopword_set:
        continue
    bigram_list.append(bg)

In [None]:
# Part-of-speech tag every retained bigram using the universal tagset.
bigrams_tagged = [
    nltk.pos_tag(bigram, tagset='universal') for bigram in bigram_list
]

In [None]:
# Number of bigrams retained by the key-term/stopword filter.
len(bigram_list)

In [None]:
# Frequency table of the retained bigrams, listed from most to least common.
counter = Counter()
counter.update(bigram_list)
counter.most_common()

In [None]:
# Keep the tagged bigrams in which at least one of the two tokens is tagged VERB.
tagged_list = [
    pair for pair in bigrams_tagged
    if 'VERB' in (pair[0][1], pair[1][1])
]

In [None]:
# Display the verb-bearing tagged bigrams.
tagged_list

In [None]:
# Flatten the verb-bearing bigrams into one list of their tagged tokens.
POS_list = list(itertools.chain.from_iterable(tagged_list))
POS_list

In [None]:
# Frequency of each tagged token among the verb-bearing bigrams, most common first.
pos_counts = Counter(POS_list)
pos_counts.most_common()

# Regex-Based Analyses

## Combat/Agency Frame

In [None]:
# Each query matches a key phrase ("climate change" / "global warming") that
# co-occurs with one of a set of frame terms, in either order, with at most 150
# intervening words between the two.
_KEY_PHRASES = r'(climate\schange|global\swarming)'
_WORD_WINDOW = r'\W+(?:\w+\W+){0,150}?'


def _collocation_query(terms):
    """Build the two-way collocation regex for a set of frame terms.

    Parameters
    ----------
    terms : str
        Regex alternation of the frame terms, without enclosing parentheses.

    Returns
    -------
    str
        Pattern matching "<key phrase> ... <term>" or "<term> ... <key phrase>".
    """
    # The \b in front of the term-first alternative stops terms from matching
    # inside longer words ("cause" in "because", "press" in "express"). In the
    # phrase-first alternative the term is always preceded by \W+, so it needs none.
    return (
        r'\b' + _KEY_PHRASES + _WORD_WINDOW + '(' + terms + ')'
        + r'|\b(' + terms + ')' + _WORD_WINDOW + _KEY_PHRASES + r'\b'
    )


# `defend(ing)?` replaces the original `defend(ing?)`, which required at least
# "defendin" and therefore never matched the bare verb "defend".
query1 = _collocation_query(
    r'fight(ing)?|(battle|battling)|must act|combat(ing)?|(struggle|struggling)'
    r'|(oppose|opposing)|fight(ing)?\sback|defend(ing)?|press(ing)?|push(ing)?'
    r'|campaign(ing)?'
)
query1_desc = '"Climate Change" collocated with combat terms'

query2 = _collocation_query(
    r'(examine|examining)|study(ing)?|assess(ing)?|model(ing)?'
    r'|(measure|measuring)|(evaluate|evaluating)|(appraise|appraising)'
)
query2_desc = '"Climate Change" collocated with assessment terms'

# Both orders now accept "cause", "caused" and "causes"; the original accepted
# "causes" only when the key phrase came first.
query3 = _collocation_query(r'man-made|anthropogenic|human-caused|cause(d|s)?')
query3_desc = 'Agentic Ratio/Human Agency Foregrounded/Culpability Foregrounded'

# The original pattern began with `\bc(`, a stray "c" that made the phrase-first
# alternative unmatchable (it required the text "cclimate change").
query4 = _collocation_query(r'nature|natural|cycle|cyclical|slow')
query4_desc = 'Scenic Ratio/Nature Foregrounded/Culpability Backgrounded'

In [1]:
#NOTE: the regex queries in the cell above may be older/different than those in the final code.

In [None]:
# Per-entry count of case-insensitive query1 matches, as a one-column frame
# labelled with the query description.
query1_counts = reduced_df.html_data.str.count(query1, flags=re.IGNORECASE)
query1_df = query1_counts.to_frame(name=query1_desc)

In [None]:
# Per-entry count of case-insensitive query2 matches, as a one-column frame
# labelled with the query description.
query2_counts = reduced_df.html_data.str.count(query2, flags=re.IGNORECASE)
query2_df = query2_counts.to_frame(name=query2_desc)

In [None]:
# Per-entry count of case-insensitive query3 matches, as a one-column frame
# labelled with the query description.
query3_counts = reduced_df.html_data.str.count(query3, flags=re.IGNORECASE)
query3_df = query3_counts.to_frame(name=query3_desc)

In [None]:
# Per-entry count of case-insensitive query4 matches, as a one-column frame
# labelled with the query description.
query4_counts = reduced_df.html_data.str.count(query4, flags=re.IGNORECASE)
query4_df = query4_counts.to_frame(name=query4_desc)

In [None]:
# NOTE(review): this frame is built from query4 / query4_desc, so it duplicates
# query4_df. No query5 is defined in the visible cells — confirm whether a fifth
# query was intended here or whether this cell is a leftover.
query5_counts = reduced_df.html_data.str.count(query4, flags=re.IGNORECASE)
query5_df = query5_counts.to_frame(name=query4_desc)

In [None]:
# Print the per-year match totals for the combat and assessment queries.
for yearly_source in (query1_df, query2_df):
    print(yearly_source.groupby(lambda stamp: stamp.year).sum())

In [None]:
# Two stacked line charts of per-year match totals, one panel per pair of queries.
# NOTE(review): `plt` is not imported in the visible cells (the `%pylab inline`
# line in the import cell is commented out) — confirm it is provided elsewhere.
def _year_of(stamp):
    """Return the calendar year of an index label."""
    return stamp.year


fig1 = plt.figure(figsize=(15, 13))

# Top panel: combat vs. assessment collocations.
ax = fig1.add_subplot(211)
ax.set_title('Agentic Frames', fontweight='bold')
ax.set_xlabel('Year', fontweight='bold')
ax.set_ylabel('Frequency', fontweight='bold')
plot1 = ax.plot(query1_df.groupby(_year_of).sum(), '--')
plot2 = ax.plot(query2_df.groupby(_year_of).sum(), '-')
ax.legend((query1_desc, query2_desc), loc=2, fontsize=15)

# Bottom panel: the two pentadic ratio queries.
ax = fig1.add_subplot(212)
ax.set_title('Pentadic Frames/Ratios', fontweight='bold')
ax.set_xlabel('Year', fontweight='bold')
ax.set_ylabel('Frequency', fontweight='bold')
plot3 = ax.plot(query3_df.groupby(_year_of).sum(), '--')
plot4 = ax.plot(query4_df.groupby(_year_of).sum())
ax.legend((query3_desc, query4_desc), loc=2, fontsize=15)

In [None]:
# Two stacked scatter plots of per-entry match counts over time; marker area
# scales with the count (count * 11).
# NOTE(review): `plt` is not imported in the visible cells — confirm it is
# provided elsewhere.
fig2 = plt.figure(figsize=(15, 13))

# Top panel: combat vs. assessment collocations.
plt.subplot(211)
combat_counts = query1_df[query1_desc]
assessment_counts = query2_df[query2_desc]
plot1 = plt.scatter(query1_df.index, combat_counts,
                    alpha=0.5, color="#FF5500", s=combat_counts * 11)
plot2 = plt.scatter(query2_df.index, assessment_counts,
                    alpha=0.3, color="#6495ED", s=assessment_counts * 11)
plt.legend((plot1, plot2), (query1_desc, query2_desc), loc=2, fontsize=17)

# Bottom panel: the two pentadic ratio queries.
plt.subplot(212)
agentic_counts = query3_df[query3_desc]
scenic_counts = query4_df[query4_desc]
plot3 = plt.scatter(query3_df.index, agentic_counts,
                    alpha=0.5, color="#00FF99", s=agentic_counts * 11)
plot4 = plt.scatter(query4_df.index, scenic_counts,
                    alpha=0.3, color="#6495ED", s=scenic_counts * 11)
plt.legend((plot3, plot4), (query3_desc, query4_desc), loc=2, fontsize=17)