<a href="https://colab.research.google.com/github/ElizabethGarrison/Working-With-Data-Fundamentals/blob/main/DoingTextAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Doing Text Analysis

In this assignment, we'll develop a method of content coding text data we've scraped on the web.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda

!pip install boilerpipe3
from boilerpipe.extract import Extractor

!pip install feedparser
import feedparser as fp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feedparser
  Downloading feedparser-6.0.10-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 5.3 MB/s 
[?25hCollecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6065 sha256=a8c4d6088b272cbe142da684b014866e10f9ee64aa67d2bc8b2db78eb2dc1d37
  Stored in directory: /root/.cache/pip/wheels/73/ad/a4/0dff4a6ef231fc0dfa12ffbac2a36cebfdddfe059f50e019aa
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.10 sgmllib3k-1.0.0


In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
wnl = nltk.WordNetLemmatizer()
nltk.download('stopwords')
en_stop_words = set(nltk.corpus.stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Identify a news feed to use as input data.

In [None]:
news_feed_links = []
news_news = []

FEED_URL0 = 'https://feeds.a.dj.com/rss/RSSWorldNews.xml'
news_feed0 = fp.parse(FEED_URL0)

for item in news_feed0.entries:
    news_feed_links.append(item.link)

FEED_URL1 = 'http://rss.cnn.com/rss/cnn_world.rss'
news_feed1 = fp.parse(FEED_URL1)

for item in news_feed1.entries:
    news_feed_links.append(item.link)

FEED_URL2 = 'https://www.latimes.com/world-nation/rss2.0.xml0'
news_feed2 = fp.parse(FEED_URL2)

for item in news_feed0.entries:
    news_feed_links.append(item.link)

for page in news_feed_links:
    try:
        extractor = Extractor(extractor = 'ArticleExtractor', url = page)
        news_news.append(extractor.getText())
    except:
        print('Download error: ' + page)

print(len(news_news))


41


In [None]:
data = news_news

In [None]:
sentences = nltk.sent_tokenize(str(data))
tokens = nltk.word_tokenize(str(data))
print(len(sentences))
print(len(tokens))
print(sentences)
print(sorted(tokens))

1079
29682


In [None]:
wnl = nltk.WordNetLemmatizer()
tokens2 = [t.lower() for t in tokens if len(t) > 2 and t not in en_stop_words] # Create a new list of words in the tweets by excluding the small words, which tend to be function words
lemma = [wnl.lemmatize(t) for t in tokens2]
words = lemma

Develop a set of keywords that reflect what you expect would be interesting content to code for in the news feed you identify.

In [None]:
from nltk.corpus import wordnet as wn


In [None]:
data_keywords = set(['faith', 'prosperity', 'freedom', 'happiness', 'brotherhood'])
data_labels = list(data_keywords)
keywords = {}

for term in data_keywords:
    for synset in wn.synsets(term):
        keywords[term] = synset.lemma_names()

keywords_keys = list(keywords.keys())
keyword_counts = {}

def wn_keyword_count(words):
    for key in keywords_keys:
        keyword_counts[key] = 0

    for word in words:
        for key in keywords_keys:
            for term in keywords[key]:
                if word.find(term) != -1:
                    keyword_counts[key] += 1

    return keyword_counts


In [None]:
data_keywords = set(['faith', 'prosperity', 'freedom', 'happiness', 'brotherhood'])
data_labels = list(data_keywords)

health_synonyms = []
health_antonyms = []

for syn in wn.synsets('faith'):
    for l in syn.lemmas():
        health_synonyms.append(l.name())
        if l.antonyms():
            health_antonyms.append(l.antonyms()[0].name())

if len(health_synonyms) > 0:
    if len(health_antonyms) > 0:
        health_terms = health_synonyms + health_antonyms
    elif len(health_antonyms) == 0:
        health_terms = health_synonyms
else: health_terms = ['faith']

safety_synonyms = []
safety_antonyms = []

for syn in wn.synsets('prosperity'):
    for l in syn.lemmas():
        safety_synonyms.append(l.name())
        if l.antonyms():
            safety_antonyms.append(l.antonyms()[0].name())

if len(safety_synonyms) > 0:
    if len(safety_antonyms) > 0:
        safety_terms = safety_synonyms + safety_antonyms
    elif len(safety_antonyms) == 0:
        safety_terms = safety_synonyms
else: safety_terms = ['prosperity']

politics_synonyms = []
politics_antonyms = []

for syn in wn.synsets('freedom'):
    for l in syn.lemmas():
        politics_synonyms.append(l.name())
        if l.antonyms():
            politics_antonyms.append(l.antonyms()[0].name())

if len(politics_synonyms) > 0:
    if len(politics_antonyms) > 0:
        politics_terms = politics_synonyms + politics_antonyms
    elif len(politics_antonyms) == 0:
        politics_terms = politics_synonyms
else: politics_terms = ['freedom']

economy_synonyms = []
economy_antonyms = []

for syn in wn.synsets('happiness'):
    for l in syn.lemmas():
        economy_synonyms.append(l.name())
        if l.antonyms():
            economy_antonyms.append(l.antonyms()[0].name())

if len(economy_synonyms) > 0:
    if len(economy_antonyms) > 0:
        economy_terms = economy_synonyms + economy_antonyms
    elif len(economy_antonyms) == 0:
        economy_terms = economy_synonyms
else: economy_terms = ['happiness']

community_synonyms = []
community_antonyms = []

for syn in wn.synsets('brotherhood'):
    for l in syn.lemmas():
        community_synonyms.append(l.name())
        if l.antonyms():
            community_antonyms.append(l.antonyms()[0].name())

if len(community_synonyms) > 0:
    if len(community_antonyms) > 0:
        community_terms = community_synonyms + community_antonyms
    elif len(community_antonyms) == 0:
        community_terms = community_synonyms
else: community_terms = ['brotherhood']

health_terms = set(health_terms)
safety_terms = set(safety_terms)
politics_terms = set(politics_terms)
economy_terms = set(economy_terms)
community_terms = set(community_terms)

print(health_terms)
print(safety_terms)
print(politics_terms)
print(economy_terms)
print(community_terms)

def wn_keyword_count2(words):
    health_freq = safety_freq = politics_freq = economy_freq = community_freq = 0

    for word in words:
        for term in health_terms:
            if word.find(term) != -1:
                health_freq += 1

        for term in safety_terms:
            if word.find(term) != -1:
                safety_freq += 1

        for term in politics_terms:
            if word.find(term) != -1:
                politics_freq += 1

        for term in economy_terms:
            if word.find(term) != -1:
                economy_freq += 1

        for term in community_terms:
            if word.find(term) != -1:
                community_freq += 1

    theme_freqs = [health_freq, safety_freq, politics_freq, economy_freq, community_freq]

    return theme_freqs

{'religion', 'trust', 'faith', 'organized_religion', 'religious_belief'}
{'prosperity', 'successfulness'}
{'freedom', 'exemption'}
{'sadness', 'felicity', 'unhappiness', 'happiness'}
{'fraternity', 'union', 'sodality', 'brotherhood', 'trade_union', 'labor_union', 'trades_union'}


In [None]:
print(len(words))
print(words)

results3 = pd.Series(wn_keyword_count(words), index=data_labels)
#results4 = pd.Series(wn_keyword_count2(words), index=data_labels)

print(results3)
#print(results4)

15608
faith          0
brotherhood    4
happiness      0
prosperity     0
freedom        1
dtype: int64


## Activity

Identify a news feed as input data.

Read in the input and parse the text to make it available for coding.

Use the WordNet library to do content coding, as in the blocks above.

Visualize the results in a way that makes sense.

Interpret the analysis.

In [None]:
# Section 1
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda

!pip install boilerpipe3
from boilerpipe.extract import Extractor

!pip install feedparser
import feedparser as fp

Collecting boilerpipe3
  Downloading boilerpipe3-1.3.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting JPype1 (from boilerpipe3)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting charade (from boilerpipe3)
  Downloading charade-1.0.3.tar.gz (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.5/168.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: boilerpipe3, charade
  Building wheel for boilerpipe3 (setup.py) ... [?25l[?25hdone
  Created wheel for boilerpipe3: filename=boilerpipe3-1.3-py3-none-any.whl size=1321045 sha256=3ddc7fc8

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
wnl = nltk.WordNetLemmatizer()
nltk.download('stopwords')
en_stop_words = set(nltk.corpus.stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
news_feed_links = []
news_news = []

FEED_URL0 = 'https://feeds.a.dj.com/rss/RSSWorldNews.xml'
news_feed0 = fp.parse(FEED_URL0)

for item in news_feed0.entries:
    news_feed_links.append(item.link)

FEED_URL1 = 'http://rss.cnn.com/rss/cnn_world.rss'
news_feed1 = fp.parse(FEED_URL1)

for item in news_feed1.entries:
    news_feed_links.append(item.link)

FEED_URL2 = 'https://www.latimes.com/world-nation/rss2.0.xml0'
news_feed2 = fp.parse(FEED_URL2)

for item in news_feed0.entries:
    news_feed_links.append(item.link)

for page in news_feed_links:
    try:
        extractor = Extractor(extractor = 'ArticleExtractor', url = page)
        news_news.append(extractor.getText())
    except:
        print('Download error: ' + page)

print(len(news_news))

Download error: https://www.wsj.com/articles/trump-takes-witness-stand-in-new-york-civil-fraud-trial-6e816cf2
Download error: https://www.wsj.com/articles/dating-apps-need-to-rekindle-romance-with-wall-street-625757fb
Download error: https://www.wsj.com/articles/tech-media-telecom-roundup-market-talk-4792dde6
Download error: https://www.wsj.com/articles/health-care-roundup-market-talk-d4f778c4
Download error: https://www.wsj.com/articles/u-s-diplomats-press-israel-to-pause-gaza-assault-18aadf8a
Download error: https://www.wsj.com/articles/energy-utilities-roundup-market-talk-6aeba86d
Download error: https://www.wsj.com/articles/financial-services-roundup-market-talk-ed940c8f
Download error: https://www.wsj.com/articles/auto-transport-roundup-market-talk-e34f84bd
Download error: https://www.wsj.com/articles/ai-denies-loan-application-appeal-to-human-48d18d57
Download error: https://www.wsj.com/articles/natural-gas-futures-fall-amid-warm-weather-forecast-d84c7efc
Download error: https://

In [None]:
data = news_news

In [None]:
sentences = nltk.sent_tokenize(str(data))
tokens = nltk.word_tokenize(str(data))
print(len(sentences))
print(len(tokens))
print(sentences)
print(sorted(tokens))

1131
31160


In [None]:
wnl = nltk.WordNetLemmatizer()
tokens2 = [t.lower() for t in tokens if len(t) > 2 and t not in en_stop_words] # Create a new list of words in the tweets by excluding the small words, which tend to be function words
lemma = [wnl.lemmatize(t) for t in tokens2]
words = lemma

In [None]:
from nltk.corpus import wordnet as wn


In [None]:
data_keywords = set(['good', 'sad', 'war', 'sisterhood', 'revolution'])
data_labels = list(data_keywords)
keywords = {}

for term in data_keywords:
    for synset in wn.synsets(term):
        keywords[term] = synset.lemma_names()

keywords_keys = list(keywords.keys())
keyword_counts = {}

def wn_keyword_count(words):
    for key in keywords_keys:
        keyword_counts[key] = 0

    for word in words:
        for key in keywords_keys:
            for term in keywords[key]:
                if word.find(term) != -1:
                    keyword_counts[key] += 1

    return keyword_counts

In [None]:
data_keywords = set(['good', 'sad', 'war', 'sisterhood', 'revolution'])
data_labels = list(data_keywords)

health_synonyms = []
health_antonyms = []

for syn in wn.synsets('good'):
    for l in syn.lemmas():
        health_synonyms.append(l.name())
        if l.antonyms():
            health_antonyms.append(l.antonyms()[0].name())

if len(health_synonyms) > 0:
    if len(health_antonyms) > 0:
        health_terms = health_synonyms + health_antonyms
    elif len(health_antonyms) == 0:
        health_terms = health_synonyms
else: health_terms = ['good']

safety_synonyms = []
safety_antonyms = []

for syn in wn.synsets('sad'):
    for l in syn.lemmas():
        safety_synonyms.append(l.name())
        if l.antonyms():
            safety_antonyms.append(l.antonyms()[0].name())

if len(safety_synonyms) > 0:
    if len(safety_antonyms) > 0:
        safety_terms = safety_synonyms + safety_antonyms
    elif len(safety_antonyms) == 0:
        safety_terms = safety_synonyms
else: safety_terms = ['sad']

politics_synonyms = []
politics_antonyms = []

for syn in wn.synsets('war'):
    for l in syn.lemmas():
        politics_synonyms.append(l.name())
        if l.antonyms():
            politics_antonyms.append(l.antonyms()[0].name())

if len(politics_synonyms) > 0:
    if len(politics_antonyms) > 0:
        politics_terms = politics_synonyms + politics_antonyms
    elif len(politics_antonyms) == 0:
        politics_terms = politics_synonyms
else: politics_terms = ['war']

economy_synonyms = []
economy_antonyms = []

for syn in wn.synsets('sisterhood'):
    for l in syn.lemmas():
        economy_synonyms.append(l.name())
        if l.antonyms():
            economy_antonyms.append(l.antonyms()[0].name())

if len(economy_synonyms) > 0:
    if len(economy_antonyms) > 0:
        economy_terms = economy_synonyms + economy_antonyms
    elif len(economy_antonyms) == 0:
        economy_terms = economy_synonyms
else: economy_terms = ['sisterhood']

community_synonyms = []
community_antonyms = []

for syn in wn.synsets('revolution'):
    for l in syn.lemmas():
        community_synonyms.append(l.name())
        if l.antonyms():
            community_antonyms.append(l.antonyms()[0].name())

if len(community_synonyms) > 0:
    if len(community_antonyms) > 0:
        community_terms = community_synonyms + community_antonyms
    elif len(community_antonyms) == 0:
        community_terms = community_synonyms
else: community_terms = ['revolution']

health_terms = set(health_terms)
safety_terms = set(safety_terms)
politics_terms = set(politics_terms)
economy_terms = set(economy_terms)
community_terms = set(community_terms)

print(health_terms)
print(safety_terms)
print(politics_terms)
print(economy_terms)
print(community_terms)

def wn_keyword_count2(words):
    health_freq = safety_freq = politics_freq = economy_freq = community_freq = 0

    for word in words:
        for term in health_terms:
            if word.find(term) != -1:
                health_freq += 1

        for term in safety_terms:
            if word.find(term) != -1:
                safety_freq += 1

        for term in politics_terms:
            if word.find(term) != -1:
                politics_freq += 1

        for term in economy_terms:
            if word.find(term) != -1:
                economy_freq += 1

        for term in community_terms:
            if word.find(term) != -1:
                community_freq += 1

    theme_freqs = [health_freq, safety_freq, politics_freq, economy_freq, community_freq]

    return theme_freqs

{'just', 'unspoilt', 'good', 'proficient', 'in_force', 'soundly', 'respectable', 'commodity', 'practiced', 'honest', 'undecomposed', 'ill', 'sound', 'full', 'dependable', 'evilness', 'dear', 'goodness', 'honorable', 'badness', 'skillful', 'adept', 'beneficial', 'near', 'thoroughly', 'effective', 'trade_good', 'upright', 'safe', 'skilful', 'ripe', 'expert', 'serious', 'right', 'secure', 'estimable', 'well', 'in_effect', 'salutary', 'bad', 'evil', 'unspoiled'}
{'lamentable', 'glad', 'sorry', 'sad', 'deplorable', 'distressing', 'pitiful'}
{'state_of_war', 'make_peace', 'peace', 'war', 'warfare'}
{'sisterhood', 'sistership'}
{'gyration', 'revolution', 'rotation'}


In [None]:
print(len(words))
print(words)

results3 = pd.Series(wn_keyword_count(words), index=data_labels)
#results4 = pd.Series(wn_keyword_count2(words), index=data_labels)

print(results3)
#print(results4)

15802
sisterhood     0
good          16
war           53
sad            1
revolution     1
dtype: int64
