# IDC4022C Module 9

In [None]:
import pandas as pd

In [None]:
# The original articles file was UTF-8. The data was filtered and
# re-encoded locally, so it is read here as ISO-8859-1 to prevent
# line-termination errors.
# low_memory=False is passed to avoid pandas' mixed-dtype warning: the
# file is then parsed in one pass rather than in chunks, so column
# types are inferred from more of the data.
df = pd.read_csv(
    '/content/idc4022cMod9Data.csv',
    index_col=0,
    encoding='ISO-8859-1',
    low_memory=False,
)

In [None]:
# Display the first rows of the loaded DataFrame as a quick sanity check
df.head()

In [None]:
import matplotlib.pyplot as plt
# Bar chart of how many articles each publication contributes
plt.figure(figsize=(10, 7))
publication_counts = df['publication'].value_counts()
publication_counts.plot(kind='bar')

In [None]:
import spacy
from spacy import displacy
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline with its default components
nlp = en_core_web_sm.load()

In [None]:
# this step takes a minute
# Convert all values in 'content' to strings: spaCy only accepts str
# input, so any non-text cell would otherwise make nlp() fail.
doc_pre = df['content'].astype(str)
doc_pre.info()
# nlp.pipe streams the texts through the pipeline in batches, which is
# spaCy's recommended way to process many texts (instead of calling
# nlp() once per text). The result is one Doc per article, in row order.
doc = list(nlp.pipe(doc_pre))

In [None]:
# this step also takes a minute
# Join every article into one long string so it can be parsed as a
# single document.
concatenated_text = " ".join(doc_pre)
# spaCy raises an error for texts longer than nlp.max_length characters.
# Keep the 2,500,000 limit used so far, but grow it when the combined
# text is longer so the call below does not fail on a bigger data set.
# NOTE: memory use grows with text length; very large inputs may still
# exhaust RAM.
nlp.max_length = max(2500000, len(concatenated_text))
single_doc = nlp(concatenated_text)
# Alternative: inspect the entities article by article instead.
#for single_doc in doc:
#    print(single_doc.ents)  # This will print the entities for each document

In [None]:
# Render the entities of the combined document inline in the notebook
# (style='ent' selects displaCy's entity view).
displacy.render(single_doc,style='ent',jupyter=True)
# Alternative: render each per-article Doc separately.
#for single_doc in doc:
#    displacy.render(single_doc, style='ent', jupyter=True)

In [None]:
# Reload the pipeline with the components that entity extraction does
# not need switched off, which makes processing faster.
# 'lemmatizer' is disabled together with 'tagger': the rule-based
# lemmatizer relies on the POS tags the tagger would provide and
# otherwise emits spaCy warning W108 for every document.
nlp = spacy.load("en_core_web_sm",
                 disable=['parser',
                          'tagger',
                          'lemmatizer',
                          'textcat'])

In [None]:
from tqdm.notebook import tqdm

In [None]:
print(spacy.__version__)

import warnings

# [DS] Suppress the SpaCy UserWarning W108.
# W108 comes from spaCy's rule-based lemmatizer when tokens carry no POS
# tags (for example when the tagger component is disabled). Only the
# named entities (doc.ents) are used below, so the warning does not
# affect the results collected here.

warnings.filterwarnings("ignore", category=UserWarning, message=r'.*\[W108\].*')

# Column labels for the per-entity tuples built in step #4.
entity_columns = ['Text','Start','Stop','Type']

frames = []
# textbook has us loop through 1000, but we only have 495, use what we have
# refer to the text for explanations for each #n comment
# NOTE(review): range(1,495) visits index labels 1..494 only; if the data
# really holds 495 rows, one row is never processed -- confirm against
# df.index before changing the bounds.
for i in tqdm(range(1,495)):
    doc = df.loc[i,'content']                              #1
    text_id = df.loc[i,'id']                               #2
    doc = nlp(doc)                                         #3
    # Keep each entity's text, character offsets and label; drop entities
    # that consist only of spaces and dashes.
    ents = [(e.text, e.start_char, e.end_char, e.label_)   #4
            for e in doc.ents
            if len(e.text.strip(' -—')) > 0]
    # Name the columns here rather than relying on position: an article
    # with no entities would otherwise produce a frame without columns
    # 0-3, and if it came first the concatenated column order would be
    # 'id', 0, 1, 2, 3 and the positional rename in #9 would mislabel them.
    frame = pd.DataFrame(ents, columns=entity_columns)     #5
    frame['id'] = text_id                                  #6
    frames.append(frame)                                   #7

npf = pd.concat(frames)                                    #8

# The frames already carry these labels; the assignment is kept so the
# cell still mirrors textbook step #9.
npf.columns = ['Text','Start','Stop','Type','id']          #9

In [None]:
# Rebuild npf from the per-article frames collected by the extraction
# loop (the same call as step #8 there). This creates a new DataFrame, so
# anything assigned directly on the previous npf object is not carried over.
npf = pd.concat(frames)

In [None]:
# Report how many entity rows were collected, then show both ends of the table
row_count = len(npf)
print('length is ', row_count)
for heading, preview in (('first 5:', npf.head()), ('last 5:', npf.tail())):
    print(heading)
    print(preview)

In [None]:
# Label the five columns: entity text, start/stop character offsets,
# entity type, and the id of the article the entity came from.
npf.columns = ['Text','Start','Stop','Type','id']

In [None]:
# Bar chart of how often each entity type occurs
plt.figure(figsize=(10, 7))
type_counts = npf['Type'].value_counts()
type_counts.plot(kind='bar')

In [None]:
# Keep only the rows whose entity type is ORG (organisations)
is_org = npf['Type'] == 'ORG'
orgs = npf[is_org]

In [None]:
# Bar chart of the 15 most frequently mentioned organisations
plt.figure(figsize=(10, 7))
top_orgs = orgs['Text'].value_counts().head(15)
top_orgs.plot(kind='bar')