In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Text Data Cleaning and Preprocessing Assignment

In [2]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

<IPython.core.display.Javascript object>

In [3]:
def corpus_stats(corpus):
    """Print summary statistics for a corpus reader.

    Args:
        corpus: An object exposing ``fileids()``, ``paras()``, ``sents()``,
            ``words()`` and ``raw()`` (for example an NLTK
            ``PlaintextCorpusReader``).

    Prints the document, paragraph, sentence and word counts, the size of the
    case-insensitive vocabulary, the average characters per word (length of
    the raw text divided by the word count) and the average words per
    sentence. Both averages are rounded to one decimal place and are reported
    as 0.0 when the denominator is zero, so an empty corpus does not raise
    ``ZeroDivisionError``.
    """
    # Fetch each view once; every call on a corpus reader may re-scan the files.
    words = corpus.words()
    num_words = len(words)
    num_sents = len(corpus.sents())
    vocabulary = {w.lower() for w in words}

    # Guard the ratios so an empty corpus prints 0.0 instead of crashing.
    avg_chars = round(len(corpus.raw()) / num_words, 1) if num_words else 0.0
    avg_words = round(num_words / num_sents, 1) if num_sents else 0.0

    print("Corpus Statistics")
    print("Number of documents: " + str(len(corpus.fileids())))
    print("Number of paragraphs: " + str(len(corpus.paras())))
    print("Number of sentences: " + str(num_sents))
    print("Number of words: " + str(num_words))
    print("Vocabulary: " + str(len(vocabulary)))
    print("Avg chars per word: " + str(avg_chars))
    print("Avg words per sentence: " + str(avg_words))

<IPython.core.display.Javascript object>

### Read the O'Reilly RSS plain-text articles into a corpus using NLTK's PlaintextCorpusReader.

In [4]:
# Regex selecting which files under the corpus root are loaded: anything ending in ".txt".
DOC_PATTERN = r".*\.txt"
# NOTE(review): "corpata" may be a typo for "corpora" — confirm the directory name on disk.
corpus = PlaintextCorpusReader(root="corpata/o_reilly/", fileids=DOC_PATTERN)
# Print the summary statistics for the freshly loaded corpus.
corpus_stats(corpus)

Corpus Statistics
Number of documents: 60
Number of paragraphs: 60
Number of sentences: 190
Number of words: 4476
Vocabulary: 1440
Avg chars per word: 5.0
Avg words per sentence: 23.6


<IPython.core.display.Javascript object>

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [5]:
# Raw text of every file in the corpus, in the order reported by fileids().
docs = list(map(corpus.raw, corpus.fileids()))

<IPython.core.display.Javascript object>

In [6]:
# Same result as the explicit loop in the previous cell, written as one expression.
docs = [*(corpus.raw(name) for name in corpus.fileids())]

<IPython.core.display.Javascript object>

In [7]:
# Display the raw document strings collected in `docs`.
# NOTE(review): this shows the whole list; a slice such as docs[:3] would keep the output short.
docs

['Perhaps the most important event this month isn’t technical, but the start of the US Justice Dept.’s lawsuit against Google. That will certainly play out over years rather than months, but it’s significance is less about this particular case than the idea that legal and regulatory systems will play a large role in the evolution [&#8230;]',
 'Mutation Testing &#8212; in this paper, we semi-automatically learn error-inducing patterns from a corpus of common Java coding errors and from changes that caused operational anomalies at Facebook specifically. We combine the mutations with instrumentation that measures which tests exactly visited the mutated piece of code. Results on more than 15,000 generated mutants show that [&#8230;]',
 'Algorithms Can Collude &#8212; To analyze the possible consequences, we study experimentally the behavior of algorithms powered by Artificial Intelligence (Q-learning) in a workhorse oligopoly model of repeated price competition. We find that the algorithms

<IPython.core.display.Javascript object>

### Sentence tokenize each document in the list of documents.

In [8]:
# Try the two-step tokenization on the first document only:
# split it into sentences, then split every sentence into word tokens.
sents = sent_tokenize(docs[0])
words = list(map(word_tokenize, sents))
words

[['Perhaps',
  'the',
  'most',
  'important',
  'event',
  'this',
  'month',
  'isn',
  '’',
  't',
  'technical',
  ',',
  'but',
  'the',
  'start',
  'of',
  'the',
  'US',
  'Justice',
  'Dept.',
  '’',
  's',
  'lawsuit',
  'against',
  'Google',
  '.'],
 ['That',
  'will',
  'certainly',
  'play',
  'out',
  'over',
  'years',
  'rather',
  'than',
  'months',
  ',',
  'but',
  'it',
  '’',
  's',
  'significance',
  'is',
  'less',
  'about',
  'this',
  'particular',
  'case',
  'than',
  'the',
  'idea',
  'that',
  'legal',
  'and',
  'regulatory',
  'systems',
  'will',
  'play',
  'a',
  'large',
  'role',
  'in',
  'the',
  'evolution',
  '[',
  '&',
  '#',
  '8230',
  ';',
  ']']]

<IPython.core.display.Javascript object>

### Word tokenize each sentence within each document.

You should end up with a nested list structure: the outer list has one entry per document, each entry is a list of that document's sentences, and each sentence is itself a list of word tokens.

In [9]:
# documents -> sentences -> word tokens
tokenized = [
    [word_tokenize(sentence) for sentence in sent_tokenize(document)]
    for document in docs
]
tokenized

[[['Perhaps',
   'the',
   'most',
   'important',
   'event',
   'this',
   'month',
   'isn',
   '’',
   't',
   'technical',
   ',',
   'but',
   'the',
   'start',
   'of',
   'the',
   'US',
   'Justice',
   'Dept.',
   '’',
   's',
   'lawsuit',
   'against',
   'Google',
   '.'],
  ['That',
   'will',
   'certainly',
   'play',
   'out',
   'over',
   'years',
   'rather',
   'than',
   'months',
   ',',
   'but',
   'it',
   '’',
   's',
   'significance',
   'is',
   'less',
   'about',
   'this',
   'particular',
   'case',
   'than',
   'the',
   'idea',
   'that',
   'legal',
   'and',
   'regulatory',
   'systems',
   'will',
   'play',
   'a',
   'large',
   'role',
   'in',
   'the',
   'evolution',
   '[',
   '&',
   '#',
   '8230',
   ';',
   ']']],
 [['Mutation',
   'Testing',
   '&',
   '#',
   '8212',
   ';',
   'in',
   'this',
   'paper',
   ',',
   'we',
   'semi-automatically',
   'learn',
   'error-inducing',
   'patterns',
   'from',
   'a',
   'corpus',
   'o

<IPython.core.display.Javascript object>

### Tag each token with its part of speech.

In [11]:
# Tag every token with its part of speech, keeping the documents -> sentences nesting.
# A new nested list is built on purpose: list.copy() is shallow, so assigning into
# pos_tagged[i][j] on a copy would also overwrite the sentence lists inside
# `tokenized`, destroying the untagged tokens and breaking a re-run of this cell.
pos_tagged = [[pos_tag(sent) for sent in doc] for doc in tokenized]

<IPython.core.display.Javascript object>

In [12]:
# Inspect the tagged sentences of the second document (index 1).
pos_tagged[1]

[[('Mutation', 'NNP'),
  ('Testing', 'NNP'),
  ('&', 'CC'),
  ('#', '#'),
  ('8212', 'CD'),
  (';', ':'),
  ('in', 'IN'),
  ('this', 'DT'),
  ('paper', 'NN'),
  (',', ','),
  ('we', 'PRP'),
  ('semi-automatically', 'RB'),
  ('learn', 'VBD'),
  ('error-inducing', 'JJ'),
  ('patterns', 'NNS'),
  ('from', 'IN'),
  ('a', 'DT'),
  ('corpus', 'NN'),
  ('of', 'IN'),
  ('common', 'JJ'),
  ('Java', 'NNP'),
  ('coding', 'NN'),
  ('errors', 'NNS'),
  ('and', 'CC'),
  ('from', 'IN'),
  ('changes', 'NNS'),
  ('that', 'WDT'),
  ('caused', 'VBD'),
  ('operational', 'JJ'),
  ('anomalies', 'NNS'),
  ('at', 'IN'),
  ('Facebook', 'NNP'),
  ('specifically', 'RB'),
  ('.', '.')],
 [('We', 'PRP'),
  ('combine', 'VBP'),
  ('the', 'DT'),
  ('mutations', 'NNS'),
  ('with', 'IN'),
  ('instrumentation', 'NN'),
  ('that', 'WDT'),
  ('measures', 'VBZ'),
  ('which', 'WDT'),
  ('tests', 'VBZ'),
  ('exactly', 'RB'),
  ('visited', 'VBN'),
  ('the', 'DT'),
  ('mutated', 'JJ'),
  ('piece', 'NN'),
  ('of', 'IN'),
  ('cod

<IPython.core.display.Javascript object>

### Word tokenize the raw text of each document and remove stop words.

In [22]:
# Load the English stop-word list once and keep it as a set: calling
# stopwords.words("english") inside the comprehension would rebuild the list for
# every token and test membership by linear scan.
english_stops = set(stopwords.words("english"))

# For each document: word-tokenize the raw text, lowercase every token and keep
# the ones that are not stop words (punctuation tokens are still kept here).
no_stops = []
for doc in docs:
    all_words = word_tokenize(doc)
    lowered_words = (word.lower() for word in all_words)
    no_stop = [word for word in lowered_words if word not in english_stops]
    no_stops.append(no_stop)

<IPython.core.display.Javascript object>

```python
no_punct = [word for word in all_words if word.isalpha()==True if word.lower() not in stopwords.words("english")]
```

In [29]:
# Load the English stop-word list once and keep it as a set: calling
# stopwords.words("english") inside the comprehension would rebuild the list for
# every token and test membership by linear scan.
english_stops = set(stopwords.words("english"))

# For each document: keep only purely alphabetic tokens (this drops punctuation
# and numbers), lowercase them, and discard stop words.
no_puncts = []
for doc in docs:
    all_words = word_tokenize(doc)
    alpha_lowered = (word.lower() for word in all_words if word.isalpha())
    no_punct = [word for word in alpha_lowered if word not in english_stops]
    no_puncts.append(no_punct)

<IPython.core.display.Javascript object>

In [30]:
# Display the cleaned token lists (alphabetic, lowercased, stop words removed), one per document.
no_puncts

[['perhaps',
  'important',
  'event',
  'month',
  'technical',
  'start',
  'us',
  'justice',
  'lawsuit',
  'google',
  'certainly',
  'play',
  'years',
  'rather',
  'months',
  'significance',
  'less',
  'particular',
  'case',
  'idea',
  'legal',
  'regulatory',
  'systems',
  'play',
  'large',
  'role',
  'evolution'],
 ['mutation',
  'testing',
  'paper',
  'learn',
  'patterns',
  'corpus',
  'common',
  'java',
  'coding',
  'errors',
  'changes',
  'caused',
  'operational',
  'anomalies',
  'facebook',
  'specifically',
  'combine',
  'mutations',
  'instrumentation',
  'measures',
  'tests',
  'exactly',
  'visited',
  'mutated',
  'piece',
  'code',
  'results',
  'generated',
  'mutants',
  'show'],
 ['algorithms',
  'collude',
  'analyze',
  'possible',
  'consequences',
  'study',
  'experimentally',
  'behavior',
  'algorithms',
  'powered',
  'artificial',
  'intelligence',
  'workhorse',
  'oligopoly',
  'model',
  'repeated',
  'price',
  'competition',
  'fin

<IPython.core.display.Javascript object>

### For every document, stem all the words in the document.

In [32]:
# Stem every cleaned token with the English Snowball stemmer, one list per document.
stemmer = SnowballStemmer("english")
all_stemmed = [
    [stemmer.stem(token) for token in doc_tokens] for doc_tokens in no_puncts
]

<IPython.core.display.Javascript object>

In [33]:
# Display the stemmed tokens for every document.
all_stemmed

[['perhap',
  'import',
  'event',
  'month',
  'technic',
  'start',
  'us',
  'justic',
  'lawsuit',
  'googl',
  'certain',
  'play',
  'year',
  'rather',
  'month',
  'signific',
  'less',
  'particular',
  'case',
  'idea',
  'legal',
  'regulatori',
  'system',
  'play',
  'larg',
  'role',
  'evolut'],
 ['mutat',
  'test',
  'paper',
  'learn',
  'pattern',
  'corpus',
  'common',
  'java',
  'code',
  'error',
  'chang',
  'caus',
  'oper',
  'anomali',
  'facebook',
  'specif',
  'combin',
  'mutat',
  'instrument',
  'measur',
  'test',
  'exact',
  'visit',
  'mutat',
  'piec',
  'code',
  'result',
  'generat',
  'mutant',
  'show'],
 ['algorithm',
  'collud',
  'analyz',
  'possibl',
  'consequ',
  'studi',
  'experiment',
  'behavior',
  'algorithm',
  'power',
  'artifici',
  'intellig',
  'workhors',
  'oligopoli',
  'model',
  'repeat',
  'price',
  'competit',
  'find',
  'algorithm',
  'consist',
  'learn',
  'charg',
  'supracompetit',
  'price',
  'without',
  'co

<IPython.core.display.Javascript object>

In [36]:
# Lemmatize every cleaned token with WordNet, one list per document.
# No `pos` argument is passed, so the lemmatizer's default part of speech is used
# for every token.
lemmatizer = WordNetLemmatizer()
all_lemmas = [list(map(lemmatizer.lemmatize, doc_tokens)) for doc_tokens in no_puncts]

<IPython.core.display.Javascript object>

In [37]:
# Display the lemmatized tokens for every document.
all_lemmas

[['perhaps',
  'important',
  'event',
  'month',
  'technical',
  'start',
  'u',
  'justice',
  'lawsuit',
  'google',
  'certainly',
  'play',
  'year',
  'rather',
  'month',
  'significance',
  'le',
  'particular',
  'case',
  'idea',
  'legal',
  'regulatory',
  'system',
  'play',
  'large',
  'role',
  'evolution'],
 ['mutation',
  'testing',
  'paper',
  'learn',
  'pattern',
  'corpus',
  'common',
  'java',
  'coding',
  'error',
  'change',
  'caused',
  'operational',
  'anomaly',
  'facebook',
  'specifically',
  'combine',
  'mutation',
  'instrumentation',
  'measure',
  'test',
  'exactly',
  'visited',
  'mutated',
  'piece',
  'code',
  'result',
  'generated',
  'mutant',
  'show'],
 ['algorithm',
  'collude',
  'analyze',
  'possible',
  'consequence',
  'study',
  'experimentally',
  'behavior',
  'algorithm',
  'powered',
  'artificial',
  'intelligence',
  'workhorse',
  'oligopoly',
  'model',
  'repeated',
  'price',
  'competition',
  'find',
  'algorithm',


<IPython.core.display.Javascript object>

### Iterate through each document, computing and printing the following document statistics for each.

- Number of sentences
- Average words per sentence
- Vocabulary
- Lexical Diversity

In [45]:
# Per-document statistics computed from the raw text of each document.
for doc in docs:
    num_sentences = len(sent_tokenize(doc))
    # Use a loop-local name: rebinding `tokenized` here would clobber the nested
    # documents -> sentences -> tokens list built earlier in the notebook.
    doc_tokens = word_tokenize(doc)
    num_tokens = len(doc_tokens)
    vocab = len({w.lower() for w in doc_tokens})

    # Guard both ratios so an empty document reports 0.0 instead of raising
    # ZeroDivisionError.
    avg_words = num_tokens / num_sentences if num_sentences else 0.0
    lex_div = vocab / num_tokens if num_tokens else 0.0

    print("Number of sentences:", num_sentences)
    print("Avg. words per sentence:", avg_words)
    print("Unique words (vocabulary):", vocab)
    print("Lexical Diversity:", lex_div)
    print("-------------------------------")

Number of sentences: 2
Avg. words per sentence: 35.0
Unique words (vocabulary): 56
Lexical Diversity: 0.8
-------------------------------
Number of sentences: 3
Avg. words per sentence: 22.333333333333332
Unique words (vocabulary): 57
Lexical Diversity: 0.8507462686567164
-------------------------------
Number of sentences: 3
Avg. words per sentence: 23.333333333333332
Unique words (vocabulary): 54
Lexical Diversity: 0.7714285714285715
-------------------------------
Number of sentences: 2
Avg. words per sentence: 36.5
Unique words (vocabulary): 54
Lexical Diversity: 0.7397260273972602
-------------------------------
Number of sentences: 4
Avg. words per sentence: 17.25
Unique words (vocabulary): 52
Lexical Diversity: 0.7536231884057971
-------------------------------
Number of sentences: 4
Avg. words per sentence: 18.5
Unique words (vocabulary): 60
Lexical Diversity: 0.8108108108108109
-------------------------------
Number of sentences: 3
Avg. words per sentence: 25.0
Unique words (v

<IPython.core.display.Javascript object>