# Text Vectorization and Feature Engineering Assignment

In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [33]:
import pandas as pd
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

from sklearn.feature_extraction.text import CountVectorizer

<IPython.core.display.Javascript object>

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [3]:
PATH = "cnn_lite_corpus/"
DOC_PATTERN = r".*\.txt"
cnn_corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

<IPython.core.display.Javascript object>

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [4]:
docs_raw = [cnn_corpus.raw(file_id) for file_id in cnn_corpus.fileids()]

<IPython.core.display.Javascript object>

In [39]:
docs_raw

['(CNN) - Pink has been working pretty hard and it sounds like she will be taking a step back in 2020. Speaking with "Entertainment Tonight" on the Country Music Association Awards red carpet, the singer was joined by her husband, Carey Hart, and their kids Willow, 8, and Jameson, 2. Pink was there to perform her song "Love Me Anyway" with country star Chris Stapleton, and she  talked about how hectic things have been.  "We did two and a half years of [music] and Willow\'s back in school now, Jameson\'s going to start pre-school soon," Pink said. "It\'s kind of the year of the family." The star also praised her husband, with whom she will celebrate 14 years of marriage in January. "Carey has a lot going on as well," she said of Hart, who went from being a professional motocross competitor to racing off-road trucks. "He\'s super supportive, he follows me around the world and now it\'s his turn." According to Billboard, Pink\'s Beautiful Trauma Tour ranks as the 10th highest-grossing tou

<IPython.core.display.Javascript object>

### Preprocess and clean the documents according to the steps below.

- Word Tokenize
- Lowercase
- Remove Stopwords
- Remove Punctuation
- Lemmatize
- Stem

In [11]:
# word tokenize and lowercase
docs_word_tokens = [word_tokenize(doc.lower()) for doc in docs_raw]

<IPython.core.display.Javascript object>

In [32]:
docs_word_tokens

[['(',
  'cnn',
  ')',
  '-',
  'pink',
  'has',
  'been',
  'working',
  'pretty',
  'hard',
  'and',
  'it',
  'sounds',
  'like',
  'she',
  'will',
  'be',
  'taking',
  'a',
  'step',
  'back',
  'in',
  '2020.',
  'speaking',
  'with',
  '``',
  'entertainment',
  'tonight',
  "''",
  'on',
  'the',
  'country',
  'music',
  'association',
  'awards',
  'red',
  'carpet',
  ',',
  'the',
  'singer',
  'was',
  'joined',
  'by',
  'her',
  'husband',
  ',',
  'carey',
  'hart',
  ',',
  'and',
  'their',
  'kids',
  'willow',
  ',',
  '8',
  ',',
  'and',
  'jameson',
  ',',
  '2.',
  'pink',
  'was',
  'there',
  'to',
  'perform',
  'her',
  'song',
  '``',
  'love',
  'me',
  'anyway',
  "''",
  'with',
  'country',
  'star',
  'chris',
  'stapleton',
  ',',
  'and',
  'she',
  'talked',
  'about',
  'how',
  'hectic',
  'things',
  'have',
  'been',
  '.',
  '``',
  'we',
  'did',
  'two',
  'and',
  'a',
  'half',
  'years',
  'of',
  '[',
  'music',
  ']',
  'and',
  'willow

<IPython.core.display.Javascript object>

In [15]:
import nltk

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dgump\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

<IPython.core.display.Javascript object>

In [23]:
# remoce stopwords and punctuation
docs_words_nostop = []
for doc in docs_word_tokens:
    temp = [
        doc_tokens
        for doc_tokens in doc
        if not doc_tokens in stopwords.words("english")
        if doc_tokens.isalpha() == True
    ]
    docs_words_nostop.append(temp)

<IPython.core.display.Javascript object>

In [49]:
# stemm words
stemmer = SnowballStemmer("english")

stemmed = []
for doc in docs_words_nostop:
    temp = [stemmer.stem(token) for token in doc]
    temp = " ".join(temp)
    stemmed.append(temp)

<IPython.core.display.Javascript object>

In [29]:
import nltk

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dgump\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

<IPython.core.display.Javascript object>

In [48]:
# lemmatize words
lemmatizer = WordNetLemmatizer()

lemmatized = []
for doc in docs_words_nostop:
    temp = [lemmatizer.lemmatize(token) for token in doc]
    temp = " ".join(temp)
    lemmatized.append(temp)
lemmatized

['cnn pink working pretty hard sound like taking step back speaking entertainment tonight country music association award red carpet singer joined husband carey hart kid willow jameson pink perform song love anyway country star chris stapleton talked hectic thing two half year music willow back school jameson going start soon pink said kind year family star also praised husband celebrate year marriage january carey lot going well said hart went professional motocross competitor racing truck super supportive follows around world turn according billboard pink beautiful trauma tour rank tour time earning million',
 'cnn former massachusetts gov deval patrick told friend ally phone call wednesday made decision run president two people familiar matter say plan timing formal announcement still flux wednesday attention impeachment source familiar plan tell cnn patrick officially file new hampshire primary thursday concord appears cbs morning odds could incredibly steep late entrant like patri

<IPython.core.display.Javascript object>

In [38]:
lemmatized.values()

AttributeError: 'list' object has no attribute 'values'

<IPython.core.display.Javascript object>

### Count vectorize the preprocessed documents.

In [50]:
vectorizer = CountVectorizer()

vectors = vectorizer.fit_transform(lemmatized)

# vectors = []
# for lemma_list in lemmatized:
#     vector = vectorizer.fit_transform(lemma_list)
#     vectors.append(vector)

<IPython.core.display.Javascript object>

In [42]:
count = vectors.to_array

AttributeError: 'list' object has no attribute 'to_array'

<IPython.core.display.Javascript object>

In [43]:
# convert raw docs to df and tyr it without all these lists

<IPython.core.display.Javascript object>

In [44]:
df = pd.DataFrame({"doc": docs_raw})

<IPython.core.display.Javascript object>

In [46]:
df = df["doc"].apply(word_tokenize)
df = df['doc'].apply()

0     [(, CNN, ), -, Pink, has, been, working, prett...
1     [(, CNN, ), -, Former, Massachusetts, Gov, ., ...
2     [(, CNN, ), -, There, 's, a, 10-week-old, pupp...
3     [(, CNN, ), -, Three, Democratic, heavyweights...
4     [(, CNN, ), -, The, House, Intelligence, Commi...
5     [Editor, 's, Note, :, Nadine, Jolie, Courtney,...
6     [New, York, (, CNN, Business, ), -, A, version...
7     [(, CNN, ), -, The, manhunt, for, a, Marine, d...
8     [(, CNN, ), -, The, Trump, Organization, is, c...
9     [(, CNN, ), -, Alicia, Keys, is, hosting, the,...
10    [Editor, 's, Note, :, Paul, Callan, is, a, CNN...
11    [Los, Angeles, (, CNN, ), -, A, teen, whose, b...
12    [(, CNN, ), -, Viewers, who, signed, up, to, s...
13    [Hong, Kong, (, CNN, ), -, The, threat, of, vi...
14    [Los, Angeles, (, CNN, ), -, A, teen, whose, b...
Name: doc, dtype: object

<IPython.core.display.Javascript object>

### One hot vectorize the preprocessed documents.

### TF-IDF vectorize the preprocessed documents.

### Use Doc2Vec to vectorize the preprocessed documents.

Set the size of the vectors to be the same size as those of the other methods using the `vector_size` argument.