In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

# Text Vectorization and Feature Engineering Assignment

In [25]:
import pandas as pd
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

<IPython.core.display.Javascript object>

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [68]:
DOC_PATTERN = r".*\.txt"
corpus = PlaintextCorpusReader("corpata/cnn_lite/", DOC_PATTERN)

<IPython.core.display.Javascript object>

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [74]:
documents = []
for fileid in corpus.fileids():
    document = corpus.raw(fileid)
    documents.append(document)

<IPython.core.display.Javascript object>

### Preprocess and clean the documents according to the steps below.

- Word Tokenize
- Lowercase
- Remove Stopwords
- Remove Punctuation
- Lemmatize
- Stem

In [75]:
no_puncts = []
for document in documents:
    all_words = word_tokenize(document)
    no_punct = [
        word.lower()
        for word in all_words
        if word.isalpha() == True
        if word.lower() not in stopwords.words("english")
    ]
    no_puncts.append(no_punct)
no_puncts

[['des',
  'moines',
  'iowa',
  'cnn',
  'alexandria',
  'capped',
  'debut',
  'iowa',
  'hustings',
  'support',
  'bernie',
  'sanders',
  'last',
  'week',
  'blunt',
  'call',
  'action',
  'something',
  'allow',
  'happen',
  'us',
  'let',
  'race',
  'happen',
  'us',
  'said',
  'rally',
  'vermont',
  'senator',
  'council',
  'bluffs',
  'watch',
  'presidential',
  'race',
  'movie',
  'movement',
  'yet',
  'certain',
  'cinematic',
  'quality',
  'last',
  'six',
  'weeks',
  'sanders',
  'second',
  'democratic',
  'presidential',
  'campaign',
  'nearly',
  'sidelined',
  'worse',
  'heart',
  'attack',
  'las',
  'vegas',
  'first',
  'night',
  'october',
  'sanders',
  'charted',
  'remarkable',
  'revival',
  'powered',
  'run',
  'invigorating',
  'endorsements',
  'new',
  'poll',
  'results',
  'showed',
  'gaining',
  'steam',
  'new',
  'hampshire',
  'iowa',
  'sense',
  'fueled',
  'part',
  'massive',
  'crowds',
  'welcomed',
  'recent',
  'rallies',
  'n

<IPython.core.display.Javascript object>

In [76]:
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
all_stemmed = []
for all_words in no_puncts:
    stemmed = [stemmer.stem(lemmatizer.lemmatize(word)) for word in all_words]
    all_stemmed.append(stemmed)
    


<IPython.core.display.Javascript object>

In [77]:
docs = []
for stemmed in all_stemmed:
    docs.append(" ".join(stemmed))

<IPython.core.display.Javascript object>

In [78]:
docs

['de moin iowa cnn alexandria cap debut iowa hust support berni sander last week blunt call action someth allow happen u let race happen u said ralli vermont senat council bluff watch presidenti race movi movement yet certain cinemat qualiti last six week sander second democrat presidenti campaign near sidelin wors heart attack la vega first night octob sander chart remark reviv power run invigor endors new poll result show gain steam new hampshir iowa sens fuel part massiv crowd welcom recent ralli new york minnesota polit revolut tri summer back march also sander joke follow saturday climat summit de moin stent thank got three arteri work right pretti good deadpan practic jumper basketbal court drake univers better one block arteri feel realli good support staff say much back along fellow squad member ilhan omar rashida tlaib news broke recent debat last month ohio bolster argument sander make month campaign steadili attract racial divers young work class coalit readi make unapologet

<IPython.core.display.Javascript object>

### Count vectorize the preprocessed documents.

In [79]:
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(docs)

count = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())

count.head()

Unnamed: 0,aaron,aback,abc,abet,abil,abl,abroad,absolut,absorb,absorpt,...,yudkin,yuriy,zaia,zaid,zakaria,zanoni,zelenski,zero,zone,édouard
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

### One hot vectorize the preprocessed documents.

In [80]:
vectorizer = CountVectorizer(binary=True)
vectors = vectorizer.fit_transform(docs)

one_hot = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())

one_hot.head()

Unnamed: 0,aaron,aback,abc,abet,abil,abl,abroad,absolut,absorb,absorpt,...,yudkin,yuriy,zaia,zaid,zakaria,zanoni,zelenski,zero,zone,édouard
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


<IPython.core.display.Javascript object>

### TF-IDF vectorize the preprocessed documents.

In [81]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(docs)

tfidf = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names())

tfidf.head()

Unnamed: 0,aaron,aback,abc,abet,abil,abl,abroad,absolut,absorb,absorpt,...,yudkin,yuriy,zaia,zaid,zakaria,zanoni,zelenski,zero,zone,édouard
0,0.0,0.0,0.0,0.0,0.023331,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.017877,0.0,0.0,0.0,0.01172,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035754,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028956,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.029829,0.0,0.0,0.0,0.045498,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<IPython.core.display.Javascript object>

### Use Doc2Vec to vectorize the preprocessed documents.

Set the size of the vectors to be the same size as those of the other methods using the `vector_size` argument.

In [82]:
documents = [TaggedDocument(doc, [i]) 
             for i, doc in enumerate(docs)]

model = Doc2Vec(documents, vector_size=one_hot.shape[1])

# model.vector_size=one_hot.shape[1]

doc2vec = pd.DataFrame([[document]+list(model[document]) 
                        for document in range(len(docs))]).drop(0, axis=1)




<IPython.core.display.Javascript object>

In [85]:
one_hot.shape

(57, 4654)

<IPython.core.display.Javascript object>

In [83]:
doc2vec.shape

(57, 4654)

<IPython.core.display.Javascript object>

In [84]:
doc2vec.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,4645,4646,4647,4648,4649,4650,4651,4652,4653,4654
0,0.009167,0.001257,0.013268,-0.02084,-0.022429,-0.008614,0.009971,-0.013197,0.003911,0.008711,...,0.018538,-0.005177,-0.009473,0.011369,-0.00799,-0.003537,-0.018905,-0.00764,-0.007621,-0.001164
1,0.008053,0.000932,0.011,-0.016864,-0.019778,-0.007767,0.008279,-0.011168,0.003203,0.007693,...,0.01795,-0.004846,-0.008694,0.010423,-0.007977,-0.003161,-0.018741,-0.007585,-0.007468,-0.00135
2,0.01006,0.001283,0.014496,-0.022617,-0.024807,-0.009636,0.010672,-0.01458,0.004419,0.009715,...,0.017142,-0.004742,-0.008729,0.01036,-0.007459,-0.003369,-0.017599,-0.006936,-0.007044,-0.001003
3,0.009408,0.001281,0.013939,-0.021493,-0.024311,-0.009483,0.010265,-0.013843,0.003905,0.009199,...,0.020554,-0.005769,-0.010375,0.012335,-0.009066,-0.003933,-0.021291,-0.008533,-0.008507,-0.001569
4,0.009797,0.001195,0.013644,-0.020975,-0.024502,-0.009701,0.010283,-0.014204,0.004353,0.009522,...,0.021416,-0.005923,-0.010395,0.012377,-0.009536,-0.004003,-0.022091,-0.008892,-0.008968,-0.001597


<IPython.core.display.Javascript object>