# BAG OF WORDS STEPS

## Source: Sentdex
### Author: Himmet 

In [None]:
# This model focuses completely on the words, or sometimes a string of words, 
# but usually pays no attention to the "context" so-to-speak. The bag of words model usually has a large list, 
# probably better thought of as a sort of "dictionary," which are considered to be words that carry sentiment. 
# These words each have their own "value" when found in text. The values are typically all added up and the result is a 
# sentiment valuation. The equation to add and derive a number can vary, but this model mainly focuses on the words, 
# and makes no attempt to actually understand language fundamentals.

# source: http://sentdex.com/sentiment-analysis/

### Lets Start:

In [87]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# TOKENIZE

In [None]:
# Tokenize = seperate text (ex: words or sentences)
# Corpera = body of text (ex: twtter, news, etc.)
# Lexicon = words and their means (ex: emotion, sentiment)

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [9]:
# Source crowdflower_tweets.xlxs
ex_text = "@BrodyJenner if u watch the hills in london u will realise what tourture it is because were weeks and weeks late i just watch it online lol. @GABBYiSACTiVE Aw you would not unfollow me would you? Then I would cry"

In [10]:
ex_text

'@BrodyJenner if u watch the hills in london u will realise what tourture it is because were weeks and weeks late i just watch it online lol. @GABBYiSACTiVE Aw you would not unfollow me would you? Then I would cry'

In [11]:
print(sent_tokenize(ex_text))

['@BrodyJenner if u watch the hills in london u will realise what tourture it is because were weeks and weeks late i just watch it online lol.', '@GABBYiSACTiVE Aw you would not unfollow me would you?', 'Then I would cry']


In [12]:
print(word_tokenize(ex_text))

['@', 'BrodyJenner', 'if', 'u', 'watch', 'the', 'hills', 'in', 'london', 'u', 'will', 'realise', 'what', 'tourture', 'it', 'is', 'because', 'were', 'weeks', 'and', 'weeks', 'late', 'i', 'just', 'watch', 'it', 'online', 'lol', '.', '@', 'GABBYiSACTiVE', 'Aw', 'you', 'would', 'not', 'unfollow', 'me', 'would', 'you', '?', 'Then', 'I', 'would', 'cry']


In [17]:
for i in sent_tokenize(ex_text):
    print(i)

@BrodyJenner if u watch the hills in london u will realise what tourture it is because were weeks and weeks late i just watch it online lol.
@GABBYiSACTiVE Aw you would not unfollow me would you?
Then I would cry


# STOP WORDS

In [None]:
# One of the first steps to pre-processing is to utilize stop-words. 
# Stop words are words that you want to filter out of any analysis. 
# These are words that carry no meaning, or carry conflicting meanings that you simply do not want to deal with. 

In [19]:
from nltk.corpus import stopwords

In [27]:
stop_words = set(stopwords.words("english"))

In [29]:
words = word_tokenize(ex_text)

In [30]:
filtered_sentence = []

In [31]:
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)

In [38]:
# Or in one line:
filtered_sentence = [w for w in words if not w in stop_words]

In [39]:
print(filtered_sentence)

['@', 'BrodyJenner', 'watch', 'hills', 'london', 'realise', 'tourture', 'weeks', 'weeks', 'late', 'watch', 'online', 'lol', '.', '@', 'GABBYiSACTiVE', 'Aw', 'would', 'unfollow', 'would', '?', 'Then', 'I', 'would', 'cry']


In [40]:
for i in filtered_sentence:
    print(i)

@
BrodyJenner
watch
hills
london
realise
tourture
weeks
weeks
late
watch
online
lol
.
@
GABBYiSACTiVE
Aw
would
unfollow
would
?
Then
I
would
cry


# STEMMING

In [None]:
# This is the process where we remove word affixes from the end of words. 
# The reason we would do this is so that we do not need to store the meaning of every single tense of a word. 
# They all have the same meaning for their "root" stem (read)

In [50]:
from nltk.stem import PorterStemmer

In [53]:
ps = PorterStemmer()

In [43]:
ex_words = ["python","pythoner","pythoning","pythoned","pythonly"]

In [58]:
for i in ex_words:
    print(ps.stem(i))

python
python
python
python
pythonli


In [59]:
words = word_tokenize(ex_text)

In [65]:
print(ps.stem(ex_text))

@brodyjenner if u watch the hills in london u will realise what tourture it is because were weeks and weeks late i just watch it online lol. @gabbyisactive aw you would not unfollow me would you? then i would cri


# PART OF SPEECH TAGGING

In [None]:
# Part of Speech (Pos) tagging does exactly what it sounds like, 
# it tags each word in a sentence with the part of speech for that word. 
# This means it labels words as noun, adjective, verb, etc. 
# PoS tagging also covers tenses of the parts of speech. 

In [68]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [82]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [83]:
c_sent_tokenizer = PunktSentenceTokenizer(ex_text)

In [84]:
tokenized = c_sent_tokenizer.tokenize(ex_text)

In [85]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
            
    except Exception as e:
        print(str(e))
            
            

In [86]:
process_content()

[('@', 'NN'), ('BrodyJenner', 'NNP'), ('if', 'IN'), ('u', 'JJ'), ('watch', 'VBP'), ('the', 'DT'), ('hills', 'NNS'), ('in', 'IN'), ('london', 'JJ'), ('u', 'NN'), ('will', 'MD'), ('realise', 'VB'), ('what', 'WP'), ('tourture', 'NN'), ('it', 'PRP'), ('is', 'VBZ'), ('because', 'IN'), ('were', 'VBD'), ('weeks', 'NNS'), ('and', 'CC'), ('weeks', 'NNS'), ('late', 'JJ'), ('i', 'NN'), ('just', 'RB'), ('watch', 'VB'), ('it', 'PRP'), ('online', 'JJ'), ('lol', 'NN'), ('.', '.'), ('@', 'JJ'), ('GABBYiSACTiVE', 'NNP'), ('Aw', 'NNP'), ('you', 'PRP'), ('would', 'MD'), ('not', 'RB'), ('unfollow', 'VB'), ('me', 'PRP'), ('would', 'MD'), ('you', 'PRP'), ('?', '.')]
[('Then', 'RB'), ('I', 'PRP'), ('would', 'MD'), ('cry', 'VB')]


# CHUNKING

In [None]:
# Chunking in Natural Language Processing (NLP) is the process by which we group various words together by their PoS tags. 
# One of the most popular uses of this is to group things by what are called "noun phrases." 
# We do this to find the main subjects and descriptive words around them, 
# but chunking can be used for any combination of parts of speech.