In [2]:
# ! pip install spacy

# Why Spacy?
- NLTK is used to clean the data and preprocess it. 
- Spacy is another tool in our arsenal that is designed for fast and practical work, to avoid wasting time on NLP projects

In [5]:
# download the English language model
# ! python -m spacy download en_core_web_sm

In [26]:
import spacy
import pandas as pd

In [8]:
nlp = spacy.load("en_core_web_sm")

# Pre Processing

## 1. Tokenization

In [72]:
dummy_text = 'When Michelangelo was asked how he could sculpt a work of art as masterful as his \
David, he is famously reported to have said, “It is easy. You just chip away the stone \
that doesn’t look like David."'

In [73]:
# Change to spaCy document with linguistic annotations
my_doc = nlp(dummy_text)
my_doc

When Michelangelo was asked how he could sculpt a work of art as masterful as his David, he is famously reported to have said, “It is easy. You just chip away the stone that doesn’t look like David."

In [74]:
print(type(my_doc))

<class 'spacy.tokens.doc.Doc'>


In [75]:
# Tokenize the dataset
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab) # word tokenizer

In [76]:
tokens = tokenizer(my_doc.text)
list(tokens)[:7]

[When, Michelangelo, was, asked, how, he, could]

## 2. Removing Stop Words

In [77]:
# import stop_words
from spacy.lang.en.stop_words import STOP_WORDS

In [78]:
list(STOP_WORDS)[:10]

['their',
 'beyond',
 "'ve",
 'a',
 "'ll",
 'under',
 'yourselves',
 'at',
 'any',
 'how']

In [79]:
filtered_sent = []
found_stop_words = []
for word in my_doc:
    if word.is_stop: # helpful attribute
        found_stop_words.append(word)
    else:
        filtered_sent.append(word)

In [80]:
from itertools import zip_longest
data = list(zip_longest(filtered_sent, found_stop_words, fillvalue=None))
pd.DataFrame(data, columns=["Filtered words", "Found Stop Words"])

Unnamed: 0,Filtered words,Found Stop Words
0,Michelangelo,When
1,asked,was
2,sculpt,how
3,work,he
4,art,could
5,masterful,a
6,David,of
7,",",as
8,famously,as
9,reported,his


## 3. Remove Punctuation, by looking at part of speech `pos_`==`PUNCT`

In [81]:
len(my_doc)

45

In [82]:
my_words = [word for word in my_doc if word.pos_ != "PUNCT"]
len(my_words)

39

## 4. Lemmatization

In [83]:
# .lemma_ to reduce a word to its lemma form
my_words = [{"original": o, "lemma": o.lemma_} for o in my_words]
pd.DataFrame(my_words)

Unnamed: 0,original,lemma
0,When,when
1,Michelangelo,Michelangelo
2,was,be
3,asked,ask
4,how,how
5,he,he
6,could,could
7,sculpt,sculpt
8,a,a
9,work,work


- pronouns like you won't be reduced to any form so they'll just return the same word

## Create a Preprocess function

In [84]:
import string  # for punctuations
from spacy.lang.en.stop_words import STOP_WORDS  # removing stop words

punctuations = string.punctuation

nlp = spacy.load("en_core_web_sm")  # english model for spacy
stop_words = spacy.lang.en.stop_words.STOP_WORDS  # stop words

In [112]:
def preprocess(text):
    # removing the digits
    nonum = "".join([i for i in text if not i.isdigit()])

    # Creating our token object, which is used to create documents with
    # linguistic annotations
    tokenized_list = nlp(nonum)

    #  remove stop words
    tokenized_list = [word for word in tokenized_list
        if not word.is_stop
        # if word not in stop_words
    ]

    # Lemmatizing each token and converting each token into lowercase
    # if word.lemma_ != "-PRON" this doesn't happen in new spaCy
    tokenized_list = [
        word.lemma_.lower().strip()
        for word in tokenized_list
        # if word.pos_ != "PUNCT" # remove punctuation
    ]
    
    tokenized_list = [word for word in tokenized_list if word not in punctuations]

    # return a preprocessed list of token
    return tokenized_list

In [113]:
preprocess("Hello1 ! I am a good boy")

['hello', 'good', 'boy']

# Transform text to ML usuable format (i.e numbers)
> After we are done with preprocessing, nect step is going to be generating a sparse matric using CountVectorizer and then TF-IDF using sklearn

`from sklearn.feature_extraction.text import TfidfTransformer`
- this only converts a sparse matrix to its tf-idf format.
- we did this in the basics

In [116]:
# This one has CountVectorizer included
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [117]:
classifier = MultinomialNB()
tfidf_vector = TfidfVectorizer(tokenizer=preprocess)

In [118]:
# Create Pipeline
pipe = Pipeline([
    ("tfidf", tfidf_vector),
    ("classifier", classifier)
])

# Model Training

In [119]:
import numpy as np
import pandas as pd

In [120]:
# load the data
train_data = pd.read_csv("train.tsv", delimiter="\t", index_col="PhraseId")

In [122]:
# model generation
pipe.fit(train_data["Phrase"], train_data["Sentiment"])



In [123]:
# check the accuracy of the data
pipe.score(train_data["Phrase"], train_data["Sentiment"])

0.623119313084711

In [124]:
# evaluate further with classification_report
from sklearn.metrics import classification_report

pred = pipe.predict(train_data["Phrase"])
print(classification_report(y_true=train_data["Sentiment"], y_pred=pred))

              precision    recall  f1-score   support

           0       0.77      0.08      0.14      7072
           1       0.61      0.31      0.41     27273
           2       0.63      0.92      0.75     79582
           3       0.58      0.42      0.49     32927
           4       0.76      0.09      0.15      9206

    accuracy                           0.62    156060
   macro avg       0.67      0.36      0.39    156060
weighted avg       0.63      0.62      0.57    156060

