**spaCy** is an open-source software library for advanced NLP written in Python and Cython

In [19]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en')
stopwords = list(STOP_WORDS)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
# using linear support vector component 
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

In [5]:
import string
punctuations = string.punctuation

In [20]:
# Creating a spaCy parser
from spacy.lang.en import English
parser = English()

In [17]:
import os
import pandas as pd

In [18]:
# reading the sentiment140 data
project_path = '/content/drive/MyDrive/dataset/'
train_data_path = os.path.join(project_path, 'traindata_clean.csv')

df = pd.read_csv(train_data_path, encoding='latin')

df.head()

Unnamed: 0,tweet,target
0,is upset that he can't update his facebook by ...,0
1,entity i dived many times for the ball. manage...,0
2,my whole body feels itchy and like its on fire,0
3,"entity no, it's not behaving at all. i'm mad. ...",0
4,entity not the whole crew,0


Tokenizer

In [7]:

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

Custom transformer using spaCy 

In [8]:

class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

Vectorization

In [9]:
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)) 
classifier = LinearSVC()

Using TF-IDF

In [10]:

tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [21]:
X = df['tweet']
Y = df['target']

Splitting the train and test data

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

Creating the  pipeline to clean, tokenize, vectorize, and classify 

In [23]:
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

Fitting the data

In [25]:
pipe.fit(X_train,Y_train)



Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7fa8cc926cd0>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x7fa8ba4af830>,
                                 vocabulary=None)),
                ('classifier',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                

Predicting with a test dataset

In [26]:
sample_prediction = pipe.predict(X_test)

Prediction Results \\
4 = Positive review \\
0 = Negative review

In [27]:
for (sample,pred) in zip(X_test,sample_prediction):
    print(sample,"Prediction=>",pred)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
baby, please come i don't want to go but my sister wants to. Prediction=> 0
i love it how when dad walks into my room and just pulls a funny face and huffs and puffs coz its a mess and i just sit here lauging Prediction=> 4
entity thank you Prediction=> 4
entity um, yeah.  i'll reply it just in case you missed it.  pi*radius squared*height. Prediction=> 0
entity entity thank you kindly! Prediction=> 4
my daughter just started her first pokemon tournament. Prediction=> 4
entity you're too kind... and good morning Prediction=> 4
morning all! ready to get a bunch of work cranked out today.  off to a productive morning thus far. how 'bout you? Prediction=> 0
s.o.s please some1 help me itz not healthy i have insomnia but i will beat it tonight will be going back to sleep at 1 am Prediction=> 0
just woke up from a nap, am starving but not in the mood to cook. Prediction=> 0
entity sorry sweetie it is monday all day... Predictio

Accuracy

In [37]:
print("Accuracy: ", pipe.score(X_test, Y_test))
print("Accuracy: ", pipe.score(X_test, sample_prediction))
print("Accuracy: ", pipe.score(X_train, Y_train))

Accuracy:  0.7696496096060763
Accuracy:  0.9407660153020038
Accuracy:  0.8132224409239727


Create the  pipeline to clean, tokenize, vectorize, and classify using TF-IDF

In [32]:
pipe_tfid = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer),
                 ('classifier', classifier)])

In [33]:
pipe_tfid.fit(X_train, Y_train)
prediction = pipe_tfid.predict(X_test)

In [34]:
for (sample, pred) in zip(X_test, prediction):
    print(sample, "Prediction =>", pred)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
baby, please come i don't want to go but my sister wants to. Prediction => 0
i love it how when dad walks into my room and just pulls a funny face and huffs and puffs coz its a mess and i just sit here lauging Prediction => 4
entity thank you Prediction => 4
entity um, yeah.  i'll reply it just in case you missed it.  pi*radius squared*height. Prediction => 0
entity entity thank you kindly! Prediction => 4
my daughter just started her first pokemon tournament. Prediction => 4
entity you're too kind... and good morning Prediction => 4
morning all! ready to get a bunch of work cranked out today.  off to a productive morning thus far. how 'bout you? Prediction => 0
s.o.s please some1 help me itz not healthy i have insomnia but i will beat it tonight will be going back to sleep at 1 am Prediction => 0
just woke up from a nap, am starving but not in the mood to cook. Prediction => 0
entity sorry sweetie it is monday all day...

Accuracy

In [36]:
print("Accuracy: ", pipe_tfid.score(X_test, Y_test))
print("Accuracy: ", pipe_tfid.score(X_test, prediction))
print("Accuracy: ", pipe_tfid.score(X_train, Y_train))

Accuracy:  0.7637144453922405
Accuracy:  1.0
Accuracy:  0.8286298839192117


### Conclusion 
SVC does not give the best result when used with TF-IDF \
However, TF-IDF does give better result over the usual tokenizer \
*TODO: Try another ML algorithm*