### Importing Necessary Libraries



In [76]:
import spacy
from spacy import displacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from pickle import dump
import string

## Loading SpaCy's small english model

To get more details regarding SpaCy models check here : https://spacy.io/usage/models

In [77]:
# Loading Spacy small model as nlp
nlp = spacy.load("en_core_web_sm")

## Gathering all the Stop words which does not convey much meaning in the Sentiment

In [78]:
# Gathering all the stopwords
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
print(len(stopwords))

326


In [79]:
# Loading yelp dataset
data_yelp = pd.read_csv('dataset/train_data.csv', header=None)
#data_yelp = pd.read_csv('dataset/all-data.csv',delimiter=',', encoding='latin-1', header=None)
data_yelp = data_yelp.rename(columns=lambda x: ['Sentiment', 'Sentence'][x])
data_yelp.head()
data_yelp['Sentiment']

0     VERY_POSITIVE
1     VERY_NEGATIVE
2     VERY_NEGATIVE
3          POSITIVE
4          POSITIVE
5          NEGATIVE
6          POSITIVE
7          NEGATIVE
8          POSITIVE
9     VERY_NEGATIVE
10    VERY_NEGATIVE
11    VERY_NEGATIVE
12         POSITIVE
13         POSITIVE
14         NEGATIVE
15    VERY_NEGATIVE
16    VERY_NEGATIVE
17    VERY_NEGATIVE
18    VERY_NEGATIVE
19         NEGATIVE
20         NEGATIVE
21    VERY_NEGATIVE
Name: Sentiment, dtype: object

In [80]:
#data_yelp['Sentiment'] = data_yelp['Sentiment'].map({'neutral':0,'positive':1,'negative':-1})
data_yelp['Sentiment'] = data_yelp['Sentiment'].map({'VERY_POSITIVE':0,'POSITIVE':1,'NEGATIVE':2,'VERY_NEGATIVE':3})

In [81]:
# data_yelp_small['Sentiment'] = [1,-1,-1,0,0,1,1,-1,0,-1]

In [82]:
# data_yelp_small.head()

In [83]:
data = data_yelp[['Sentence','Sentiment']]

In [84]:
data.head()

Unnamed: 0,Sentence,Sentiment
0,The Federal Reserve's decision to raise intere...,0
1,Many investors are optimistic about the future...,3
2,The Bank of England's decision to keep interes...,3
3,The European Central Bank's recent interest ra...,1
4,The Reserve Bank of Australia's decision to lo...,1


In [85]:
# # Adding column names to the dataframe
# columnName = ['Review','Sentiment']
# data_yelp.columns = columnName
# data_yelp.head()

## So here we can deduce that Sentiment 1 is Positive and 0 is negative

In [86]:
#print(data_yelp_small.shape)

In [87]:
# # Adding Amazon dataset and adding its column name
# data_amz = pd.read_csv("dataset/amazon_cells_labelled.txt",
#                         sep='\t', header= None)
# data_amz.columns = columnName
# data_amz.head()

In [88]:
# print(data_amz.shape)

In [89]:
# # Adding IMdB dataset and adding its column name
# data_imdb = pd.read_csv("dataset/imdb_labelled.txt",
#                         sep='\t', header= None)
# data_imdb.columns = columnName
# data_imdb.head()

In [90]:
# print(data_imdb.shape)

## Appending all the Datasets

In [91]:
# # Merging all the three dataframes
# data = data_yelp.append([data_amz, data_imdb], ignore_index=True)
# print(data.shape)

In [92]:
# Sentiment ditribution in the dataset
data.Sentiment.value_counts()

3    10
1     6
2     5
0     1
Name: Sentiment, dtype: int64

In [93]:
# Getting information regarding the null entries in the dataset
data.isnull().sum()

Sentence     0
Sentiment    0
dtype: int64

In [94]:
punct = string.punctuation
print(punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~




```
Here in the reviews we will find many stop words which do not add any meaning to the review.
Also punctuations will be encountered in the review which which will be considered as a seperate token by our model
So removing all the stop words and punctuation so that our model can train efficiently
```



In [95]:
def dataCleaning(sentence):
    doc = nlp(sentence)
    tokens = []
    for token in doc:
        if token.lemma_ != '-PRON-':
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    clean_tokens = []
    for token in tokens:
        if token not in punct and token not in stopwords:
            clean_tokens.append(token)
    return clean_tokens

## Here after passing a particular sentence in dataCleaning method we are returned with relevant words which contribute to the sentiments

In [96]:
dataCleaning("Today we are having heavy rainfall, We recommend you to stay at your home and be safe, Do not start running here and there")
# All the useful words are returned, no punctuations no stop words and in the lemmatized form

['today',
 'heavy',
 'rainfall',
 'recommend',
 'stay',
 'home',
 'safe',
 'start',
 'run']

In [97]:
# Spillting the train and test data
X = data['Sentence']
y = data['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
print(X_train.shape,y_test.shape)

(19,) (3,)


## Preparing Model

In [98]:
# Creating the model and pipeline
tfidf = TfidfVectorizer(tokenizer = dataCleaning)
svm = RandomForestClassifier()
steps = [('tfidf',tfidf),('svm',svm)]
pipe = Pipeline(steps)

In [99]:
# Training the model
pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function dataCleaning at 0x000001601D4BEEE0>)),
                ('svm', RandomForestClassifier())])

In [100]:
# Testing on the test dataset
y_pred = pipe.predict(X_test)
y_pred

array([3, 2, 3], dtype=int64)

In [101]:
dump(pipe,open('model.pkl','wb'))

In [102]:
from pickle import load

In [103]:
X_test

19                      The collapse of Lehman Brothers
21    The Lehman Brothers bankruptcy was a black mar...
7     The recent decision by the Federal Reserve to ...
Name: Sentence, dtype: object

In [104]:
model = load(open('model.pkl','rb'))

In [105]:
# y_test
y_pred = model.predict(X_test)
y_pred

array([3, 2, 3], dtype=int64)

In [106]:
y_pred

array([3, 2, 3], dtype=int64)

In [107]:
y_test

19    2
21    3
7     2
Name: Sentiment, dtype: int64

In [108]:
accuracy_score(y_test, y_pred)

0.0

In [109]:
# # Printing the classification report and the confusion matrix
# print(classification_report(y_test,y_pred))
# print("\n\n")
# print(confusion_matrix(y_test,y_pred))

## Testing on the Random Manual Examples

**Here '1' represent that the input is positive sentiment**

In [115]:
# Testing on random inputs
int(pipe.predict(["SVB crisis: regret and blame in Silicon Valley after bank run"]))

3

In [111]:
pipe.predict_proba(["Wow you are an amazing person"])

array([[0.02, 0.16, 0.14, 0.68]])

**Here '0' represent that input is negative sentiment**

In [112]:
pipe.predict(["you suck"])

array([3], dtype=int64)

### Footnotes
https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76

https://towardsdatascience.com/a-simple-example-of-pipeline-in-machine-learning-with-scikit-learn-e726ffbb6976


In [113]:
def get_sentiment(pipe,sentence):
    prob_list = pipe.predict_proba([sentence]).flatten()
    sent = {}
    very_pos_prob = prob_list[0]
    pos_prob = prob_list[1]
    neg_prob = prob_list[2]
    very_neg_prob = prob_list[3]
    sum_pos_prob = very_pos_prob + pos_prob
    sum_neg_prob = very_neg_prob + neg_prob
    if sum_pos_prob > sum_neg_prob:
        sent['Confidence'] = sum_pos_prob
        if very_pos_prob > pos_prob:
            sent['Sentiment'] = 'VERY POSITIVE'
        else:
            sent['Sentiment'] = 'POSITIVE'
    elif sum_pos_prob < sum_neg_prob:
        sent['Confidence'] = sum_neg_prob
        if very_neg_prob > neg_prob:
            sent['Sentiment'] = 'VERY NEGATIVE'
        else:
            sent['Sentiment'] = 'NEGATIVE'
    else:
        sent['Sentiment'] = 'NEUTRAL'
        
    return sent

In [114]:
get_sentiment(pipe,"Wow you are an amazing person")

{'Confidence': 0.8200000000000001, 'Sentiment': 'VERY NEGATIVE'}