Let's create the same data we used in the class.

In [1]:
#Upgrade dependencies
!pip install --upgrade pip
!pip install --upgrade nltk

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/tensorflow2_p36/lib/python3.6/site-packages (20.2.2)
Processing /home/ec2-user/.cache/pip/wheels/de/5e/42/64abaeca668161c3e2cecc24f864a8fc421e3d07a104fc8a51/nltk-3.5-py3-none-any.whl
Collecting regex
  Using cached regex-2020.7.14-cp36-cp36m-manylinux2010_x86_64.whl (660 kB)
Installing collected packages: regex, nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.4.5
    Uninstalling nltk-3.4.5:
      Successfully uninstalled nltk-3.4.5
Successfully installed nltk-3.5 regex-2020.7.14


In [2]:
texts = ["it was a clean game",
        "oil companies lost over 25 millions yesterday",
        "he scored three goals",
        "their 3 game winning streak ended yesterday",
        "The stock market started the day with profits"
]
labels = ["not_finance", "finance", "not_finance", "not_finance", "finance"]

Let's encode our labels. finance -> 0 and not_finance -> 1

In [3]:
for idx, item in enumerate(labels):
    if item == "finance":
        labels[idx] = 0
    else:
        labels[idx] = 1

## 1-Preprocess our text data

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

nltk.download('punkt')
nltk.download('stopwords')

snow = SnowballStemmer('english') #initialising the snowball stemmer
stop = stopwords.words('english')

def process_texts(texts):
    sentence_list = []
    for sentence in texts:
        item_list = []
        for item in sentence.split():
            if len(item)>2 and (item not in set(stop)):
                item_list.append(snow.stem(item))
        sentence_list.append(" ".join(item_list))
    return sentence_list

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
print(texts)
print(process_texts(texts))

['it was a clean game', 'oil companies lost over 25 millions yesterday', 'he scored three goals', 'their 3 game winning streak ended yesterday', 'The stock market started the day with profits']
['clean game', 'oil compani lost million yesterday', 'score three goal', 'game win streak end yesterday', 'the stock market start day profit']


## 2-Calculating Count Vectors

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

count_vectorizer = CountVectorizer()
features = count_vectorizer.fit_transform(process_texts(texts))

df = pd.DataFrame(features.toarray(), columns=count_vectorizer.get_feature_names())

print(process_texts(texts))
print(df)
print(count_vectorizer.get_feature_names())

['clean game', 'oil compani lost million yesterday', 'score three goal', 'game win streak end yesterday', 'the stock market start day profit']
   clean  compani  day  end  game  goal  lost  market  million  oil  profit  \
0      1        0    0    0     1     0     0       0        0    0       0   
1      0        1    0    0     0     0     1       0        1    1       0   
2      0        0    0    0     0     1     0       0        0    0       0   
3      0        0    0    1     1     0     0       0        0    0       0   
4      0        0    1    0     0     0     0       1        0    0       1   

   score  start  stock  streak  the  three  win  yesterday  
0      0      0      0       0    0      0    0          0  
1      0      0      0       0    0      0    0          1  
2      1      0      0       0    0      1    0          0  
3      0      0      0       1    0      0    1          1  
4      0      1      1       0    1      0    0          0  
['clean', 'compa

## 3-Fitting the Classifier

In [8]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(features, labels)

## 4-Testing

In [9]:
test_sentences = ["learn stock market playing this game"]

print("*Pre-processing")
print(process_texts(test_sentences))

print("*Vectorizing: Pay attention below, we have three non-zero terms. They correspond to our vocabulary words: stock, market and game")
test_features = count_vectorizer.transform(process_texts(test_sentences))
print(test_features.toarray())

print("*Predicting")
print("Class:", clf.predict(test_features))
print("Probabilities:", clf.predict_proba(test_features))

*Pre-processing
['learn stock market play game']
*Vectorizing: Pay attention below, we have three non-zero terms. They correspond to our vocabulary words: stock, market and game
[[0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0]]
*Predicting
Class: [1]
Probabilities: [[0.44534731 0.55465269]]


We predicted "non_finance". 