## Algorithm to tell the Apple company and apple the fruit

In [1]:
%matplotlib inline
import matplotlib
import seaborn as sns
sns.set()
matplotlib.rcParams['figure.dpi'] = 144

In [15]:
import spacy
nlp=spacy.load('en_core_web_sm')

## 1. Get the data

In [17]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

#function to clean the text
def wikipedia_to_sents(url):
    soup=BeautifulSoup(urlopen(url))
    #drop the references []
    def drop_refs(s):
        return ''.join(re.split('\[\d+\]', s))
    paragraphs=[drop_refs(p.text) for p in soup.find_all('p')]
    return [s.text for paragraph in paragraphs for s in nlp(paragraph).sents if len(s)>2]

In [18]:
fruit_sents=wikipedia_to_sents("http://en.wikipedia.org/wiki/Apple")
company_sents=wikipedia_to_sents("http://en.wikipedia.org/wiki/Apple_Inc.")

In [21]:
company_sents[5:10]

["Apple's software includes the macOS, iOS, iPadOS, watchOS, and tvOS operating systems, the iTunes media player, the Safari web browser, the Shazam acoustic fingerprint utility, and the iLife and iWork creativity and productivity suites, as well as professional applications like Final Cut Pro, Logic Pro, and Xcode.",
 'Its online services include the iTunes Store, the iOS App Store, Mac App Store, Apple Music, Apple TV+, iMessage, and iCloud.',
 'Other services include Apple Store, Genius Bar, AppleCare, Apple Pay, Apple Pay Cash, and Apple Card.\n',
 "Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976 to develop and sell Wozniak's Apple",
 'I personal computer, though Wayne sold his share back within 12 days.']

In [22]:
fruit_sents[5:10]

['Apple trees are large if grown from seed.  ',
 'Generally, apple cultivars are propagated by grafting onto rootstocks, which control the size of the resulting tree.',
 'There are more than 7,500 known cultivars of apples, resulting in a range of desired characteristics.',
 'Different cultivars are bred for various tastes and use, including cooking, eating raw and cider production.',
 'Trees and fruit are prone to a number of fungal, bacterial and pest problems, which can be controlled by a number of organic and non-organic means.']

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

bag_of_words_vectorizer=CountVectorizer()
counts=bag_of_words_vectorizer.fit_transform(fruit_sents+company_sents)
#row is the number of documents, column is the number of words, each value inside the matrix is the frequency
#of that word in that document
print(counts.shape)

(1063, 4691)


In [24]:
#count is a sparse matrix
print(counts.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [25]:
#show the non-zero values
print(counts)

  (0, 1379)	1
  (0, 2612)	1
  (0, 4358)	1
  (0, 737)	1
  (0, 3292)	1
  (0, 1837)	1
  (0, 1471)	1
  (0, 2335)	1
  (0, 441)	2
  (0, 392)	3
  (1, 1899)	1
  (1, 2163)	1
  (1, 3975)	1
  (1, 1982)	1
  (1, 4600)	1
  (1, 2760)	1
  (1, 4247)	2
  (1, 403)	1
  (1, 4640)	1
  (1, 1171)	1
  (1, 466)	2
  (1, 4359)	1
  (1, 2612)	1
  (1, 441)	1
  (2, 4302)	1
  :	:
  (1061, 441)	1
  (1062, 1457)	1
  (1062, 3575)	1
  (1062, 990)	1
  (1062, 223)	1
  (1062, 3498)	1
  (1062, 2190)	1
  (1062, 1342)	1
  (1062, 93)	1
  (1062, 2908)	1
  (1062, 86)	1
  (1062, 235)	1
  (1062, 308)	1
  (1062, 1760)	1
  (1062, 1827)	1
  (1062, 619)	1
  (1062, 731)	1
  (1062, 98)	1
  (1062, 4301)	1
  (1062, 4582)	1
  (1062, 2163)	1
  (1062, 4247)	2
  (1062, 403)	1
  (1062, 737)	1
  (1062, 392)	1


Another similar method is HashingVectorizer, which uses hashing map to store counts but the feature names. Faster but if you need feature names for futher use, have to use CountVectorizer.

TfidfVectorizer is better than CountVectorizer and HashingVectorizer. The last two just counts the frequency of words. But we would like to find words that are common in one document, but not common in all of them. That is the features we want.

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS

ng_tfidf=TfidfVectorizer(max_features=300, ngram_range=(1,2), 
                        stop_words=STOP_WORDS.union({'apple', 'apples','Apple'}))
ng_tfidf=ng_tfidf.fit(fruit_sents+company_sents)
print(ng_tfidf.get_feature_names()[100:105])
print(ng_tfidf.transform(fruit_sents+company_sents))

  'stop_words.' % sorted(inconsistent))


['future', 'gb', 'generally', 'generation', 'glass']
  (0, 274)	0.47762218446685034
  (0, 205)	0.5486393384553372
  (0, 165)	0.538625607528242
  (0, 99)	0.4251521846577247
  (1, 296)	0.4741198372432903
  (1, 275)	0.46428534408552163
  (1, 165)	0.5289846402305953
  (1, 108)	0.5289846402305953
  (2, 291)	0.427962505055314
  (2, 274)	0.3725660414907787
  (2, 242)	0.42015136015167814
  (2, 165)	0.42015136015167814
  (2, 96)	0.3853065567606607
  (2, 50)	0.42015136015167814
  (3, 299)	0.34810201458503676
  (3, 179)	0.4056929800488752
  (3, 108)	0.4253863131473256
  (3, 88)	0.4253863131473256
  (3, 87)	0.4056929800488752
  (3, 30)	0.433294782444718
  (4, 122)	0.6276081216589223
  (4, 88)	0.7785294121789872
  (5, 275)	0.5386411414657952
  (5, 146)	0.5772654967963395
  (5, 108)	0.6137020994985275
  :	:
  (1056, 70)	0.29366595072755247
  (1056, 67)	0.2783004379195872
  (1056, 23)	0.2620450700957848
  (1056, 3)	0.30992131855135485
  (1057, 261)	0.37677035991340574
  (1057, 240)	0.926306696451409


Stemming

In [30]:
print([w.lemma_ for w in nlp('carry carries carrying carried')])

['carry', 'carry', 'carry', 'carry']


In [32]:
#function for stemming

def tokenize_lemma(text):
    return [w.lemma_.lower() for w in nlp(text)]

#since words are stemmed, it is better to stem stopwords as well.
stop_words_lemma=set(tokenize_lemma(' '.join(STOP_WORDS))) 
ng_stem_tfidf=TfidfVectorizer(max_features=300, ngram_range=(1,2),
                             stop_words=stop_words_lemma.union({'apple'}),
                             tokenizer=tokenize_lemma)
ng_stem_tfidf=ng_stem_tfidf.fit(fruit_sents+company_sents)

ng_stem_vocab=ng_stem_tfidf.get_feature_names()
print(ng_stem_vocab)

['\n', '  ', '   billion', '   gb', '   million', '"', '" "', '" ,', '" .', '$', '%', '(', ')', ') ,', ') .', ',', ', "', ', ,', ', 2012', ', 2016', ', announce', ', company', ', feature', ', include', ', introduce', ', iphone', ', job', ', release', '-', '- -', '- generation', '.', '. \n', '1', '10', '100', '11', '12', '12 ,', '2', '2007', '2008', '2009', '2010', '2010 ,', '2011', '2011 ,', '2012', '2012 ,', '2013', '2013 ,', '2014', '2015', '2015 ,', '2016', '2016 ,', '2017', '2017 ,', '2018', '2019', '2019 ,', '2020', '3', '30', '4', '5', '6', '7', '8', ':', ';', ']', 'acquire', 'add', 'allow', 'announce', 'app', 'app store', 'application', 'april', 'august', 'base', 'begin', 'billion', 'brand', 'build', 'business', 'camera', 'campus', 'cause', 'center', 'century', 'ceo', 'change', 'china', 'claim', 'color', 'come', 'company', 'company .', 'computer', 'computer ,', 'consumer', 'continue', 'control', 'cook', 'country', 'create', 'cultivar', 'datum', 'day', 'december', 'design', 'desk

In [36]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

classifier=MultinomialNB()
pipe=Pipeline([('vectorizer',ng_stem_tfidf), ('classifier', classifier)])

In [37]:
X=fruit_sents+company_sents

In [40]:
len(X)

1063

In [41]:
y=['apple']*len(fruit_sents)+['Apple']*len(company_sents)

In [43]:
len(y)

1063

In [44]:
pipe.fit(X, y)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=300, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=T...True, vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [51]:
pipe.predict(['I like to eat apple', 
              'I like apple juice',
              'I looked up the recipe on my Apple phone', 
              'Steve Jobs is the CEO of apple', 
             'I bought some apple stocks',
             ])

array(['apple', 'Apple', 'Apple', 'Apple', 'Apple'], dtype='<U5')

Some are not correct. Because the sample size is small.