In [1]:
import nltk
from nltk.stem import WordNetLemmatizer #Lemmatizer: reduces a word to its dictionary base form, its "lemma" (ex: "running" becomes "run" when lemmatized as a verb)

In [2]:
nltk.download('popular', quiet=True) #downloads NLTK's 'popular' data collection; quiet=True hides the download log

True

In [8]:
#Example paragraph written as one long, single-line string literal (ordinary double quotes).
#The next cell rebinds `paragraph` to a multi-line version of the same text.
paragraph = "The University of California, Davis (UC Davis, UCD, or Davis) is a public land-grant research university near Davis, California.[11] Named a Public Ivy,[12] it is the northernmost of the ten campuses of the University of California system. The institution was first founded as an agricultural branch of the system in 1905 and became the seventh campus of the University of California in 1959."

In [9]:
#Same text as a triple-quoted string: the literal may span several source lines,
#and every line break inside it becomes a newline character in the string.
paragraph = """The University of California, Davis (UC Davis, UCD, or Davis) is a public land-grant research university near 
Davis, California.[11] Named a Public Ivy,[12] it is the northernmost of the ten campuses of the University of California 
system. The institution was first founded as an agricultural branch of the system in 1905 and became the seventh campus of 
the University of California in 1959."""

In [10]:
#print() renders the embedded newline characters as real line breaks
print(paragraph)

The University of California, Davis (UC Davis, UCD, or Davis) is a public land-grant research university near 
Davis, California.[11] Named a Public Ivy,[12] it is the northernmost of the ten campuses of the University of California 
system. The institution was first founded as an agricultural branch of the system in 1905 and became the seventh campus of 
the University of California in 1959.


In [11]:
#Tokenizing splits a text into smaller units (tokens): here, into sentences and into words
sent_tokens = nltk.sent_tokenize(paragraph)  #list of sentence strings
word_tokens = nltk.word_tokenize(paragraph)  #list of word and punctuation tokens

In [14]:
#Display the sentence list. Note that the citation marker "[11]" ends up at the start of the second sentence.
sent_tokens

['The University of California, Davis (UC Davis, UCD, or Davis) is a public land-grant research university near \nDavis, California.',
 '[11] Named a Public Ivy,[12] it is the northernmost of the ten campuses of the University of California \nsystem.',
 'The institution was first founded as an agricultural branch of the system in 1905 and became the seventh campus of \nthe University of California in 1959.']

In [17]:
#Reader-friendly view: each sentence on its own, followed by a divider line
for sentence in sent_tokens:
    print(sentence, '-----------------------------------------------------------------------------------------------------', sep='\n')

The University of California, Davis (UC Davis, UCD, or Davis) is a public land-grant research university near 
Davis, California.
-----------------------------------------------------------------------------------------------------
[11] Named a Public Ivy,[12] it is the northernmost of the ten campuses of the University of California 
system.
-----------------------------------------------------------------------------------------------------
The institution was first founded as an agricultural branch of the system in 1905 and became the seventh campus of 
the University of California in 1959.
-----------------------------------------------------------------------------------------------------


In [15]:
#Display the word tokens: punctuation marks and brackets are separate tokens, while the hyphenated 'land-grant' stays whole
word_tokens

['The',
 'University',
 'of',
 'California',
 ',',
 'Davis',
 '(',
 'UC',
 'Davis',
 ',',
 'UCD',
 ',',
 'or',
 'Davis',
 ')',
 'is',
 'a',
 'public',
 'land-grant',
 'research',
 'university',
 'near',
 'Davis',
 ',',
 'California',
 '.',
 '[',
 '11',
 ']',
 'Named',
 'a',
 'Public',
 'Ivy',
 ',',
 '[',
 '12',
 ']',
 'it',
 'is',
 'the',
 'northernmost',
 'of',
 'the',
 'ten',
 'campuses',
 'of',
 'the',
 'University',
 'of',
 'California',
 'system',
 '.',
 'The',
 'institution',
 'was',
 'first',
 'founded',
 'as',
 'an',
 'agricultural',
 'branch',
 'of',
 'the',
 'system',
 'in',
 '1905',
 'and',
 'became',
 'the',
 'seventh',
 'campus',
 'of',
 'the',
 'University',
 'of',
 'California',
 'in',
 '1959',
 '.']

In [31]:
#Additional imports for the sentence-similarity part of the notebook
import string

#A vectorizer turns a collection of texts into numeric feature vectors (one vector per text).
#CountVectorizer (raw word counts) is the one used in the cells below.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Cosine similarity is the cosine of the angle between two vectors; it is used here to score how similar two texts are
from sklearn.metrics.pairwise import cosine_similarity

#Stop words are very common words (articles, pronouns, ...) that are filtered out before comparing texts
from nltk.corpus import stopwords

In [32]:
#Build the list of English stop words.
#The corpus reader is imported under an alias so that binding `stopwords` to a
#plain list no longer destroys the reader. The previous one-liner overwrote the
#imported name, so running this cell a second time raised
#AttributeError: 'list' object has no attribute 'words'.
from nltk.corpus import stopwords as stopwords_corpus
stopwords = stopwords_corpus.words('english')

In [38]:
#Sentences to compare with each other (each one is treated as a separate document)
sentences = [
    'The javelin throw was added to the Ancient Olympic Games as part of the pentathlon in 708 BC. ',
    'It included two events, one for distance and the other for accuracy in hitting a target.',
    'Throwing javelin-like poles into targets was revived in Germany and Sweden in the early 1870s.',
    'In Sweden, these poles developed into the modern javelin, and throwing them for distance became a common event there and in Finland in the 1880s.'
]

In [39]:
#Normalizes a sentence so it can be compared as a bag of words.
def format_string(text, stop_words=None):
    """Lower-case ``text``, drop stop words and strip punctuation.

    Args:
        text: The raw sentence to normalize.
        stop_words: Optional iterable of lower-case words to discard.
            When omitted, the notebook-level ``stopwords`` list is used.

    Returns:
        The surviving tokens joined by single spaces ('' if nothing survives).
    """
    if stop_words is None:
        stop_words = stopwords  # notebook-level list built in an earlier cell
    stop_set = frozenset(stop_words)  # constant-time membership tests
    strip_punct = str.maketrans('', '', string.punctuation)

    kept = []
    for token in text.lower().split():
        # Check the token with its inner punctuation intact first, so that
        # contractions such as "don't" still match their stop-word entry;
        # stripping punctuation first turns it into "dont", which never matches.
        if token.strip(string.punctuation) in stop_set:
            continue
        cleaned = token.translate(strip_punct)
        # Tokens made only of punctuation become empty and are dropped.
        if cleaned and cleaned not in stop_set:
            kept.append(cleaned)
    return ' '.join(kept)

In [40]:
#Normalize every sentence; a list comprehension does the same job as list(map(...)).
format_list = [format_string(sentence) for sentence in sentences]
format_list

['javelin throw added ancient olympic games part pentathlon 708 bc',
 'included two events one distance accuracy hitting target',
 'throwing javelinlike poles targets revived germany sweden early 1870s',
 'sweden poles developed modern javelin throwing distance became common event finland 1880s']

In [41]:
#Fit a bag-of-words model on the normalized sentences: one row per sentence,
#one column per vocabulary word, each cell holding that word's count.
#The fitted vectorizer is now kept (it used to be discarded) so the columns can
#be mapped back to words, e.g. through vectorizer.vocabulary_.
vectorizer = CountVectorizer()
#NOTE: despite its name, `count_vectorizer` holds the sparse matrix returned by
#fit_transform, not the vectorizer itself; the name is kept for compatibility.
count_vectorizer = vectorizer.fit_transform(format_list)
count_vectors = count_vectorizer.toarray()  #dense array, easier to display
count_vectors

array([[0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0]], dtype=int64)

In [42]:
#Pairwise cosine similarity between the sentence vectors: entry [i][j] scores sentence i against sentence j
#(1.0 on the diagonal, where a sentence is compared with itself; 0.0 means the two sentences share no counted words)
similarity = cosine_similarity(count_vectors)
similarity

array([[1.        , 0.        , 0.        , 0.09128709],
       [0.        , 1.        , 0.        , 0.10206207],
       [0.        , 0.        , 1.        , 0.28867513],
       [0.09128709, 0.10206207, 0.28867513, 1.        ]])