In [8]:
import pandas as pd
import numpy as np
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import contractions
nltk.download('stopwords')
nltk.download('wordnet')

from typing import List

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\htc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\htc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [106]:
doc1 = "Neural networks process data using deep learning algorithms in artificial intelligence."
doc2 = "Artificial intelligence applies neural networks and deep learning to process large datasets."

doc3 = "Gasoline cars have combustion engines that power vehicles through fuel ignition."
doc4 = "Car engines burn gasoline in combustion chambers to move vehicles on the road."

corpus = [doc1, doc2, doc3, doc4]

In [107]:
for d in corpus:
    print(d)
    print("Length of document:", len(d))
    print()

Neural networks process data using deep learning algorithms in artificial intelligence.
Length of document: 87

Artificial intelligence applies neural networks and deep learning to process large datasets.
Length of document: 92

Gasoline cars have combustion engines that power vehicles through fuel ignition.
Length of document: 80

Car engines burn gasoline in combustion chambers to move vehicles on the road.
Length of document: 78



In [108]:
def preprocessing(text: str) -> list[str]:

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    # Convert Text to Lowercase (Normalization)
    text_lower = text.lower()
    text_no_tags = re.sub(r'<[^>]+>', '', text_lower)

    # Contraction Handling
    text_no_tags = contractions.fix(text_no_tags)

    # Removing Punctuation
    text_no_punct = re.sub(r'[^a-zA-Z\s]', '', text_no_tags) # \' for keep apostrophes (e.g. don't, it's)


    # 3. Tokens
    tokens = re.split(r"\s+", text_no_punct) 
    tokens = [t for t in tokens if t]
    # or use nltk tokenizer
    tokens = word_tokenize(text_no_punct)

    # 4. Stop word removal
    filtered_tokens  = [token for token in tokens if token not in stop_words]

    # 5. Lemmatization 
    lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens ]
    # or stemmer
    stemm_tokens = [stemmer.stem(token) for token in filtered_tokens ]

    return lemma_tokens

preprocessed_text = [preprocessing(doc) for doc in corpus]
print(preprocessed_text)

[['neural', 'network', 'process', 'data', 'using', 'deep', 'learning', 'algorithm', 'artificial', 'intelligence'], ['artificial', 'intelligence', 'applies', 'neural', 'network', 'deep', 'learning', 'process', 'large', 'datasets'], ['gasoline', 'car', 'combustion', 'engine', 'power', 'vehicle', 'fuel', 'ignition'], ['car', 'engine', 'burn', 'gasoline', 'combustion', 'chamber', 'move', 'vehicle', 'road']]


In [109]:
all_tokens = [token for doc in preprocessed_text for token in doc]
vocab = sorted(set(all_tokens))
# print(len(all_tokens))


def TF(term, doc) :
    term = term.lower()
    print(f"term : {term}")
    print(f"doc :{doc}")
    print(f"frequency : {doc.count(term)}")
    print(f"len doc : {len(doc)}")
    return doc.count(term) / len(doc)


term = "learning"
tf = TF(term, preprocessed_text[0])
print("tf of 'learning' on doc 0: ", tf)
print("-----------------------")
tf = TF(term, preprocessed_text[1])
print("tf of 'learning' on doc 2: ", tf)

term : learning
doc :['neural', 'network', 'process', 'data', 'using', 'deep', 'learning', 'algorithm', 'artificial', 'intelligence']
frequency : 1
len doc : 10
tf of 'learning' on doc 0:  0.1
-----------------------
term : learning
doc :['artificial', 'intelligence', 'applies', 'neural', 'network', 'deep', 'learning', 'process', 'large', 'datasets']
frequency : 1
len doc : 10
tf of 'learning' on doc 2:  0.1


In [110]:
def IDF(term, corpus):
    term = term.lower()
    N = len(corpus)
    n = sum(1 for doc in corpus if term in doc)
    print(f"term : {term} \nnumber of documents : {N} \nnumber of documents containing term : {n}")

    return N/(n+1)
idf = IDF("learning", preprocessed_text)
print("idf of 'learning' : ", idf)
print("-----------------------")
idf = IDF("statistic", preprocessed_text)
print("idf of 'statistics' : ", idf)

term : learning 
number of documents : 4 
number of documents containing term : 2
idf of 'learning' :  1.3333333333333333
-----------------------
term : statistic 
number of documents : 4 
number of documents containing term : 0
idf of 'statistics' :  4.0


In [111]:
all_tokens = [token for doc in preprocessed_text for token in doc]
vocab = sorted(set(all_tokens))
print(len(all_tokens))


def TF(term: str, doc: list[str]) -> float:
    """
    Calculate Term Frequency (TF) of a term in a document.

    Args:
        term (str): The term to calculate TF for.
        doc (list[str]): The document in which to calculate TF.

    Returns:
        float: The term frequency of the term in the document.
    """
    term = term.lower()
    return doc.count(term) / len(doc)
    
def IDF(term: str, corpus: List[list[str]]) -> float:
    """
    Calculate Inverse Document Frequency (IDF) of a term in a corpus.

    Args:
        term (str): The term to calculate IDF for.
        corpus (List[list[str]]): The corpus in which to calculate IDF.

    Returns:
        float: The inverse document frequency of the term in the corpus.
    """
    N = len(corpus)
    term = term.lower()
    num_docs_with_term = sum(1 for doc in corpus if term in doc)
    return N / (1 + num_docs_with_term)

def TF_IDF(term: str, doc: list[str], corpus: List[list[str]]) -> float:
    """
    Calculate TF-IDF of a term in a document within a corpus.

    Args:
        term (str): The term to calculate TF-IDF for.
        doc (list[str]): The document in which to calculate TF-IDF.
        corpus (List[list[str]]): The corpus in which to calculate TF-IDF.

    Returns:
        float: The TF-IDF score of the term in the document.
    """
    tf = TF(term, doc)
    idf = IDF(term, corpus)
    return tf * idf



tfidf_matrix = np.zeros((len(preprocessed_text), len(vocab)))
for i, doc in enumerate(preprocessed_text):
    for j, term in enumerate(vocab):
        tfidf_matrix[i][j] = TF_IDF(term, doc, preprocessed_text)


tfidf_matrix.shape

37


(4, 25)

In [112]:
# calc similarity between documents
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
print(similarity_matrix)

[[1.         0.50909091 0.         0.        ]
 [0.50909091 1.         0.         0.        ]
 [0.         0.         1.         0.38984059]
 [0.         0.         0.38984059 1.        ]]


# Built in

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [doc1, doc2, doc3, doc4]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

print(vectorizer.get_feature_names_out())
print("--------------")
# Convert TF-IDF matrix to array and view it
print(X.toarray())

['advanced' 'ai' 'algorithms' 'allows' 'artificial' 'benefits' 'branch'
 'cars' 'data' 'decisions' 'designed' 'efficiency' 'electric'
 'environmental' 'features' 'include' 'intelligence' 'learn' 'learning'
 'machine' 'machines' 'make' 'patterns' 'popular' 'safety'
 'transportation' 'uses' 'vehicles']
--------------
[[0.         0.         0.         0.36222393 0.36222393 0.
  0.         0.         0.2855815  0.36222393 0.         0.
  0.         0.         0.         0.         0.36222393 0.36222393
  0.         0.         0.36222393 0.36222393 0.         0.
  0.         0.         0.         0.        ]
 [0.         0.36222393 0.36222393 0.         0.         0.
  0.36222393 0.         0.2855815  0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.36222393 0.36222393 0.         0.         0.36222393 0.
  0.         0.         0.36222393 0.        ]
 [0.37796447 0.         0.         0.         0.         0.
  0.         0.37796447 0.         0.    