In [49]:
import pandas as pd

# Toy corpus: four short fruit-word sentences stored under a single 'content' column.
data = dict(
    content=[
        "apple banana cherry durian elderberry fig.",
        "apple banana durian cherry elderberry cherry.",
        "cherry grape honeydew ice apple cherry.",
        "apple banana cherry durian grape ice",
    ]
)

# Only the first two sentences are used downstream; `docs` is a Series of strings.
docs = pd.DataFrame(data).head(2)['content']




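Term counts across all four sentences: apple 4, banana 3, cherry 6, durian 3, elderberry 2, fig 1, grape 2, honeydew 1, ice 2.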

In [50]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from `docs` with default settings and build the
# document-term count matrix `X` (kept sparse until `.toarray()` is called).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

In [51]:
# Show the learned vocabulary; the order matches the columns of X.
print(vectorizer.get_feature_names_out())


['apple' 'banana' 'cherry' 'durian' 'elderberry' 'fig']


In [52]:
# Dense view of the counts: one row per document, one column per vocabulary term.
print(X.toarray())

[[1 1 1 1 1 1]
 [1 1 2 1 1 0]]


In [53]:
# Refit the count vectorizer for several (min_n, max_n) n-gram settings and
# show, for each one, the vocabulary, the count matrix and the vocabulary size.
for ngram_range in ((1, 1), (1, 2), (2, 2), (2, 3), (3, 3)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X = vectorizer.fit_transform(docs)

    print(f"ngram_range={ngram_range}:")
    print("Feature Names:", vectorizer.get_feature_names_out())
    # Label and matrix on separate lines: one row per document, one column per n-gram.
    print("Tokens:", X.toarray(), sep="\n")
    # The column count is available on the sparse matrix itself.
    print("Number of tokens:", X.shape[1])
    print("=" * 50)

ngram_range=(1, 1):
Feature Names: ['apple' 'banana' 'cherry' 'durian' 'elderberry' 'fig']
Tokens:
[[1 1 1 1 1 1]
 [1 1 2 1 1 0]]
Number of tokens: 6
ngram_range=(1, 2):
Feature Names: ['apple' 'apple banana' 'banana' 'banana cherry' 'banana durian' 'cherry'
 'cherry durian' 'cherry elderberry' 'durian' 'durian cherry'
 'durian elderberry' 'elderberry' 'elderberry cherry' 'elderberry fig'
 'fig']
Tokens:
[[1 1 1 1 0 1 1 0 1 0 1 1 0 1 1]
 [1 1 1 0 1 2 0 1 1 1 0 1 1 0 0]]
Number of tokens: 15
ngram_range=(2, 2):
Feature Names: ['apple banana' 'banana cherry' 'banana durian' 'cherry durian'
 'cherry elderberry' 'durian cherry' 'durian elderberry'
 'elderberry cherry' 'elderberry fig']
Tokens:
[[1 1 0 1 0 0 1 0 1]
 [1 0 1 0 1 1 0 1 0]]
Number of tokens: 9
ngram_range=(2, 3):
Feature Names: ['apple banana' 'apple banana cherry' 'apple banana durian'
 'banana cherry' 'banana cherry durian' 'banana durian'
 'banana durian cherry' 'cherry durian' 'cherry durian elderberry'
 'cherry elderberry'

In [54]:
# Compare vocabulary sizes under different document-frequency cut-offs.
# All thresholds below are integers, which scikit-learn reads as absolute
# document counts rather than proportions.

# min_df=1: a term must appear in at least one document.
vectorizer_min_df_1 = CountVectorizer(min_df=1)
X_min_df_1 = vectorizer_min_df_1.fit_transform(docs)
num_tokens_min_df_1 = X_min_df_1.shape[1]

# min_df=2: a term must appear in at least two documents.
vectorizer_min_df_2 = CountVectorizer(min_df=2)
X_min_df_2 = vectorizer_min_df_2.fit_transform(docs)
num_tokens_min_df_2 = X_min_df_2.shape[1]

# max_df=1: a term may appear in at most one document.
vectorizer_max_df_1 = CountVectorizer(max_df=1)
X_max_df_1 = vectorizer_max_df_1.fit_transform(docs)
num_tokens_max_df_1 = X_max_df_1.shape[1]

# max_df=2: a term may appear in at most two documents.
vectorizer_max_df_2 = CountVectorizer(max_df=2)
X_max_df_2 = vectorizer_max_df_2.fit_transform(docs)
num_tokens_max_df_2 = X_max_df_2.shape[1]

# Report the number of columns (vocabulary size) for each setting.
print(f"Number of tokens with min_df=1: {num_tokens_min_df_1}")
print(f"Number of tokens with min_df=2: {num_tokens_min_df_2}")
print(f"Number of tokens with max_df=1: {num_tokens_max_df_1}")
print(f"Number of tokens with max_df=2: {num_tokens_max_df_2}")

Number of tokens with min_df=1: 6
Number of tokens with min_df=2: 5
Number of tokens with max_df=1: 1
Number of tokens with max_df=2: 6


In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights for `docs`. norm=None turns off the per-document
# normalisation, and min_df=2 keeps only terms found in at least two documents.
vectorizer = TfidfVectorizer(norm=None,min_df=2)
X = vectorizer.fit_transform(docs)
# Vocabulary first, then the dense weight matrix (rows = documents).
print(vectorizer.get_feature_names_out())
print(X.toarray())

['apple' 'banana' 'cherry' 'durian' 'elderberry']
[[1. 1. 1. 1. 1.]
 [1. 1. 2. 1. 1.]]


In [56]:
# Get the feature names (tokens)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense array for easier manipulation
tfidf_matrix = X.toarray()

# Find the index of the token with the highest TF-IDF value in each document
max_tfidf_indices = tfidf_matrix.argmax(axis=1)

# Get the token with the largest TF-IDF value in each document
for doc_index, max_index in enumerate(max_tfidf_indices):
    token = feature_names[max_index]
    tfidf_value = tfidf_matrix[doc_index, max_index]
    print(f"Document {doc_index + 1}: Token '{token}' has the largest TF-IDF value of {tfidf_value:.2f}")

Document 1: Token 'apple' has the largest TF-IDF value of 1.00
Document 2: Token 'cherry' has the largest TF-IDF value of 2.00


In [57]:
# Disabled draft of the nearest-neighbour lookup. The code is wrapped in a
# string literal, so none of it runs; the cell merely echoes the text.
# It differs from the working version in the following cell by using
# max_df=0.2 on the vectorizer and n_neighbors=3 on the neighbour search.
"""import numpy as np
docs = pd.DataFrame(data)['content'][:2]
vectorizer = TfidfVectorizer(stop_words = 'english', max_df=0.2)
X = vectorizer.fit_transform(docs)
indices = np.arange(docs.size)
from sklearn.model_selection import train_test_split
train, test = train_test_split(indices, test_size=0.2)
from sklearn.metrics.pairwise import cosine_distances
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=3,metric=cosine_distances).fit(X[train])
test=[test[0]]
found = nbrs.kneighbors(X[test], return_distance=False)
test_i=0
print('text:\n%.300s'%docs[test[test_i]])
for i in found[0]:
    print('match %d:\n%.300s'%(i,docs[train[i]]))"""

"import numpy as np\ndocs = pd.DataFrame(data)['content'][:2]\nvectorizer = TfidfVectorizer(stop_words = 'english', max_df=0.2)\nX = vectorizer.fit_transform(docs)\nindices = np.arange(docs.size)\nfrom sklearn.model_selection import train_test_split\ntrain, test = train_test_split(indices, test_size=0.2)\nfrom sklearn.metrics.pairwise import cosine_distances\nfrom sklearn.neighbors import NearestNeighbors\nnbrs = NearestNeighbors(n_neighbors=3,metric=cosine_distances).fit(X[train])\ntest=[test[0]]\nfound = nbrs.kneighbors(X[test], return_distance=False)\ntest_i=0\nprint('text:\n%.300s'%docs[test[test_i]])\nfor i in found[0]:\n    print('match %d:\n%.300s'%(i,docs[train[i]]))"

In [58]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_distances
from sklearn.neighbors import NearestNeighbors



# Nearest-neighbour lookup on TF-IDF vectors: hold out one document, fit the
# neighbour index on the rest, and print the held-out text with its closest match.
docs = pd.DataFrame(data)['content'][:2]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(docs)

# Positional row numbers of the documents; these are what gets split.
indices = np.arange(docs.size)
# A fixed seed keeps the train/test assignment, and so the printed pair,
# the same on every run.
train, test = train_test_split(indices, test_size=0.2, random_state=42)

# Use the built-in 'cosine' metric instead of handing the pairwise-matrix
# helper `cosine_distances` to the estimator as a per-pair callable.
nbrs = NearestNeighbors(n_neighbors=1, metric='cosine').fit(X[train])
test = [test[0]]  # query with a single held-out document
found = nbrs.kneighbors(X[test], return_distance=False)

test_i = 0
# `train`/`test` hold positions, so index the Series positionally with .iloc.
print('text:\n%.300s' % docs.iloc[test[test_i]])
for i in found[0]:
    # `i` is a row of X[train]; map it back to the original document position.
    print('match %d:\n%.300s' % (i, docs.iloc[train[i]]))


text:
apple banana durian cherry elderberry cherry.
match 0:
apple banana cherry durian elderberry fig.
