In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: five short sentences, four of which mention "AI"
sentences = [
    "AI is transforming industries",
    "AI is used in healthcare",
    "Deep learning is part of AI",
    "AI and healthcare are transforming the world",
    "Another field is finance"
]

# Vectorizer settings, gathered in one mapping so they are easy to scan and tweak
tfidf_options = {
    "max_df": 0.85,           # drop terms present in more than 85% of the documents
    "min_df": 1,              # keep terms present in at least one document
    "ngram_range": (1, 2),    # emit both unigrams and bigrams
    "stop_words": "english",  # filter out common English stop words
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and IDF weights from the corpus and encode it in one pass
X = vectorizer.fit_transform(sentences)

# Densify for display: one row per sentence, one column per vocabulary term
dense_scores = X.toarray()
print(dense_scores)

[[0.28281359 0.         0.50199209 0.         0.         0.
  0.         0.         0.         0.         0.         0.50199209
  0.         0.         0.40500406 0.50199209 0.         0.
  0.         0.        ]
 [0.28281359 0.         0.         0.50199209 0.         0.
  0.         0.         0.         0.40500406 0.         0.
  0.         0.         0.         0.         0.         0.50199209
  0.50199209 0.        ]
 [0.27113917 0.         0.         0.         0.48127008 0.48127008
  0.         0.         0.         0.         0.         0.
  0.48127008 0.48127008 0.         0.         0.         0.
  0.         0.        ]
 [0.23766483 0.42185336 0.         0.         0.         0.
  0.         0.         0.         0.34034864 0.42185336 0.
  0.         0.         0.34034864 0.         0.42185336 0.
  0.         0.42185336]
 [0.         0.         0.         0.         0.         0.
  0.57735027 0.57735027 0.57735027 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]]

In [2]:
# List the learned vocabulary; entry i names column i of the TF-IDF matrix
feature_names = vectorizer.get_feature_names_out()
print(feature_names)


['ai' 'ai healthcare' 'ai transforming' 'ai used' 'deep' 'deep learning'
 'field' 'field finance' 'finance' 'healthcare' 'healthcare transforming'
 'industries' 'learning' 'learning ai' 'transforming'
 'transforming industries' 'transforming world' 'used' 'used healthcare'
 'world']


In [3]:
# Print the fitted inverse-document-frequency weight of every vocabulary term
# (terms found in more documents receive a smaller weight)
idf_weights = vectorizer.idf_
print(idf_weights)

[1.18232156 2.09861229 2.09861229 2.09861229 2.09861229 2.09861229
 2.09861229 2.09861229 2.09861229 1.69314718 2.09861229 2.09861229
 2.09861229 2.09861229 1.69314718 2.09861229 2.09861229 2.09861229
 2.09861229 2.09861229]
