In [2]:
pip install nltk scikit-learn




In [3]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Download NLTK data files (only need to run once).
# Recent NLTK releases load the 'punkt_tab' resource inside word_tokenize;
# 'punkt' is still fetched so the notebook keeps working on older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
def preprocess_text(text):
    """Lower-case ``text``, tokenize it and drop noise tokens.

    A token is kept only when ``str.isalnum()`` is true for it (so punctuation
    and tokens containing characters such as hyphens or apostrophes are
    discarded) and it is not in NLTK's English stop-word list.

    Args:
        text: Raw input string.

    Returns:
        A list of the surviving lower-cased tokens, in their original order.
    """
    tokens = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))

    kept = []
    for token in tokens:
        # Skip punctuation / mixed-symbol tokens and common English stop words.
        if not token.isalnum() or token in english_stops:
            continue
        kept.append(token)

    return kept

def extract_keywords(text, num_keywords=10):
    """Return up to ``num_keywords`` terms of ``text`` ranked by TF-IDF score.

    The text is cleaned with ``preprocess_text``, joined back into a single
    string and vectorized as a one-document corpus with ``TfidfVectorizer``.
    Because the corpus holds a single document, every term has the same
    inverse document frequency, so the ranking is effectively by term
    frequency.

    Args:
        text: Raw input text.
        num_keywords: Maximum number of keywords to return. Zero or a
            negative value yields an empty list.

    Returns:
        A list of terms, highest score first. It is shorter than
        ``num_keywords`` when the vocabulary is smaller, and empty when the
        text contains no usable term.
    """
    # A slice of [-0:] would select *every* term and a negative count would
    # slice from the wrong end, so handle non-positive requests up front.
    if num_keywords <= 0:
        return []

    # Preprocess the text; nothing left means there is nothing to rank.
    words = preprocess_text(text)
    if not words:
        return []

    # Join words back to a single string for TF-IDF processing
    processed_text = ' '.join(words)

    # Vectorize the text using TF-IDF
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([processed_text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when its own
        # token pattern finds no term, e.g. only single-character tokens.
        return []

    # Extract the TF-IDF scores (row 0 is the only document) and feature names
    scores = tfidf_matrix.toarray()[0]
    feature_names = vectorizer.get_feature_names_out()

    # argsort is ascending: take the last num_keywords indices and reverse
    # them so the highest-scoring term comes first.
    keyword_indices = scores.argsort()[-num_keywords:][::-1]
    return [feature_names[i] for i in keyword_indices]


In [5]:
# Example text (you can replace this with any article or blog text).
# The newline right after the opening quotes and the one before the closing
# quotes are part of the string value.
text = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.
Colloquially, the term "artificial intelligence" is often used to describe machines (or computers) that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem-solving".
"""

# Extract the five highest-scoring terms from the sample text and print the
# resulting list.
keywords = extract_keywords(text, num_keywords=5)
print("Keywords:", keywords)


Keywords: ['intelligence', 'machines', 'ai', 'humans', 'artificial']
