# Keyword Extraction
## Using [TF-IDF](https://towardsdatascience.com/keyword-extraction-python-tf-idf-textrank-topicrank-yake-bert-7405d51cd839)

In [73]:
import string
import nltk

### Download the Punkt tokenizer models and the stopword lists

In [None]:
# Fetch the NLTK resources used further down: the 'punkt' tokenizer models
# (for nltk.tokenize.word_tokenize) and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/elias/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/elias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def get_stopwords(*languages):
    """
    Build one flat list with the NLTK stopwords of every requested language.

    Args:
        *languages: Language names understood by ``nltk.corpus.stopwords``
            (e.g. "english", "german").

    Returns:
        list: The stopwords of each language, concatenated in the order the
        languages were passed. Duplicates across languages are kept. Empty
        when no language is given.
    """
    combined = []
    for language in languages:
        combined.extend(nltk.corpus.stopwords.words(language))
    return combined

In [None]:
# Combined English + German stopword list, reused by the preprocessing below.
stop_words = get_stopwords("english", "german")
# Bare expression so the notebook displays the list.
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
def preprocess_text(text, stop_words):
    """
    Tokenize ``text`` and return lowercase tokens without punctuation or stopwords.

    Args:
        text (str): Raw text to process.
        stop_words: Iterable of stopwords (strings). Tokens are lowercased
            before being compared against it.

    Returns:
        list[str]: Lowercased tokens in their original order, excluding
        tokens made up solely of ASCII punctuation and tokens whose lowercase
        form is in ``stop_words``.
    """
    punctuation_chars = set(string.punctuation)
    # Build the lookup set once: membership on a list is a linear scan per token.
    stop_set = set(stop_words)

    kept = []
    for token in nltk.tokenize.word_tokenize(text):
        # `token in string.punctuation` would be a substring test, so
        # multi-character punctuation tokens such as "...", "--" or "``"
        # would not be recognised. Check character by character instead.
        if all(char in punctuation_chars for char in token):
            continue
        lowered = token.lower()
        if lowered not in stop_set:
            kept.append(lowered)
    return kept

### Read PDF
Adapted from pinecone_test.ipynb

In [76]:
from langchain.document_loaders import PyPDFLoader

In [78]:
def pdf_to_str(pdf_path):
    """
    Read a PDF and return its text as a single string.

    Args:
        pdf_path: Path of the PDF file, passed to ``PyPDFLoader``.

    Returns:
        str: The ``page_content`` of every document returned by the loader,
        joined with single spaces in loader order.
    """
    loaded_docs = PyPDFLoader(pdf_path).load()
    contents = [document.page_content for document in loaded_docs]
    return ' '.join(contents)

In [80]:
# Load the sample PDF as one string and preview its first 300 characters.
text = pdf_to_str("test_pdf/3_ProjectManagement.pdf")
print(text[:300])

Software Engineering for AI-
Enabled Systems
Prof. Dr.-Ing. Norbert Siegmund
Software Systems
 Topic I: 
Why Project Management
2 Project Management of AI Systems
TL;DR: 
- Scoping the project
- Setting the g oal of the project, product, or system to be developed
- Roles & team management
- Project 


In [81]:
# Run the preprocessing on the PDF text and inspect the first 100 tokens.
tokens = preprocess_text(text, stop_words)
tokens[:100]

['software',
 'engineering',
 'ai-',
 'enabled',
 'systems',
 'prof.',
 'dr.-ing',
 'norbert',
 'siegmund',
 'software',
 'systems',
 'topic',
 'project',
 'management',
 '2',
 'project',
 'management',
 'ai',
 'systems',
 'tl',
 'dr',
 'scoping',
 'project',
 'setting',
 'g',
 'oal',
 'project',
 'product',
 'system',
 'developed',
 'roles',
 'team',
 'management',
 'project',
 'life',
 'cycle',
 '3',
 'ai',
 'componentsoftware',
 'system',
 'specification',
 'architecture',
 'designdevelopmentdebugging',
 'testing',
 'deployment',
 'monitoring',
 'traditional',
 'software',
 'developmentcollection',
 'labelling',
 'preprocessing',
 'versioning',
 'storagemodel',
 'selection',
 'training',
 'hyperparameter',
 'debuggingdata',
 'manage',
 'mentexperi',
 'mentation',
 'roles',
 'tasks',
 'processesarchetypes',
 'life',
 'cycle',
 'metricsfeedback',
 'flywheel',
 'project',
 'management',
 'ai',
 'p',
 'roject',
 'archetypes',
 'ml',
 'model',
 'types',
 'exist',
 'realize',
 'business',

### Perform Keyword Analysis

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [83]:
def get_keyword_scores(tokens):
    """
    Returns a dict mapping terms of the token sequence to their TF-IDF score.

    The tokens are joined into one string and analysed again by a
    default-configured ``TfidfVectorizer``, so the keys are the vectorizer's
    terms and may differ from the input tokens.
    NOTE(review): per the scikit-learn docs the default analyzer lowercases
    and keeps only runs of two or more word characters - confirm against the
    installed version.

    Only a single document is fitted, so the IDF factor cannot distinguish
    terms; the scores effectively rank terms by their frequency in this text.

    Args:
        tokens (list[str]): Preprocessed tokens of one document.

    Returns:
        dict: term -> score for every term with a non-zero score. Empty when
        ``tokens`` is empty or the vectorizer finds no usable term.

    Raises:
        ValueError: Any vectorizer error other than an empty vocabulary.
    """
    # Nothing to score; the vectorizer would raise on an empty document.
    if not tokens:
        return {}

    # Join tokens back into a string
    text = ' '.join(tokens)
    # Create the TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    # Compute TF-IDF scores
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError as err:
        # scikit-learn reports "no term survived its analysis" this way
        # (e.g. only single-character tokens); treat it as "no keywords".
        if "empty vocabulary" in str(err):
            return {}
        raise
    # Get the feature names (terms), indexed by matrix column
    feature_names = vectorizer.get_feature_names_out()
    # Create a dictionary of term to TF-IDF score (row 0 is the only document)
    keyword_scores = {}
    for col in tfidf_matrix.nonzero()[1]:
        keyword_scores[feature_names[col]] = tfidf_matrix[0, col]
    return keyword_scores

In [84]:
def get_top_n_keywords(text, n, stop_words):
    """
    Return the ``n`` best-scoring keywords of ``text``.

    The text is run through ``preprocess_text`` and scored with
    ``get_keyword_scores``; keywords are ordered by descending score.

    Args:
        text (str): Raw text to analyse.
        n (int): Upper bound on the number of keywords (applied as a slice).
        stop_words: Stopwords forwarded to ``preprocess_text``.

    Returns:
        list: Keywords, highest score first.
    """
    scores = get_keyword_scores(preprocess_text(text, stop_words))
    # Stable sort: equally scored keywords keep the dict's order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:n]

In [85]:
# Ten highest-scoring keywords of the PDF text loaded into `text`.
get_top_n_keywords(text, 10, stop_words)

['data',
 'ml',
 'model',
 'team',
 'project',
 'product',
 'teams',
 'development',
 'ai',
 'software']

In [86]:
# Load a second PDF to run the same keyword extraction on different content.
text2 = pdf_to_str("test_pdf/3_RequirementsEngineering.pdf")

In [87]:
# Ten highest-scoring keywords of the second PDF (`text2`).
get_top_n_keywords(text2, 10, stop_words)

['data',
 'requirements',
 'ai',
 'model',
 'specification',
 'system',
 'specify',
 'learning',
 'etc',
 'define']

##### The code from this notebook has been extracted into keyword_extraction.py