In [1]:
import spacy
import nltk
from nltk.corpus import stopwords
import string

nltk.download('stopwords')

nlp = spacy.load("en_core_web_sm")

def extract_features(bio):
    text = bio.lower()
    
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = nltk.word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    lemmatized_tokens = [nlp(word)[0].lemma_ for word in tokens]
    
    doc = nlp(bio)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    
    themes = set(lemmatized_tokens)
    
    skills = set(lemmatized_tokens).intersection({'programming', 'coding', 'building', 'creating', 'innovating'})
    
    return {
        'keywords': lemmatized_tokens,
        'entities': entities,
        'themes': list(themes),
        'skills': list(skills),
    }


mentor_bio = "i love programming and building innovative solutions."
mentee_bio = "programming interests me, and I enjoy creating new things."

mentor_features = extract_features(mentor_bio)
mentee_features = extract_features(mentee_bio)

print("mentor features:")
print(mentor_features)
print("mentee features:")
print(mentee_features)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varuniyer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


mentor features:
{'keywords': ['love', 'programming', 'build', 'innovative', 'solution'], 'entities': [], 'themes': ['innovative', 'build', 'solution', 'programming', 'love'], 'skills': ['programming'], 'sentiment': 'positive'}
mentee features:
{'keywords': ['programming', 'interest', 'enjoy', 'create', 'new', 'thing'], 'entities': [], 'themes': ['enjoy', 'interest', 'create', 'thing', 'programming', 'new'], 'skills': ['programming'], 'sentiment': 'positive'}
