In [14]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

def preprocess_and_tokenize_advanced(text, lemmatize=True, remove_entities=False, pos_filter=None):
    """
    Preprocess and tokenize the input text with advanced options.

    Parameters:
    text (str): The input text to preprocess and tokenize.
    lemmatize (bool): If True, apply lemmatization (default: True).
    remove_entities (bool): If True, remove named entities (default: False).
    pos_filter (list): List of POS tags to keep, if provided (default: None).

    Returns:
    List[str]: A list of clean tokens.
    """
    # Convert text to lowercase
    text = text.lower()

    # Process the text using spaCy NLP pipeline
    doc = nlp(text)

    tokens = []
    for token in doc:
        # Exclude stop words, punctuation, and spaces
        if token.is_stop or token.is_punct or token.is_space:
            continue

        # Exclude named entities if specified
        if remove_entities and token.ent_type_:
            continue

        # Apply part-of-speech filtering if specified
        if pos_filter and token.pos_ not in pos_filter:
            continue

        # Apply lemmatization if enabled
        if lemmatize:
            tokens.append(token.lemma_)
        else:
            tokens.append(token.text)

    return tokens

def tokenize_and_join(text, lemmatize=True, remove_entities=False, pos_filter=None):
    """
    Tokenizes the input text and joins the tokens into a single string for vectorization.

    Returns:
    str: A string of clean tokens joined by space.
    """
    tokens = preprocess_and_tokenize_advanced(text, lemmatize, remove_entities, pos_filter)
    return " ".join(tokens)

# Sample text corpus
corpus = [
    "SNS Institutions in Coimbatore provide a wide range of educational programs related to digital technologies, including Social Networking Services (SNS).",
    "The SNS College of Technology offers specialized courses in Computer Science, Artificial Intelligence, and Social Media Analytics, focusing on practical applications in SNS platforms.",
    "With strong industry tie-ups, SNS College of Engineering in Coimbatore supports research in social media platforms and their impact on communication, marketing, and digital behavior.",
    "Coimbatore's SNS Institutions are known for their focus on innovative digital education, preparing students for careers in fields such as social media management, digital marketing, and data science."
]


# Preprocess and tokenize the corpus
processed_corpus = [tokenize_and_join(text, lemmatize=True, remove_entities=True, pos_filter=['NOUN', 'VERB']) for text in corpus]

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the corpus into a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_corpus)

# Get feature names (tokens)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert TF-IDF matrix to dense format (for readability) and print it
dense_matrix = tfidf_matrix.todense()

# Display the TF-IDF matrix and the tokens (features)
# print("TF-IDF Matrix:\n", dense_matrix)
print("Tokens (Features):\n", feature_names)


Tokens (Features):
 ['analytic' 'application' 'behavior' 'career' 'coimbatore' 'communication'
 'computer' 'course' 'datum' 'education' 'engineering' 'field' 'focus'
 'impact' 'include' 'industry' 'institution' 'intelligence' 'know'
 'management' 'marketing' 'medium' 'networking' 'offer' 'platform'
 'prepare' 'program' 'provide' 'range' 'relate' 'research' 'science'
 'service' 'sns' 'student' 'support' 'technology' 'tie' 'up']
