In [1]:
# utility functions for advanced text preprocessing 
# initial feature engineering function
def feature_engineering(df):
    """
    Derive per-row NLP feature columns and return the same DataFrame.

    The columns are written onto ``df`` itself (no copy is made):
    ``entity_count``, ``sentiment_score``, ``text_length``,
    ``unique_pos_count``, ``sentence_complexity``, ``vocab_diversity`` and
    ``product_entity_count``. The input must already provide the
    ``entities``, ``sentiment``, ``text``, ``pos_tags``, ``dep_parse`` and
    ``processed_text`` columns.
    """
    def _distinct_ratio(labels):
        # Fraction of distinct labels; a falsy (e.g. empty) value gives 0.
        if not labels:
            return 0
        return len(set(labels)) / len(labels)

    def _type_token_ratio(text):
        # Distinct whitespace-separated tokens over all tokens; 0 if none.
        tokens = text.split()
        if not tokens:
            return 0
        return len(set(tokens)) / len(tokens)

    def _count_products(entities):
        # Each entity is indexed at position 1 to read its label.
        total = 0
        for entity in entities:
            if entity[1] == 'PRODUCT':
                total += 1
        return total

    entity_column = df['entities']
    df['entity_count'] = entity_column.apply(len)
    df['sentiment_score'] = df['sentiment']
    df['text_length'] = df['text'].apply(len)
    df['unique_pos_count'] = df['pos_tags'].apply(lambda tags: len(set(tags)))
    df['sentence_complexity'] = df['dep_parse'].apply(_distinct_ratio)
    df['vocab_diversity'] = df['processed_text'].apply(_type_token_ratio)
    df['product_entity_count'] = entity_column.apply(_count_products)

    return df

# visualization of dependency parse
def visualize_dependency_parsing(text):
    """
    Render the dependency parse of ``text`` with displaCy.

    Relies on the names ``nlp`` and ``spacy`` being defined outside this
    function; the rendering is requested in Jupyter mode.
    """
    parsed_doc = nlp(text)
    render_options = {'distance': 90}
    spacy.displacy.render(
        parsed_doc,
        style='dep',
        jupyter=True,
        options=render_options,
    )

# function for analyzing dependency parsing and POS tagging.
def analyze_dependency_pos(df):
    """
    Tally how often each POS tag and each dependency label occurs.

    Returns a ``(pos_counts, dep_counts)`` pair obtained by exploding the
    ``pos_tags`` and ``dep_parse`` columns and counting the values.
    """
    def _label_frequencies(column_name):
        # Flatten the per-row sequences into one Series, then count values.
        return df[column_name].explode().value_counts()

    return _label_frequencies('pos_tags'), _label_frequencies('dep_parse')

# top_entities = most_frequent_entities(df_with_advanced_analysis)
# print("Most Frequent Entities:", top_entities)
def feature_engineering(df, entity_types=('PRODUCT',)):
    """
    Enhanced feature engineering for NLP tasks.

    Adds feature columns to ``df`` in place and returns the same object.

    Args:
        df: DataFrame providing the ``entities``, ``sentiment``, ``text``
            and ``pos_tags`` columns. Every item of an ``entities`` value is
            indexed at position 1 to read its label.
        entity_types: Entity labels to count per row. For each label a
            ``<label lower-cased>_entity_count`` column is written. The
            default ``('PRODUCT',)`` yields only ``product_entity_count``.
            A single label may also be passed as a plain string.

    Returns:
        ``df`` with ``entity_count``, ``sentiment_score``, ``text_length``,
        ``unique_pos_count`` and one count column per requested label.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(entity_types, str):
        entity_types = (entity_types,)

    # Entity Count
    df['entity_count'] = df['entities'].apply(len)
    # Sentiment Score as a Feature
    df['sentiment_score'] = df['sentiment']
    # Text Length Feature
    df['text_length'] = df['text'].apply(len)
    # Number of distinct POS tags per row
    df['unique_pos_count'] = df['pos_tags'].apply(lambda tags: len(set(tags)))

    # Specific Entity Type Count (e.g., PRODUCT, ORG)
    def count_entity_type(entities, entity_type):
        return sum(1 for entity in entities if entity[1] == entity_type)

    for entity_type in entity_types:
        column_name = entity_type.lower() + '_entity_count'
        # Bind the label as a default so the lambda never sees a later one.
        df[column_name] = df['entities'].apply(
            lambda entities, wanted=entity_type: count_entity_type(entities, wanted)
        )

    # Additional features can be added here based on specific requirements
    return df

def refined_feature_engineering(df):
    """
    Attach text-complexity and vocabulary-diversity ratios to ``df``.

    ``sentence_complexity`` is the share of distinct values in each
    ``dep_parse`` entry; ``vocab_diversity`` is the share of distinct
    whitespace-separated tokens in ``processed_text``. Rows with nothing to
    measure get 0. The DataFrame is modified in place and returned.
    """
    def _unique_fraction(items):
        # Distinct items divided by total items, or 0 for a falsy value.
        if items:
            return len(set(items)) / len(items)
        return 0

    dependency_ratio = df['dep_parse'].apply(_unique_fraction)
    df['sentence_complexity'] = dependency_ratio

    token_ratio = df['processed_text'].apply(
        lambda text: _unique_fraction(text.split())
    )
    df['vocab_diversity'] = token_ratio

    return df


# Visualization Function for Dependency Parsing
def visualize_dependency_parsing(text):
    """
    Draw the dependency tree of ``text`` using SpaCy's displaCy renderer.

    ``nlp`` and ``spacy`` are expected to be available from outside this
    function; nothing is returned by this function.
    """
    # Same settings as before: dependency style, Jupyter mode, distance 90.
    displacy_kwargs = dict(style='dep', jupyter=True, options={'distance': 90})
    spacy.displacy.render(nlp(text), **displacy_kwargs)

def analyze_dependency_pos(df):
    """
    Count occurrences of POS tags and dependency labels across all rows.

    Returns a tuple ``(pos_counts, dep_counts)`` of value-count Series built
    from the exploded ``pos_tags`` and ``dep_parse`` columns.
    """
    # Explode turns each per-row sequence into one entry per element.
    flattened_pos = df['pos_tags'].explode()
    flattened_dep = df['dep_parse'].explode()

    return flattened_pos.value_counts(), flattened_dep.value_counts()

from gensim import corpora, models

def perform_topic_modeling(df, num_topics=5, num_words=5, passes=15, random_state=None):
    """
    Fit an LDA topic model on ``df['processed_text']`` and print its topics.

    Each text is tokenized with ``str.split()``, turned into a bag-of-words
    corpus through a gensim ``Dictionary``, and passed to ``LdaModel``.

    Args:
        df: DataFrame with a ``processed_text`` column of strings.
        num_topics: Number of topics requested from the model.
        num_words: Number of words shown per topic when printing.
        passes: Number of training passes over the corpus (previously fixed
            at 15, which remains the default).
        random_state: Seed forwarded to ``LdaModel``. Pass an integer for
            reproducible topics; ``None`` keeps gensim's default behaviour.

    Returns:
        The trained ``LdaModel`` instance.
    """
    tokenized_docs = [text.split() for text in df['processed_text']]
    dictionary = corpora.Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
    lda = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    # Print every entry returned by print_topics, one per line.
    for topic in lda.print_topics(num_words=num_words):
        print(topic)

    return lda

# Example usage
# NOTE(review): `df_cleaned` is not defined anywhere in this cell; the output
# recorded at the end of the cell is a NameError for it. It presumably comes
# from an earlier preprocessing step (TODO confirm) and must exist, with a
# `processed_text` column, before this line can run.
lda_model = perform_topic_modeling(df_cleaned)

import plotly.express as px


# top_entities = most_frequent_entities(df_with_advanced_analysis)
# print("Most Frequent Entities:", top_entities)


NameError: name 'df_cleaned' is not defined