In [1]:
import pandas as pd

# Assuming 'df' is your DataFrame with 'tags' column
df = pd.DataFrame({
    'tags': ['Developers', 'Graphic Designer', 'Mobile App Developer', 'Interface Designer', 'Developer', 'Director', 'Dirctor', 'CEO', 'Backend Developer', 'Testers, QA'],
    'input': ['John Smith is an experienced web developer with expertise in HTML, CSS, and JavaScript', 'Sarah Johnson is a talented graphic designer specializing in Adobe Creative Suite', 'Detail-oriented Mobile App Developer with experience in iOS and Android development', 'An Interface Designer is a skilled professional who designs user interfaces for software applications', 'Develop and maintain user interfaces (UI) for web applications', 'Director Duties and Responsibilities - Develop and implement strategies for company growth', 'Director Duties and Responsibilities - Develop and implement strategies for company growth', 'The chief executive officer (CEO) is the highest-ranking executive in a company', 'JavaScript: The high-level, platform-independent programming language for web development', 'The required minimum for novice test engineers in a software development environment'],
})

# Function to clean and process tags
def process_tags(tags):
    # Replace any unwanted characters and split by commas
    cleaned_tags = tags.replace('+ACI-', '').split(',')
    
    # Remove leading and trailing whitespaces from each tag
    cleaned_tags = [tag.strip() for tag in cleaned_tags]

    # Remove empty tags
    cleaned_tags = [tag for tag in cleaned_tags if tag]

    # Return unique tags as a comma-separated string
    unique_tags = ', '.join(set(cleaned_tags)) if cleaned_tags else None

    return unique_tags

# Apply the function to the 'tags' column
df['tags'] = df['tags'].apply(process_tags)

# Display the updated DataFrame
print(df)

                   tags                                              input
0            Developers  John Smith is an experienced web developer wit...
1      Graphic Designer  Sarah Johnson is a talented graphic designer s...
2  Mobile App Developer  Detail-oriented Mobile App Developer with expe...
3    Interface Designer  An Interface Designer is a skilled professiona...
4             Developer  Develop and maintain user interfaces (UI) for ...
5              Director  Director Duties and Responsibilities - Develop...
6               Dirctor  Director Duties and Responsibilities - Develop...
7                   CEO  The chief executive officer (CEO) is the highe...
8     Backend Developer  JavaScript: The high-level, platform-independe...
9           Testers, QA  The required minimum for novice test engineers...


ParserError: Error tokenizing data. C error: Expected 3 fields in line 7, saw 4


In [3]:
import io
import csv
import pandas as pd
import spacy

# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

def generate_tags(text):
    doc = nlp(text)
    spacy_tags = [ent.text for ent in doc.ents]
    
    # If SpaCy tags are empty, fallback to simple word tokenization
    tags = spacy_tags if spacy_tags else text.split()

    # Return unique tags as a comma-separated string
    unique_tags = ', '.join(set(tags)) if tags else None

    return unique_tags

def process_csv(csv_content):
    # Convert CSV content to a DataFrame
    df = pd.read_csv(io.StringIO(csv_content))

    # Drop rows with null values in the 'input' column
    df = df.dropna(subset=['input'])

    # Apply the function to each row in the 'input' column
    df['tags'] = df['input'].apply(generate_tags)

    # Display the resulting DataFrame with 'input' and 'tags'
    print(df[['input', 'tags']])
    
    return df

# Example CSV content
csv_content = """input
John Smith is an experienced web developer with expertise in HTML, CSS, JavaScript, PHP, and MySQL. He has successfully delivered numerous web projects and is known for his attention to detail.


"""

# Process the CSV content and generate tags
result_df = process_csv(csv_content)

# Convert the resulting DataFrame to a dictionary for further usage
result_dict = result_df.to_dict(orient='records')

# Display the dictionary
print(result_dict)




                                                                                                                      input  \
John Smith is an experienced web developer with...  CSS  JavaScript  PHP   and MySQL. He has successfully delivered nume...   

                                                                                                                       tags  
John Smith is an experienced web developer with...  CSS  JavaScript  PHP  known, is, for, his, MySQL., successfully, num...  
[{'input': ' and MySQL. He has successfully delivered numerous web projects and is known for his attention to detail.', 'tags': 'known, is, for, his, MySQL., successfully, numerous, projects, has, and, He, to, delivered, attention, web, detail.'}]
