In [None]:
# Required installations and imports
!pip install gensim
import pandas as pd
from gensim.models import Word2Vec
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# Read the seed-word dictionary: one CSV column per category
word_dict_path = 'word_dict.csv'
word_dict_df = pd.read_csv(word_dict_path, encoding='utf-8')

# Map each column name to its non-empty cells. Columns may have different
# lengths, so the NaN padding pandas adds to short columns is dropped.
seed_words = {
    category: column.dropna().tolist()
    for category, column in word_dict_df.items()
}

# Build the English stopword lookup (the corpus itself is downloaded above);
# a set keeps the per-token membership test cheap
all_stopwords = set(stopwords.words('english'))

# Preprocess text function for Word2Vec training
def preprocess_text(text):
    """Turn one raw lyric into a list of lowercase word tokens.

    Runs of non-word characters are collapsed to a single space before
    tokenising. Tokens that are a single character, purely numeric, or
    present in ``all_stopwords`` are discarded.

    Args:
        text: Raw lyric text. Non-string values are converted with ``str``.
            ``None`` and float NaN (a missing value in a pandas column) are
            treated as empty text.

    Returns:
        list[str]: The surviving tokens, in their original order. Empty when
        the input is missing or nothing survives filtering.
    """
    # A missing lyric would otherwise be stringified to "nan" / "none" and
    # end up in the Word2Vec vocabulary as a bogus token. NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = re.sub(r'\W+', ' ', str(text).lower())
    return [
        token
        for token in nltk.word_tokenize(text)
        if len(token) > 1
        and not token.isnumeric()
        and token not in all_stopwords
    ]

# Read the lyrics corpus and tokenise every row of its 'lyrics' column
lyrics_path = 'clean_unlabeled_utf8.csv'
df = pd.read_csv(lyrics_path, encoding='utf-8')
df['processed_lyrics'] = df['lyrics'].apply(preprocess_text)

# Fit Word2Vec on the token lists, one "sentence" per lyric.
# min_count=1 keeps every token in the vocabulary, however rare.
all_lyrics = list(df['processed_lyrics'])
w2v_model = Word2Vec(
    sentences=all_lyrics,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Function to expand seed words with top5 and high similarity threshold
def expand_seed_words(seed_words, model, top_n=5, similarity_threshold=0.7):
    """Expand each category's seed words with similar words from ``model``.

    For every seed word known to ``model.wv``, the ``top_n`` most similar
    words are fetched and those scoring at least ``similarity_threshold``
    are added to the category.

    The result is deterministic: each list holds the de-duplicated seed
    words in their original order, followed by the accepted neighbours in
    the order they were discovered. (Building the list from a ``set`` gave
    an arbitrary order that could differ between runs.)

    Args:
        seed_words: Mapping of category name to an iterable of seed words.
        model: Object whose ``wv`` attribute supports ``word in wv`` and
            ``wv.most_similar(word, topn=...)`` returning
            ``(word, similarity)`` pairs.
        top_n: Number of nearest neighbours requested per seed word.
        similarity_threshold: Minimum similarity for a neighbour to be kept.

    Returns:
        dict: Category name mapped to the list of unique words (seeds first,
        then additions). Seeds missing from the model are kept but
        contribute no neighbours.
    """
    vectors = model.wv
    expanded_words = {}
    for category, words in seed_words.items():
        # A dict keeps insertion order, so it works as an ordered set.
        ordered = dict.fromkeys(words)
        # Snapshot the seeds: only they are expanded, never the additions.
        for word in list(ordered):
            if word not in vectors:
                continue
            for neighbour, similarity in vectors.most_similar(word, topn=top_n):
                if similarity >= similarity_threshold:
                    ordered.setdefault(neighbour, None)
        expanded_words[category] = list(ordered)
    return expanded_words

# Grow every category's seed list with its nearest Word2Vec neighbours
expanded_seed_words = expand_seed_words(
    seed_words,
    w2v_model,
    top_n=5,
    similarity_threshold=0.95,
)

# Show the resulting dictionary (original seeds plus additions)
print("Expanded Seed Words with Original Dictionary:", expanded_seed_words)

# Optionally persist the result, one column per category. Wrapping each list
# in a Series lets categories of different sizes share a single DataFrame.
expanded_df = pd.DataFrame(
    {category: pd.Series(words) for category, words in expanded_seed_words.items()}
)
expanded_df.to_csv('expanded_seed_words.csv', index=False, encoding='utf-8-sig')
print("Expanded seed words saved to expanded_seed_words.csv")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Expanded Seed Words with Original Dictionary: {'Sexual': ['one night stand', 'pound', 'cumshot', 'ejaculate', 'wearin', 'occasions', 'attraction', 'underwear', 'sexual act', 'blow job', 'pistol', 'devoted', 'ass', 'dick', 'balls', 'dink', 'hopelessly', 'cabbage', 'thugged', 'erotic', 'sensual', 'masturbate', 'panties', 'handjob', 'breast', 'masturbation', 'provocative', 'sex', 'vagina', 'nudity', 'twins', 'nude', 'virgin', 'flirt', 'seductive', 'brassiere', 'dabei', 'make love', 'dub', 'bra', 'orgasm', 'socks', 'rappers', 'mask'], 'Violence': ['gore', 'combat', 'atlanta', 'violence', 'atrocity', 'punch', 'teachers', 'gun', 'knife', 'murder', 'assault', 'ð½ñ', 'stab', 'rat', 'luftballons', 'violent', 'gin', 'attack', 'chillin', 'pasito', 'mutilation', 'fikir', 'battle', 'korea', 'devilry', 'weapons', 'kill'], 'Substance': ['card', 'heroin', 'intoxicated', 'beer', 'guajira', 'foreign', 'del', 'vodka', 'alcohol', 'vamos', 'disease', 'ritz', 'balls', 'trickin', 'dealers', 'opioids', 'ecsta