# Library

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [6]:
def extract_top_keywords_tfidf(df, num_keywords=3):
    """
    Extract the top keywords from the threads using TF-IDF.
    :param df: DataFrame containing the threads to process
    :param num_keywords: Number of top keywords to extract
    :return: DataFrame with top keywords for each thread
    """
    # Initialize TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['name_thread'])

    # Get feature names
    feature_names = vectorizer.get_feature_names_out()

    # Add columns for top keywords
    for i in range(1, num_keywords + 1):
        df[f'top_keyword_{i}'] = None

    # Extract top keywords for each thread
    for index in tqdm(range(len(df)), total=len(df), desc='Extracting top keywords'):
        tfidf_vector = tfidf_matrix[index]
        sorted_indices = tfidf_vector.toarray().argsort()[0][-num_keywords:][::-1]
        top_keywords = [feature_names[i] for i in sorted_indices]

        for i in range(num_keywords):
            df.at[index, f'top_keyword_{i+1}'] = top_keywords[i]

    return df

In [7]:
df = pd.read_csv('cleaned_data_name_thread.csv')
df = df.dropna(subset=['name_thread'])
df = extract_top_keywords_tfidf(df)
df

Extracting top keywords: 100%|██████████| 292446/292446 [06:46<00:00, 718.69it/s]


Unnamed: 0,name_board,creator_thread,creator_id_thread,name_thread,created_on,registration_date,total_posts,reputation,creator_post,creator_id_post,content,created_on_post,top_keyword_1,top_keyword_2,top_keyword_3
0,Tor,blonger,32544.0,Tor shrinked contribute,2020-01-09,1900-01-01 00:00:00,0.0,0.0,[deleted],37.0,[removed],2019-10-16,shrinked,contribute,tor
1,Tor,blonger,32544.0,Tor shrinked contribute,2020-01-09,1900-01-01 00:00:00,0.0,0.0,blonger,32544.0,The Torproject recently ***LINK***announced[ht...,2019-10-30,shrinked,contribute,tor
2,Tor,Xanitforthecash,2467.0,run tor node,2020-01-09,2018-03-21 00:00:00,88.0,9.0,rswz,32661.0,***LINK***http://ea5faa5po25cf7fb.onion/projec...,2019-10-16,node,run,tor
3,Tor,Xanitforthecash,2467.0,run tor node,2020-01-09,2018-03-21 00:00:00,88.0,9.0,ChemistWeb,33150.0,if someone will do something bad from that ip ...,2019-10-16,node,run,tor
4,Tor,Xanitforthecash,2467.0,run tor node,2020-01-09,2018-03-21 00:00:00,88.0,9.0,Xanitforthecash,2467.0,I already leave tor open for a good amount of ...,2019-10-16,node,run,tor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279432,,,,,,,,,,,,,danknationlotto,powerball,prize
289044,,,,,,,,,,,,,bugreport,hugbunter,zzzzz
289045,,,,,,,,,,,,,bugreport,hugbunter,zzzzz
289046,,,,,,,,,,,,,bugreport,hugbunter,zzzzz


In [8]:
df.to_csv('cleaned_data_name_thread_TF-IDF_top_keywords.csv', index=False)