In [1]:
import pandas as pd
from collections import Counter
import re
import numpy as np
import textfunctions as tf


Preparing the text

In [None]:
# Titles table; it is loaded here but not referenced again in the cells shown below.
df_names = pd.read_csv('Data/titles.csv')

# Build the text table in one pass: drop exact repeats of (Name, Text, Community ID),
# replace missing texts with '' and run the project helper over every text.
# NOTE(review): presumably `tf.remove_names` strips personal names and `tf.stem`
# stems the tokens — confirm against textfunctions.py.
df_all = (
    pd.read_csv('text_and_community_infomap.csv')
    .drop_duplicates(subset=['Name', 'Text', 'Community ID'])
    .assign(Text=lambda frame: frame['Text'].fillna('').apply(tf.remove_names))
)
df_all['Stemmed Text'] = df_all['Text'].apply(tf.stem)

# All stemmed texts joined into a single space-separated string.
corpus = ' '.join(df_all['Stemmed Text'])

# Persist the prepared table; the loading cell below reads this file back.
df_all.to_csv('preppedtext.csv', index=False)

Opening and preparing dataframes

In [2]:
# Community tables, one per language edition (semicolon-separated files).
# The English file is the one without a language suffix.
df_de, df_fr, df_es, df_en = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'all_communities_infomap_de.csv',
        'all_communities_infomap_fr.csv',
        'all_communities_infomap_es.csv',
        'all_communities_infomap.csv',
    )
)

# Prepared texts written by the text-prep cell; only the join key and the stemmed text are kept.
df_text = pd.read_csv('preppedtext.csv')[['Name', 'Stemmed Text']]

In [3]:
def prepdataframe(df, year=2024, text_df=None):
    """Expand one language's community table to member level and build community documents.

    Parameters
    ----------
    df : pandas.DataFrame
        Community table with the columns 'Year', 'Community ID', 'Percentage of Women',
        'Members' (a single ', '-separated string of member names) and 'Size'.
    year : int, default 2024
        Only rows whose 'Year' equals this value are used.
    text_df : pandas.DataFrame, optional
        Lookup table with the columns 'Name' and 'Stemmed Text'. Defaults to the
        notebook-level ``df_text`` frame.

    Returns
    -------
    tuple
        ``(members, df_com, communities)`` where ``members`` has one row per
        (community, member) with the member's stemmed text ('' when no text was found),
        ``df_com`` has one row per community with all member texts joined into one
        document, and ``communities`` is the list of community ids in ``df_com`` order.
    """
    if text_df is None:
        text_df = df_text  # notebook-level table loaded in the data-loading cell

    # Work on a fresh frame instead of assigning into a filtered slice of the input
    # (the original did, which raises SettingWithCopyWarning). `.str.split` also
    # tolerates a missing 'Members' value instead of crashing on it.
    members = (
        df.loc[df['Year'] == year,
               ['Community ID', 'Percentage of Women', 'Members', 'Size']]
        .assign(Members=lambda frame: frame['Members'].str.split(', '))
        .explode('Members')
        .rename(columns={'Members': 'Name'})
    )

    # NOTE(review): if `text_df` holds several rows for the same Name, this left merge
    # repeats that member's row once per match — confirm the lookup table is unique per Name.
    members = members.merge(text_df, on='Name', how='left')
    members['Stemmed Text'] = members['Stemmed Text'].fillna('')

    # Join with a space: summing the strings glued the last word of one member's text
    # to the first word of the next, creating tokens that exist in no text.
    df_com = (
        members.groupby('Community ID')['Stemmed Text']
        .agg(' '.join)
        .reset_index()
    )
    communities = df_com['Community ID'].tolist()
    return members, df_com, communities

Calculating TF-IDF scores

In [4]:
def tfidf(df, communities):
    """Score every word of every community document with TF-IDF.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per community with the columns 'Community ID' and 'Stemmed Text'
        (the community's whole document as a single string).
    communities : list
        Community ids to score; each must occur in ``df['Community ID']``.

    Returns
    -------
    dict
        Maps each community id to a list of ``(word, score)`` tuples sorted by
        descending score, where
        ``score = (count / tokens in the community) * log(len(df) / communities containing the word)``.
    """
    token_pattern = re.compile(r'\b\w+\b')

    # One pass over the frame instead of re-filtering it for every community.
    # `setdefault` keeps the first row per id, as the original `.values[0]` lookup did.
    text_by_id = {}
    for comid, text in zip(df['Community ID'].tolist(), df['Stemmed Text'].tolist()):
        text_by_id.setdefault(comid, text)

    community_word_counts = {
        comid: Counter(token_pattern.findall(text_by_id[comid])) for comid in communities
    }

    # Number of communities containing each word, computed once. The original re-scanned
    # every community for every word ("other communities + 1"), which gives the same number.
    document_frequency = Counter()
    for word_counts in community_word_counts.values():
        document_frequency.update(word_counts.keys())

    total_documents = len(df)
    tfidf_scores = {}
    for communityid in communities:
        word_counts = community_word_counts[communityid]
        # Count tokens with the same tokenizer that produced the word counts, so the
        # term frequencies of a community sum to 1. The original divided by the number of
        # whitespace-separated tokens, which differs whenever a token contains punctuation.
        token_total = sum(word_counts.values())
        scores = {
            word: (count / token_total) * np.log(total_documents / document_frequency[word])
            for word, count in word_counts.items()
        }
        tfidf_scores[communityid] = sorted(
            scores.items(), key=lambda item: item[1], reverse=True
        )
    return tfidf_scores
def get_top_words(tfidf_scores, communities, n_words=30):
    """Return the highest-scoring words of each community.

    Parameters
    ----------
    tfidf_scores : dict
        Maps a community id to a list of ``(word, score)`` tuples that is already
        sorted by descending score (the output of ``tfidf``).
    communities : list
        Community ids to extract words for; each must be a key of ``tfidf_scores``.
    n_words : int, default 30
        Maximum number of words kept per community. Communities with fewer scored
        words simply return a shorter list.

    Returns
    -------
    dict
        Maps each community id to its list of top words, best first.
    """
    return {
        comid: [word for word, _score in tfidf_scores[comid][:n_words]]
        for comid in communities
    }

In [5]:
def get_scores(df):
    """Build the per-community summary table for one language edition.

    Runs ``prepdataframe`` on the raw community table, scores the community documents
    with ``tfidf`` and keeps the top words returned by ``get_top_words``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw community table as accepted by ``prepdataframe``.

    Returns
    -------
    pandas.DataFrame
        One row per community with the columns 'Community ID', 'Size',
        'Percentage of female members' and 'Top words' (a list of words).
    """
    members, community_docs, communities = prepdataframe(df)
    top_words = get_top_words(tfidf(community_docs, communities), communities)

    # 'Size' and 'Percentage of Women' are read from the first member row of each
    # community. Build that lookup once instead of boolean-filtering the whole member
    # frame twice per community.
    first_rows = members.drop_duplicates(subset='Community ID').set_index('Community ID')

    return pd.DataFrame({
        'Community ID': list(communities),
        'Size': list(first_rows.loc[communities, 'Size'].to_numpy()),
        'Percentage of female members': list(
            first_rows.loc[communities, 'Percentage of Women'].to_numpy()
        ),
        'Top words': [top_words[com] for com in communities],
    })

In [None]:
# Score the four language editions in the order German, French, Spanish, English.
output_de, output_fr, output_es, output_en = (
    get_scores(edition) for edition in (df_de, df_fr, df_es, df_en)
)

# Tag each table with its language code so rows stay identifiable after concatenation.
for lang_code, scores in zip(
    ('de', 'fr', 'es', 'en'),
    (output_de, output_fr, output_es, output_en),
):
    scores['Lang'] = lang_code

In [7]:
# Keep only the communities whose 'Size' is strictly greater than 50.
output_de, output_fr, output_es, output_en = (
    scores[scores['Size'] > 50]
    for scores in (output_de, output_fr, output_es, output_en)
)

Converting to a table that fits in the appendix

In [8]:

def convert(df):
    """Reshape a community summary into three rows of ten top words per community.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary table with the columns 'Community ID', 'Size',
        'Percentage of female members', 'Top words' (a list of up to 30 words)
        and 'Lang'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Long-format table with the columns 'Community ID', 'Size',
        'Percentage of female members', 'Lang', 'NewColumn' (one of 'Topwords1',
        'Topwords2', 'Topwords3'), 'NewValue' (that chunk's words joined by ', ')
        and 'Community ID-NewColumn', ordered by community and then by chunk.
    """
    top_words = df['Top words']
    # `assign` returns a new frame. The original added these columns to the caller's
    # frame, which is a filtered slice in this notebook (SettingWithCopyWarning) and
    # left stray helper columns behind on the input.
    wide = df.assign(
        Topwords1=top_words.apply(lambda words: ', '.join(words[:10])),
        Topwords2=top_words.apply(lambda words: ', '.join(words[10:20])),
        Topwords3=top_words.apply(lambda words: ', '.join(words[20:30])),
    )[['Community ID', 'Size', 'Percentage of female members',
       'Topwords1', 'Topwords2', 'Topwords3', 'Lang']]

    reshaped_df = pd.melt(
        wide,
        id_vars=['Community ID', 'Size', 'Percentage of female members', 'Lang'],
        value_vars=['Topwords1', 'Topwords2', 'Topwords3'],
        var_name='NewColumn',
        value_name='NewValue',
    )
    reshaped_df['Community ID-NewColumn'] = (
        reshaped_df['Community ID'].astype(str) + '-' + reshaped_df['NewColumn']
    )

    # Sort on the real id and then the chunk name. Sorting on the combined string key
    # is lexicographic and would put community 10 before community 2.
    return reshaped_df.sort_values(['Community ID', 'NewColumn'])

# Reshape every language table into the appendix layout.
# NOTE: the results are bound to the same names, so this cell can only run once per
# kernel session — the converted tables no longer have a 'Top words' column.
output_de, output_fr, output_es, output_en = map(
    convert, (output_de, output_fr, output_es, output_en)
)




Save TF-IDF data

In [9]:
# Stack the four language tables (German, French, Spanish, English) and write them out.
language_tables = [output_de, output_fr, output_es, output_en]
output = pd.concat(language_tables)
output.to_csv('TFIDF-top30.csv', index=False)