In [1]:
import pandas as pd
import glob

def load_all_csv_to_df(directory_path):
    """Read every ``*.csv`` file directly inside a directory into one DataFrame.

    Files are read in sorted path order so the row order of the result is
    reproducible between runs (``glob.glob`` returns matches in arbitrary
    order).

    Parameters
    ----------
    directory_path : str or path-like
        Directory to search. Only files directly inside it are matched;
        sub-directories are not searched.

    Returns
    -------
    pandas.DataFrame
        The rows of all matched files stacked on top of each other, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no CSV file is found in ``directory_path``.
    """
    # Pattern matching all CSV files in the directory
    csv_pattern = f"{directory_path}/*.csv"

    # Sorted so the concatenation order does not depend on directory listing order
    csv_files = sorted(glob.glob(csv_pattern))

    if not csv_files:
        # pd.concat([]) would raise ValueError("No objects to concatenate");
        # keep the exception type but say which location was empty.
        raise ValueError(f"No CSV files found matching '{csv_pattern}'")

    # Read each CSV file into its own DataFrame
    df_list = [pd.read_csv(file) for file in csv_files]

    # Stack them into one frame, discarding the per-file row indices
    return pd.concat(df_list, ignore_index=True)

# Usage: load every transcript CSV found directly under ../data/transcripts
# (a relative path) into a single DataFrame.
directory_path = "../data/transcripts"
combined_df = load_all_csv_to_df(directory_path)


In [2]:
# Inspect the combined frame as plain text; the output elides the middle
# rows and wraps the columns.
print(combined_df)

                                                   Name  \
0                                  Lisbeth Bech-Nielsen   
1                   Første næstformand Leif Lahn Jensen   
2     Social- og boligministeren Pernille Rosenkrant...   
3                   Første næstformand Leif Lahn Jensen   
4                                  Lisbeth Bech-Nielsen   
...                                                 ...   
1114                                Kim Edberg Andersen   
1115                    Den fg. formand Bjarne Laustsen   
1116                                     Marianne Bigum   
1117                    Den fg. formand Bjarne Laustsen   
1118                    Den fg. formand Bjarne Laustsen   

                                                   Text      ID  duration  \
0     Når vi er samlet her i Folketingssalen i dag, ...  980460     108.0   
1     Vi siger tak til ordføreren forslagsstillerne....  980495      23.0   
2     Med det her beslutningsforslag ønsker forslags...  980

In [3]:
# Save the combined DataFrame to a CSV file.
# index=False leaves the row index out of the written file.
combined_df.to_csv("../data/combined_transcripts.csv", index=False)

In [4]:
# Load the politician list from politicians.csv (semicolon-separated).
politicians_df = pd.read_csv("../data/politicians.csv", sep=";")

# Keep only the rows of combined_df whose 'Name' is exactly one of the
# entries in politicians_df['Navn']; every other row is left out.
clean = combined_df.loc[combined_df['Name'].isin(politicians_df['Navn'])]


In [5]:
# Display the rows whose 'Name' exactly matched an entry in politicians_df['Navn'].
clean

Unnamed: 0,Name,Text,ID,duration,end,start,start_min,end_min,nr
0,Lisbeth Bech-Nielsen,"Når vi er samlet her i Folketingssalen i dag, ...",980460,108.0,198,90,1:30,3:18,38
4,Lisbeth Bech-Nielsen,"Jeg synes faktisk, det er uordentligt af minis...",980548,60.0,586,526,8:46,9:46,38
8,Lisbeth Bech-Nielsen,SF og vores socialordfører har ikke forpasset ...,980303,36.0,692,656,10:56,11:32,38
20,Brigitte Klintskov Jerkel,"Tak. Jeg kunne godt tænke mig at vide, hvorfor...",980448,61.0,973,912,15:12,16:13,38
24,Brigitte Klintskov Jerkel,"Jeg synes ærlig talt, det er lidt ufint, at mi...",980456,42.0,1081,1039,17:19,18:01,38
...,...,...,...,...,...,...,...,...,...
1108,Morten Messerschmidt,"Tak, formand. Der er ingen tvivl om, at jordfo...",980921,175.0,13434,13259,220:59,223:54,40
1110,Marianne Bigum,"Mange tak, og mange tak til ordføreren for til...",980923,49.0,13487,13438,223:58,224:47,40
1112,Morten Messerschmidt,"Med det forbehold, at jeg ikke kender ordfører...",980925,48.0,13537,13489,224:49,225:37,40
1114,Kim Edberg Andersen,"Tak for det. Selv om jeg ikke er i tvivl om, a...",980927,298.0,13850,13552,225:52,230:50,40


In [6]:
# Save the exact-match rows to CSV; index=False leaves the row index out of the file.
clean.to_csv("../data/clean_transcripts.csv", index=False)

In [8]:
import pandas as pd

# Step 1: Tokenize the names
# Set of every individual word that occurs in any politician's full name.
# dropna() keeps a missing 'Navn' entry from breaking the string join.
unique_name_words = set(" ".join(politicians_df['Navn'].dropna()).split())


# Step 2: Filter out unwanted words (titles etc.)
def remove_unwanted_words(name):
    """Reduce `name` to the words that occur in some politician's name.

    Words are split on whitespace and compared exactly against
    `unique_name_words`. Returns the surviving words joined by single
    spaces, or an empty string when no word survives.
    """
    return ' '.join(word for word in name.split() if word in unique_name_words)


# Drop incomplete rows first so a missing 'Name' can never reach .split(),
# then build a new frame with the stripped names instead of overwriting the
# column of the existing frame in place.
combined_df = (
    combined_df
    .dropna()
    .assign(Name=lambda frame: frame['Name'].apply(remove_unwanted_words))
)


# Step 3: Keep only exact full-name matches
# Word filtering alone can leave partial names (a lone first or last name),
# so require the stripped name to equal a complete politician name.
valid_full_names = set(politicians_df['Navn'].dropna())

# Despite its name, this frame holds the title-stripped rows that matched.
df_with_titles = combined_df[combined_df['Name'].isin(valid_full_names)]

# Result: show and save the filtered frame, not combined_df, so rows with
# partial or empty names do not end up in the output file.
print(df_with_titles)

# save with new name
df_with_titles.to_csv("../data/clean_maybe.csv", index=False)


                            Name  \
0           Lisbeth Bech-Nielsen   
1               Leif Lahn Jensen   
2     Pernille Rosenkrantz-Theil   
3               Leif Lahn Jensen   
4           Lisbeth Bech-Nielsen   
...                          ...   
1114         Kim Edberg Andersen   
1115             Bjarne Laustsen   
1116              Marianne Bigum   
1117             Bjarne Laustsen   
1118             Bjarne Laustsen   

                                                   Text      ID  duration  \
0     Når vi er samlet her i Folketingssalen i dag, ...  980460     108.0   
1     Vi siger tak til ordføreren forslagsstillerne....  980495      23.0   
2     Med det her beslutningsforslag ønsker forslags...  980506     300.0   
3     Tak til ministeren. Der er en række spørgere, ...  980517       5.0   
4     Jeg synes faktisk, det er uordentligt af minis...  980548      60.0   
...                                                 ...     ...       ...   
1114  Tak for det. Selv om j

In [22]:
import pandas as pd

# Example DataFrames
# Small hand-made frame of speaker labels used to try out remove_titles below.
df_with_titles = pd.DataFrame({'name': ['Formanden Søren Gade', 'Minister Jens Joel', 'Martin Hansen']})
#df_names = pd.DataFrame({'name': ['Søren Gade', 'Jens Joel']})
print(df_with_titles)

# NOTE(review): an earlier cell reads this same file with delimiter=";" and uses
# its 'Navn' column, whereas this read uses the default separator and
# remove_titles looks up a 'name' column — confirm which file layout is current.
politicians = pd.read_csv('../data/politicians.csv')

def remove_titles(df_with_titles, df_names, column='name', names_column='name'):
    """Strip titles from speaker names and keep only known full names.

    Each value in ``df_with_titles[column]`` is reduced to the words that
    occur in at least one name in ``df_names[names_column]``. Rows whose
    reduced value is not exactly one of those names are then dropped.

    The input frames are not modified; a new DataFrame is returned.

    Parameters
    ----------
    df_with_titles : pandas.DataFrame
        Frame whose ``column`` holds names that may carry extra words.
    df_names : pandas.DataFrame
        Frame whose ``names_column`` holds the valid full names.
    column : str, default 'name'
        Column of ``df_with_titles`` to clean.
    names_column : str, default 'name'
        Column of ``df_names`` holding the valid full names.

    Returns
    -------
    pandas.DataFrame
        Copy of the matching rows of ``df_with_titles`` (original index
        labels kept) with ``column`` replaced by the cleaned name.
    """
    # Missing entries cannot be joined or matched, so leave them out.
    known_names = df_names[names_column].dropna()

    # Every individual word that occurs in any valid name
    unique_name_words = set(" ".join(known_names).split())

    # Complete names, used for the final exact-match check
    valid_full_names = set(known_names)

    def remove_unwanted_words(name):
        """Keep only the words of `name` that occur in some valid name."""
        if not isinstance(name, str):
            # e.g. NaN for a missing name; the empty result fails the match below
            return ''
        return ' '.join(word for word in name.split() if word in unique_name_words)

    # Work on a copy so the caller's frame keeps its original values.
    cleaned = df_with_titles.copy()
    cleaned[column] = cleaned[column].apply(remove_unwanted_words)

    # Word filtering can leave partial names; keep exact full-name matches only.
    return cleaned[cleaned[column].isin(valid_full_names)]

# Clean the example names against the loaded politicians frame and rebind
# df_with_titles to the returned (filtered) frame.
df_with_titles = remove_titles(df_with_titles, politicians)
# Result
print(df_with_titles)


                   name
0  Formanden Søren Gade
1    Minister Jens Joel
2         Martin Hansen
         name
0  Søren Gade
1   Jens Joel
