# Update Script for candidates
Takes new CSV files and old CSV files as input and removes from the new files every candidate pair that already appears in one of the old files.

In [1]:
import pandas as pd
import numpy as np

## Input Files

### New CSVs

In [2]:
# Directory that holds the freshly generated candidate CSVs.
new_dir_prefix = "./output/"
# Full paths of the new candidate files, built from the prefix and the bare
# file names. The filter cell further down reads these files and overwrites them.
new_csv_dirs = [
    new_dir_prefix + file_name
    for file_name in (
        "candidates_nsp.with_categories.20000.csv",
        "candidates_random.with_categories.20000.csv",
        "candidates_similarity.with_categories.20000.csv",
    )
]

### Old CSVs

In [3]:
# Directory that holds the previously generated candidate CSVs.
old_dir_prefix = "./old_output/"
# Full paths of all earlier exports, built from the prefix and the bare file
# names. These files are concatenated into the reference frame `old_df`.
old_csv_dirs = [
    old_dir_prefix + file_name
    for file_name in (
        "candidates_nsp.10000.csv",
        "candidates_nsp.with_categories.10000.csv",
        "candidates_random.10000.csv",
        "candidates_random.with_categories.10000.csv",
        "candidates_similarity.10000.csv",
        "candidates_similarity.with_categories.10000.csv",
        "candidates_nsp.with_categories.10000.03.06.csv",
        "candidates_similarity.with_categories.10000.03.06.csv",
    )
]

## Run Filter

### Create Reference Dataframe of old CSVs

In [4]:
# Read every old CSV into its own frame, then stack them into one reference
# frame. `ignore_index=True` gives the combined frame a fresh 0..n-1 index
# instead of repeating each file's own row numbers.
old_csv_list = [pd.read_csv(csv_path) for csv_path in old_csv_dirs]

old_df = pd.concat(old_csv_list, ignore_index=True)
old_df

Unnamed: 0,a_doc_id,a_start,a_end,a_text,a_url,a_title,b_doc_id,b_start,b_end,b_text,b_url,b_title,a_categories,b_categories
0,741,54,84,New leadership could prove to be the key to re...,https://en.wikinews.org/wiki?curid=741,Palestinians to elect new president on January 9,88082,184,207,The month-long war began in July last year whe...,https://en.wikinews.org/wiki?curid=88082,Israel says cluster bomb use was legal,,
1,741,54,84,New leadership could prove to be the key to re...,https://en.wikinews.org/wiki?curid=741,Palestinians to elect new president on January 9,75913,297,320,This will be the largest peacekeeping force ev...,https://en.wikinews.org/wiki?curid=75913,"UN to send troops to Darfur, Sudan",,
2,741,85,120,The Haaretz had initially reported that former...,https://en.wikinews.org/wiki?curid=741,Palestinians to elect new president on January 9,102566,68,79,"Since 1994, she has held various posts at the ...",https://en.wikinews.org/wiki?curid=102566,Zinaida Greceanii nominated Moldovan Prime Min...,,
3,741,85,120,The Haaretz had initially reported that former...,https://en.wikinews.org/wiki?curid=741,Palestinians to elect new president on January 9,20178,120,139,A lawsuit was filed in 2000 against the Palest...,https://en.wikinews.org/wiki?curid=20178,Rhode Island District Court freezes Palestinia...,,
4,736,0,37,"Hu Jintao, the President of the People's Repub...",https://en.wikinews.org/wiki?curid=736,President of China lunches with Brazilian Pres...,140463,171,195,Give us this chance and you will not regret it...,https://en.wikinews.org/wiki?curid=140463,Rio de Janeiro to host 2016 Olympics,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32172,152378,164,195,Gary Jenkins of Evolution Securities told the ...,https://en.wikinews.org/wiki?curid=152378,Concern about sovereign debt of some EU member...,131162,159,191,He's saying that the global economy is startin...,https://en.wikinews.org/wiki?curid=131162,Stock markets worldwide rise on hopes of US ec...,"Europe,Greece,Portugal,Spain,North America,Uni...","2008-09 financial crisis,Economy and business,..."
32173,99084,2006,2029,A protest in Manchester took place outside the...,https://en.wikinews.org/wiki?curid=99084,"Wikinews international report: ""Anonymous"" hol...",98498,1129,1149,There are expected to be at least 217 protests...,https://en.wikinews.org/wiki?curid=98498,"Church of Scientology: '""Anonymous' will be st...","California,Canada,Culture and entertainment,Cr...","Crime and law,Internet,Religion,Science and te..."
32174,183714,161,189,"The problem was a metal frame on the tube, whi...",https://en.wikinews.org/wiki?curid=183714,Efforts to cap Deepwater Horizon oil spill del...,207153,0,42,"Early on Sunday morning, those aboard Developm...",https://en.wikinews.org/wiki?curid=207153,"Deepwater Horizon oil well finally dead, autho...","United States,Disasters and accidents,Economy ...","North America,United States,Disasters and acci..."
32175,2089569,106,119,This was not the only volcanic eruption in the...,https://en.wikinews.org/wiki?curid=2089569,Volcano erupts in southern Chile,123643,149,160,The volcano had not erupted since a four-month...,https://en.wikinews.org/wiki?curid=123643,"Volcano near Anchorage, Alaska erupts, airport...","disasters and accidents,Chile,South America,Vo...","North America,United States,Alaska,Natural dis..."


### Filter New CSVs

In [5]:
# Columns that together identify a candidate pair: two rows describe the same
# pair when all six values are equal.
index_columns = ['a_doc_id', 'a_start', 'a_end', 'b_doc_id', 'b_start', 'b_end']

# Only the key columns of the old data are needed for the lookup. Leaving the
# other old columns out means the merge creates no suffixed "*_y" columns that
# would have to be dropped again by name (which fails as soon as the column
# sets of old and new files differ). Dropping duplicate keys keeps the left
# merge from multiplying rows of the new file.
old_keys = old_df[index_columns].drop_duplicates()

for count, d in enumerate(new_csv_dirs):
    df = pd.read_csv(d)

    # Anti-join: keep the rows of the new file whose key tuple does not occur
    # in the old data. A left merge keeps the rows of `df` in their original
    # order and never adds rows that exist only in the old data.
    # Note: `df[index_columns].isin(old_df[index_columns])` is not sufficient
    # for this, because `isin` with a DataFrame argument aligns on index and
    # column labels instead of comparing whole key tuples.
    merged_df = pd.merge(df, old_keys, on=index_columns, how='left', indicator=True)
    filtered_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])

    print(f"Dataset_{count} Length: {len(filtered_df)} ({len(df)-len(filtered_df)} duplicates dropped)")

    # Overwrite the input file with the filtered rows.
    filtered_df.to_csv(d, index=False)

Dataset_0 Length: 749 (350 duplicates dropped)
Dataset_1 Length: 1000 (0 duplicates dropped)
Dataset_2 Length: 1045 (752 duplicates dropped)
