# Removing Duplicates

In TOPCAT I looked for duplicates in the step 4 files using a separation of 0.05 arcseconds. With a larger separation, the matched rows are no longer true duplicates. 
I found that the SMC doesn't have any duplicates in TOPCAT, but the LMC does. 
I'm removing duplicates in two steps:  
1. I remove rows that are exactly the same, but were flagged as different due to floating point precision in the coordinates. 
2. I take a group of duplicates and check if the only difference between them is that some elements are nan as opposed to finite. I then replace the nans with those finite elements and remove the duplicated row. 

In [None]:
import pandas as pd
import numpy as np

# Load step 4 data
f = 'C:/Projects/0_Data/SUMS_CompleteCatalog/Step4/lmc_photometry_w_duplicates.csv'
df = pd.read_csv(f)


# Table exported from TOPCAT; rows it flagged as matches carry a GroupID and GroupSize.
tf = pd.read_csv('C:/Projects/0_Data/SUMS_CompleteCatalog/Step4/lmc_photometry_topcat_groups.csv')
# It looks like all the duplicates only come in pairs
group_size = tf.GroupSize
print("Max group size: ",group_size.max())
print("Number of rows: ",tf.shape[0])
# Halve the number of flagged rows: in each pair one row is real and one is the duplicate.
n_dupes = tf.loc[~tf.GroupID.isna()].shape[0]/2
print("Number of duplicated rows: ",n_dupes)

# Make a key so it's easier to find the duplicates.
tf['key'] = np.arange(tf.shape[0])

# Columns used to decide whether two rows are the same measurement.
# ra/dec are left out because of float precision, and the TOPCAT group columns and the
# key we just made are bookkeeping. They are excluded by name (not by position) so the
# selection does not silently shift if the column order changes; a missing label raises.
columns = tf.columns.drop(['ra','dec','GroupID','GroupSize','key'])

Max group size:  2.0
Number of rows:  472388
Number of duplicated rows:  881.0


## Step 1: 

In [2]:
# Example of a step 1 pair: every column matches except ra/dec,
# which differ only beyond the displayed precision.
rows = tf.loc[11036:11037]
first_ra = rows["ra"].iloc[0]
second_ra = rows["ra"].iloc[1]
print("Are the ra elements the same? ", first_ra == second_ra)
rows

Are the ra elements the same?  False


Unnamed: 0,ra,dec,U,e_U,B,e_B,V,e_V,I,e_I,...,uvw1_resid_frac_std,uvw1_std_unweighted,uvw1_num5,uvw1_num2p5,uvw1_num1,uvw1_closest_min,uvw1_dist_moved,GroupID,GroupSize,key
11036,75.932325,-69.39285,16.163,0.043,16.559,0.059,16.39,0.057,16.333,0.06,...,0.033558,0.247978,2.0,0.0,0.0,4.005221,0.051471,2.0,2.0,11036
11037,75.932325,-69.39285,16.163,0.043,16.559,0.059,16.39,0.057,16.333,0.06,...,0.033558,0.247978,2.0,0.0,0.0,4.005221,0.051471,2.0,2.0,11037


In [3]:
# Step 1: remove rows that are exact copies of their group partner (ignoring ra/dec).
# Assumes every TOPCAT group is a pair, as reported by the max group size printed earlier.

# Get the unique group ids
ids = tf.loc[~tf.GroupID.isna(),'GroupID'].unique()
# Split the flagged rows by group once, instead of scanning the whole table per group
groups_by_id = tf.loc[~tf.GroupID.isna()].groupby('GroupID')

drop_ids = []   # keys of the redundant rows to delete
group_ids = []  # keys of every row belonging to an exactly-equal pair

# For each thing TOPCAT flagged as being a duplicate
for group_id in ids:
    # Grab everything in that group
    group = groups_by_id.get_group(group_id).reset_index(drop=True)
    # If group is completely equal, then remove everything but the first row.
    # Series.equals treats NaNs in the same position as equal.
    if group.loc[0,columns].equals(group.loc[1,columns]):
        indices = group['key'].values
        group_ids.append(indices)
        drop_ids.append(indices[1])

# np.concatenate raises on an empty list, so handle the "no exact duplicates" case
if group_ids:
    group_ids = np.concatenate(group_ids)
else:
    group_ids = np.array([], dtype=tf['key'].dtype)

# Keep what we're dropping in a separate df
drop_tf = tf.loc[tf['key'].isin(drop_ids)].copy()
# The rows that are equal
equal_tf = tf.loc[tf['key'].isin(group_ids)].copy()
# Reset group id so we don't consider them further
tf.loc[tf['key'].isin(group_ids),'GroupID'] = np.nan
# Lose the duplicates
tf = tf.loc[~tf['key'].isin(drop_ids)].reset_index(drop=True)

print("Rows removed: ",drop_tf.shape[0])
print("New dataframe size: ",tf.shape[0])
n_dupes2 = n_dupes - drop_tf.shape[0]
print("Dupes remaining: ",n_dupes2)

Rows removed:  351
New dataframe size:  472037
Dupes remaining:  530.0


## Step 2:

In [4]:
# Example of a step 2 pair: the rows agree wherever both have a value,
# but one of them is missing entries that the other has.
pd.set_option('display.max_columns', 500)
rows = tf.loc[8342:8343].reset_index(drop=True)
for col in columns:
    top, bottom = rows[col].iloc[0], rows[col].iloc[1]
    both_finite = np.isfinite(top) and np.isfinite(bottom)
    # Only a disagreement between two finite values counts as a real difference
    if both_finite and top != bottom:
        print("Column is not the same: ",col)
rows

Unnamed: 0,ra,dec,U,e_U,B,e_B,V,e_V,I,e_I,uvw2_num_obs,uvw2_num_outliers,uvw2_mag,uvw2_mag_err,uvw2_mag_std,uvw2_flux_frac,uvw2_flux_frac_std,uvw2_resid_frac,uvw2_resid_frac_std,uvw2_std_unweighted,uvw2_num5,uvw2_num2p5,uvw2_num1,uvw2_closest_min,uvw2_dist_moved,uvm2_num_obs,uvm2_num_outliers,uvm2_mag,uvm2_mag_err,uvm2_mag_std,uvm2_flux_frac,uvm2_flux_frac_std,uvm2_resid_frac,uvm2_resid_frac_std,uvm2_std_unweighted,uvm2_num5,uvm2_num2p5,uvm2_num1,uvm2_closest_min,uvm2_dist_moved,uvw1_num_obs,uvw1_num_outliers,uvw1_mag,uvw1_mag_err,uvw1_mag_std,uvw1_flux_frac,uvw1_flux_frac_std,uvw1_resid_frac,uvw1_resid_frac_std,uvw1_std_unweighted,uvw1_num5,uvw1_num2p5,uvw1_num1,uvw1_closest_min,uvw1_dist_moved,GroupID,GroupSize,key
0,75.85722,-69.18017,16.771,0.053,17.0,0.033,16.839,0.037,17.114,0.043,3.0,0.0,16.84889,0.04756,0.066529,0.868941,0.015112,0.070641,0.013111,0.069005,3.0,0.0,0.0,3.264423,0.046578,2.0,0.0,16.675382,0.068086,0.080844,0.924575,0.004453,0.090275,0.060773,0.129463,2.0,0.0,0.0,3.282053,0.039688,,,,,,,,,,,,,,,,1.0,2.0,8342
1,75.85722,-69.18017,16.771,0.053,17.0,0.033,16.839,0.037,17.114,0.043,3.0,0.0,16.84889,0.04756,0.066529,0.868941,0.015112,0.070641,0.013111,0.069005,3.0,0.0,0.0,3.264423,0.046578,2.0,0.0,16.675382,0.068086,0.080844,0.924575,0.004453,0.090275,0.060773,0.129463,2.0,0.0,0.0,3.282053,0.039688,1.0,0.0,16.821537,0.097883,,0.874143,,0.088685,,,3.0,0.0,0.0,3.21816,0.108944,1.0,2.0,8343


In [5]:
# Step 2: for every group still flagged, fill NaNs in the first row with the finite
# values from the second row, write the merged row back into tf, and queue the second
# row for removal. Assumes each remaining group has exactly two rows.
ids = tf.loc[~tf.GroupID.isna(),'GroupID'].unique()
drop_ids = []   # keys of the second row of each pair (removed in a later cell)
keep_ids = []   # keys of the merged first row of each pair

for group_id in ids:
    # Get the group
    group = tf.loc[tf.GroupID == group_id].reset_index(drop=True)
    first_row = group.loc[0]
    second_row = group.loc[1]
    for column in columns:
        # If one element is nan but the other is finite then replace it.
        if np.isnan(first_row[column]) and np.isfinite(second_row[column]):
            group.loc[0,column] = group.loc[1,column]

        # If both values are finite but unequal print them
        elif np.isfinite(first_row[column]) and np.isfinite(second_row[column]) and first_row[column] != second_row[column]:
            # The values printing out seem to only be due to floating point issues.
            # NOTE: the second row is still dropped below, so review anything printed here.
            # Double-quoted f-string: reusing the same quote inside the braces is a
            # SyntaxError before Python 3.12.
            print(f"Group Number: {group.loc[0,'GroupID']} had finite but unequal columns: {group.loc[0,column]},{group.loc[1,column]}")

    # Overwrite the first row in tf with its merged version
    tf.loc[tf['key'] == first_row['key']] = group.loc[0].values
    drop_ids.append(group.loc[1,'key'])
    keep_ids.append(group.loc[0,'key'])



Group Number: 87.0 had finite but unequal columns: 0.372,0.3720000000000001
Group Number: 803.0 had finite but unequal columns: 0.372,0.3720000000000001


In [6]:
# Re-inspect the example pair: after the fill the two rows should agree everywhere
rows = tf.loc[8342:8343].reset_index(drop=True)
for col in columns:
    top, bottom = rows[col].iloc[0], rows[col].iloc[1]
    if top == bottom:
        continue
    # Two NaNs never compare equal, so only report a mismatch when both values are finite
    if np.isfinite(top) and np.isfinite(bottom):
        print("Column is not the same: ",col)
rows

Unnamed: 0,ra,dec,U,e_U,B,e_B,V,e_V,I,e_I,uvw2_num_obs,uvw2_num_outliers,uvw2_mag,uvw2_mag_err,uvw2_mag_std,uvw2_flux_frac,uvw2_flux_frac_std,uvw2_resid_frac,uvw2_resid_frac_std,uvw2_std_unweighted,uvw2_num5,uvw2_num2p5,uvw2_num1,uvw2_closest_min,uvw2_dist_moved,uvm2_num_obs,uvm2_num_outliers,uvm2_mag,uvm2_mag_err,uvm2_mag_std,uvm2_flux_frac,uvm2_flux_frac_std,uvm2_resid_frac,uvm2_resid_frac_std,uvm2_std_unweighted,uvm2_num5,uvm2_num2p5,uvm2_num1,uvm2_closest_min,uvm2_dist_moved,uvw1_num_obs,uvw1_num_outliers,uvw1_mag,uvw1_mag_err,uvw1_mag_std,uvw1_flux_frac,uvw1_flux_frac_std,uvw1_resid_frac,uvw1_resid_frac_std,uvw1_std_unweighted,uvw1_num5,uvw1_num2p5,uvw1_num1,uvw1_closest_min,uvw1_dist_moved,GroupID,GroupSize,key
0,75.85722,-69.18017,16.771,0.053,17.0,0.033,16.839,0.037,17.114,0.043,3.0,0.0,16.84889,0.04756,0.066529,0.868941,0.015112,0.070641,0.013111,0.069005,3.0,0.0,0.0,3.264423,0.046578,2.0,0.0,16.675382,0.068086,0.080844,0.924575,0.004453,0.090275,0.060773,0.129463,2.0,0.0,0.0,3.282053,0.039688,1.0,0.0,16.821537,0.097883,,0.874143,,0.088685,,,3.0,0.0,0.0,3.21816,0.108944,1.0,2.0,8342
1,75.85722,-69.18017,16.771,0.053,17.0,0.033,16.839,0.037,17.114,0.043,3.0,0.0,16.84889,0.04756,0.066529,0.868941,0.015112,0.070641,0.013111,0.069005,3.0,0.0,0.0,3.264423,0.046578,2.0,0.0,16.675382,0.068086,0.080844,0.924575,0.004453,0.090275,0.060773,0.129463,2.0,0.0,0.0,3.282053,0.039688,1.0,0.0,16.821537,0.097883,,0.874143,,0.088685,,,3.0,0.0,0.0,3.21816,0.108944,1.0,2.0,8343


In [7]:
# Rows queued for removal in step 2 (matched on key, so the GroupID reset below does not affect it)
is_dropped = tf['key'].isin(drop_ids)
is_kept = tf['key'].isin(keep_ids)

# Set aside a copy of the rows being removed
drop_tf2 = tf.loc[is_dropped].copy()

# Clear the group id on the merged rows so they are no longer treated as duplicates
tf.loc[is_kept,'GroupID'] = np.nan

# Remove the duplicates and renumber the index
tf = tf.loc[~is_dropped].reset_index(drop=True)

n_removed = drop_tf2.shape[0]
print("Rows removed: ",n_removed)
print("New dataframe size: ",tf.shape[0])
n_dupes3 = n_dupes2 - n_removed
print("Dupes remaining: ",n_dupes3)

Rows removed:  530
New dataframe size:  471507
Dupes remaining:  0.0


## Step 3:

In [8]:
# The bookkeeping columns (TOPCAT group id / group size and our key) are no longer
# needed, so strip them and write the de-duplicated catalog.
# I'll double check in TOPCAT that no groups remain.
bookkeeping_columns = ['GroupID','GroupSize','key']
tf = tf.drop(columns=bookkeeping_columns)
tf = tf.reset_index(drop=True)
print('Final shape: ',tf.shape)
tf.to_csv('C:/Projects/0_Data/SUMS_CompleteCatalog/Step4/lmc_photometry.csv',index=False)

Final shape:  (471507, 55)
