### Changelog

- Need to create dynamic data loading/preprocessing/saving (for any number of files)

### Import Libraries

In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance
from datetime import datetime 
import os

### Loading Dataset 

In [11]:
def load_df_from_dir(dir_path):
    csv_files = [f for f in os.listdir(dir_path) if f.endswith(".csv")]
    layouts = []

    for csv_file in csv_files:
        file_path = os.path.join(dir_path, csv_file)
        df = pd.read_csv(file_path)
        df['source'] = os.path.splitext(csv_file)[0]
        layouts.append(df)
    
    return layouts

In [12]:
# layouts = load_df_from_dir('dataset_clean')
layouts = load_df_from_dir('../input_files')

### Save DF to CSV

In [13]:
# import os
# import pandas as pd

# def save_dfs_to_csv(layouts, output_dir):

#     os.makedirs(output_dir, exist_ok=True)
    
#     for i, df in enumerate(layouts, start=1):
#         file_name = f"layout{i}.csv"
#         file_path = os.path.join(output_dir, file_name)
#         df.to_csv(file_path, index=False)
#         print(f"Saved {file_path}")


### Data Preprocessing

In [14]:
for layout in layouts:
    layout['last_modified_date'] = datetime.now()

def sanitize(df):
    return df.map(lambda x: x.replace(',', '').replace(' ', '').strip() if isinstance(x, str) else '' if pd.isna(x) else x)
    
def create_soup(df, df_, soup, soup_name):
    df[soup_name] = df_[soup].apply(lambda x: ' '.join(x.values.astype(str)).lower(), axis=1)

layout_copies = [layout.copy() for layout in layouts]
soup = ['Name', 'Date of Birth', 'Father_Name']

for layout, layout_copy, in zip(layouts, layout_copies):
    layout_copy = sanitize(layout_copy)
    create_soup(layout, layout_copy, soup, "soup")

### Entity Matching

In [15]:
def combine_layouts(A, B, metric='cosine', threshold=0.8):
    def calculate_similarity(A, B, metric):
        if metric == 'cosine':
            tfidf = TfidfVectorizer(stop_words='english')
            combined_soup = pd.concat([A['soup'], B['soup']], ignore_index=True)
            tfidf.fit(combined_soup)
            tfidf_matrix_A = tfidf.transform(A['soup'])
            tfidf_matrix_B = tfidf.transform(B['soup'])
            similarity = cosine_similarity(tfidf_matrix_A, tfidf_matrix_B)
            similarity_df = pd.DataFrame(similarity, index=A.index, columns=B.index)
            idx_row = similarity_df.idxmax(axis=1)
            similarity_mask = similarity_df.max(axis=1) > threshold
        else:
            distance_matrix = pd.DataFrame([[distance(a, b) for b in B['soup']] for a in A['soup']], index=A.index, columns=B.index)
            idx_row = distance_matrix.idxmin(axis=1)
            similarity_mask = distance_matrix.min(axis=1) <= threshold
        return idx_row, similarity_mask

    def merge_data(A, B, idx_row, similarity_mask):
        combined_columns = list(set(A.columns) | set(B.columns))
        combined_data = pd.DataFrame(columns=combined_columns)
        for idx_A in A.index:
            if similarity_mask[idx_A]:
                idx_B = idx_row[idx_A]
                combined_row = A.loc[idx_A].combine_first(B.loc[idx_B])
                combined_row['source'] = f"{A.loc[idx_A]['source']}, {B.loc[idx_B]['source']}"
                combined_row['last_modified_date'] = datetime.now()
            else:
                combined_row = A.loc[idx_A]
            combined_data = pd.concat([combined_data, combined_row.to_frame().T], ignore_index=True)
        new_records = B.loc[~B.index.isin(idx_row[similarity_mask].values)]
        return pd.concat([combined_data, new_records], ignore_index=True)

    idx_row, similarity_mask = calculate_similarity(A, B, metric)
    return merge_data(A, B, idx_row, similarity_mask)


### Saving Results

#### Save Intermittent Result

In [16]:
# def save_layouts(layouts, save_path):
#     final_df = layouts[0]
#     results = [final_df] 

#     initial_part = "1"   
#     for i, df in enumerate(layouts[1:], start=2):
#         final_df = combine_layouts(final_df, df)
#         results.append(final_df)
        
#         initial_part += str(i)
#         final_df.to_csv(f"./{save_path}/result{initial_part}.csv", index=False)
    
#     return final_df, results

In [17]:
# final_df, results = save_layouts(layouts, 'results')

#### Save final result only

In [18]:
def char_to_digit(char):
    if char.isdigit():
        return int(char)
    elif char.isalpha():
        return (ord(char.lower()) - ord('a') + 1) % 10
    else:
        return 0

def string_to_digits(s):
    digits = [char_to_digit(char) for char in s]
    numeric_string = ''.join(map(str, digits))
    
    # Ensure the string is exactly 9 digits long
    if len(numeric_string) > 13:
        return numeric_string[:13]
    else:
        return numeric_string.ljust(13, '0')

In [19]:
def save_layouts(layouts, save_path):
    final_df = layouts[0]

    for df in layouts[1:]:
        final_df = combine_layouts(final_df, df)
    
    final_df['uuid'] = final_df['soup'].apply(string_to_digits)
    final_result_path = os.path.join(save_path, 'final_result.csv')
    final_df.to_csv(final_result_path, index=False)
    return final_df

In [22]:
final_df = save_layouts(layouts, '../input_files')

In [23]:
final_df

Unnamed: 0,source,Permanent_Address,votersAge,Mobile Number,Mother Name,Temporary_Address,Date of Birth,Citizenship Number,PAN_Number,Customer Code,...,Gender,Father_Name,Blood Group,License Number,SC Number,Customer ID,National Id,Name,SpouseName,uuid
0,"layout1, layout2, layout3, layout4, layout5","Baluwatar, Kathmandu, Nepal",45.0,1234567890,Laxmi Thapa,"Gongabu, Kathmandu, Nepal",1990-01-01,624-93227-32431/660086,ABCDE1234F,21216874.0,...,Male,Ram Bahadur Thapa,AB+,15-05-58353205,001.01.01,3245.0,AB123C,Ram Thapa,Sita Thapa,8130816101990
1,"layout1, layout2, layout3, layout4, layout5","Lakeside, Pokhara, Nepal",38.0,2345678901,Radha Sharma,"New Road, Pokhara, Nepal",1991-02-02,747-42087-31417/584714,FGHIJ5678K,22359363.0,...,Female,Hari Prasad Shrestha,AB-,21-08-00435579,001.01.02,3246.0,DE456F,Sita Shrestha,Ravi Sharma,9901988590810
2,"layout1, layout2, layout3, layout4, layout5","Chitwan National Park, Chitwan, Nepal",52.0,3456789012,Gita Adhikari,"Pulchowk, Lalitpur, Nepal",1992-03-03,389-45382-93886/821590,LMNOP9012L,33485241.0,...,Male,Gopal Krishna Gurung,B-,93-12-35351480,001.01.03,3247.0,GH789I,Hari Gurung,Maya Adhikari,8189718147019
3,"layout1, layout2, layout3, layout4, layout5","Biratnagar, Morang, Nepal",30.0,4567890123,Mina Rai,"Bagbazar, Kathmandu, Nepal",1993-04-04,571-38785-99733/440035,QRSTU3456M,45475489.0,...,Female,Shyam Lal Tamang,A-,65-03-68139881,001.01.04,3248.0,JK012L,Gita Tamang,Surya Rai,7901013147019
4,"layout1, layout2, layout3, layout4","Bharatpur, Chitwan, Nepal",27.0,5678901234,Kalpana Karki,,1994-05-05,,VWXYZ7890N,56562139.0,...,Male,Krishna Raj Lama,,,001.01.05,3249.0,MN345O,Mohan Lama,Sarita Karki,3581421310199
5,"layout1, layout2, layout3, layout4, layout5","Butwal, Rupandehi, Nepal",41.0,6789012345,Nima Gurung,"Chabahil, Kathmandu, Nepal",1995-06-06,033-51347-62581/380746,ABCD1234PQ,67698214.0,...,Female,Narayan Kumar Magar,B-,82-09-81734599,001.01.06,3250.0,PQ678R,Radha Magar,Pemba Gurung,8148131718019
6,"layout1, layout2, layout3, layout4","Hetauda, Makwanpur, Nepal",36.0,7890123456,Saru Shrestha,"Kumaripati, Lalitpur, Nepal",1996-07-07,,EFGH5678RS,78714635.0,...,Male,Govinda Bahadur Rai,,,001.01.07,3251.0,ST901U,Krishna Rai,Laxmi Shrestha,1899841819019
7,"layout1, layout2, layout3, layout4, layout5","Janakpur, Dhanusa, Nepal",50.0,8901234567,Nanu Maharjan,"New Baneshwor, Kathmandu, Nepal",1997-08-08,323-37869-95909/623481,TUVW9012XY,89847326.0,...,Female,Shiva Narayan Sherpa,A-,98-03-72394228,001.01.08,3252.0,VW234X,Sarita Sherpa,Raj Maharjan,9189019858610
8,"layout1, layout2, layout3, layout4","Nepalgunj, Banke, Nepal",43.0,9012345678,Lhamu Tamang,"Boudha, Kathmandu, Nepal",1998-09-09,,ZABC3456DE,91953421.0,...,Male,Bhagirath Bahadur Karki,,,001.01.09,3253.0,YZ567A,Bikash Karki,Pema Tamang,2911981181901
9,"layout1, layout2, layout3, layout4, layout5","Dharan, Sunsari, Nepal",34.0,123456789,Lila KC,"Kalanki, Kathmandu, Nepal",1999-10-10,272-28301-42325/881177,FGHI7890JK,10203847.0,...,Female,Surya Bahadur Bhandari,A-,42-08-02614125,001.01.10,3254.0,BC890D,Nisha Bhandari,Manish KC,4998128144189


#### Save final_result and delete source files if successful

In [None]:
# import os
# import glob

# def save_layouts(layouts, save_path, save_filename):
#     final_df = layouts[0]

#     for df in layouts[1:]:
#         final_df = combine_layouts(final_df, df)
    
#     final_result_path = os.path.join(save_path, save_filename)
    
#     try:
#         final_df.to_csv(final_result_path, index=False)
#     except Exception as e:
#         print(f"Error saving final result: {e}")
#         return None

#     # If save is successful, delete all other files in save_path except final_result
#     files = glob.glob(os.path.join(save_path, '*'))
#     for f in files:
#         if f != final_result_path:
#             os.remove(f)

#     return final_df

In [None]:
# final_df = save_layouts(layouts, 'results', 'final_result.csv')