In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os

In [24]:
def load_df(filename, file_path='../data'):
    full_path = os.path.join(file_path, f'{filename}.csv')
    try:
        df = pd.read_csv(full_path)
        print(f"File '{filename}.csv' loaded successfully from '{file_path}'.")
        return df
    except FileNotFoundError:
        print(f"Error: The file '{filename}.csv' was not found in the directory '{file_path}'.")
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{filename}.csv' is empty.")
    except pd.errors.ParserError:
        print(f"Error: The file '{filename}.csv' could not be parsed.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [25]:
layout1 = load_df('ABC_layout_1')
layout2 = load_df('PQR_layout_2')
layout3 = load_df('layout_3_voters')
layout4 = load_df('KLM_layout_4')
layout5 = load_df('layout_5_license_dropped')

layout1 = layout1.rename(columns={"First Name": "Name", "Father Name": "Father_Name", "Permanent_Adress":"Permanent_Address"})
layout2 = layout2.rename(columns = {"Customer_ID": "Mobile Number"})
layout3 = layout3.rename(columns={"votersName": "Name", "votersFatherName": "Father_Name", "votersMotherName": "Mother Name", " Gender": "Gender", "Permanent_Adress":"Permanent_Address"})
layout4 = layout4.rename(columns={"Father Name": "Father_Name"})

del layout1["Last Name"]
del layout2["Unnamed: 0"]
del layout4["Unnamed: 0"]

File 'ABC_layout_1.csv' loaded successfully from '../data'.
File 'PQR_layout_2.csv' loaded successfully from '../data'.
File 'layout_3_voters.csv' loaded successfully from '../data'.
File 'KLM_layout_4.csv' loaded successfully from '../data'.
File 'layout_5_license_dropped.csv' loaded successfully from '../data'.


In [26]:
def sanitize(df):
    return df.map(lambda x: x.replace(',', '').replace(' ', '').strip() if isinstance(x, str) else '' if pd.isna(x) else x)

In [27]:
layouts = [layout1, layout2, layout3, layout4, layout5]

In [28]:
layout_copies = []
for layout in layouts:
    try:
        if layout is not None:
            sanitized_layout = sanitize(layout.copy())
            layout_copies.append(sanitized_layout)
        else:
            layout_copies.append(None)
    except Exception as e:
        print(f"An error occurred while sanitizing a layout: {e}")
        layout_copies.append(None)

In [29]:
for i in range(len(layout_copies)):
    layout_copies[i] = sanitize(layout_copies[i])

In [30]:
def create_soup(df, df_, soup, soup_name):
    df[soup_name] = df_[soup].apply(lambda x: ' '.join(x.values.astype(str)).lower(), axis=1)

soup = ['Name', 'Date of Birth', 'Father_Name']

for i, j, k, in zip(layouts, layout_copies, range(len(layouts))):
    create_soup(i, j, soup, f"soup{k+1}")

In [33]:
def combine(A, B, soup_A, soup_B, threshold=0.3):
    # Initialize the TF-IDF Vectorizer
    tfidf = TfidfVectorizer(stop_words='english')
    
    # Combine the textual data for fitting the TF-IDF model
    combined_soup = pd.concat([A[soup_A], B[soup_B]], ignore_index=True)
    tfidf.fit(combined_soup)
    
    # Transform the textual data into TF-IDF matrices
    tfidf_matrix_A = tfidf.transform(A[soup_A])
    tfidf_matrix_B = tfidf.transform(B[soup_B])
    
    # Compute cosine similarity between the two matrices
    similarity = cosine_similarity(tfidf_matrix_A, tfidf_matrix_B)
    similarity_df = pd.DataFrame(similarity, index=A.index, columns=B.index)
    
    # Determine the index of the most similar row in B for each row in A
    max_idx_row = similarity_df.idxmax(axis=1)
    similarity_mask = similarity_df.max(axis=1) >= threshold
    
    # Initialize the combined DataFrame with A, ensuring all columns from both DataFrames
    combined_columns = list(set(A.columns) | set(B.columns))
    combined_data = pd.DataFrame(columns=combined_columns)
    
    # Merge the similar rows 
    for idx_A in A.index:
        if similarity_mask[idx_A]:
            idx_B = max_idx_row[idx_A]
            combined_row = A.loc[idx_A].combine_first(B.loc[idx_B])
        else:
            combined_row = A.loc[idx_A]
        combined_data = pd.concat([combined_data, combined_row.to_frame().T], ignore_index=True)
    
    # Append non-similar rows from B to A
    new_records = B.loc[~B.index.isin(max_idx_row[similarity_mask].values)]
    result = pd.concat([combined_data, new_records], ignore_index=True)
    result.drop(columns=soup_B, inplace=True)
    return result

result_12 = combine(layout1, layout2, 'soup1', 'soup2')
result_123 = combine(result_12, layout3, 'soup1', 'soup3')
result_1234 = combine(result_123, layout4, 'soup1', 'soup4')
final_result = combine(result_1234, layout5, 'soup1', 'soup5')


In [34]:
final_result

Unnamed: 0,Mobile Number,SpouseName,votersID,Date of Birth,votersAge,Customer Code,Name,Blood Group,Father_Name,Permanent_Address,Customer ID,Mother Name,SC Number,PAN_Number,soup1,Temporary_Address,Citizenship Number,National Id,Gender,License Number
0,1234567890,Sita Thapa,11116874,1990-01-01,45,21216874,Ram Thapa,AB+,Ram Bahadur Thapa,"Baluwatar, Kathmandu, Nepal",3245,Laxmi Thapa,001.01.01,ABCDE1234F,ramthapa 1990-01-01 rambahadurthapa,"Gongabu, Kathmandu, Nepal",624-93227-32431/660086,AB123C,Male,15-05-58353205
1,2345678901,Ravi Sharma,22259363,1991-02-02,38,22359363,Sita Shrestha,AB-,Hari Prasad Shrestha,"Lakeside, Pokhara, Nepal",3246,Radha Sharma,001.01.02,FGHIJ5678K,sitashrestha 1991-02-02 hariprasadshrestha,"New Road, Pokhara, Nepal",747-42087-31417/584714,DE456F,Female,21-08-00435579
2,3456789012,Maya Adhikari,33385241,1992-03-03,52,33485241,Hari Gurung,B-,Gopal Krishna Gurung,"Chitwan National Park, Chitwan, Nepal",3247,Gita Adhikari,001.01.03,LMNOP9012L,harigurung 1992-03-03 gopalkrishnagurung,"Pulchowk, Lalitpur, Nepal",389-45382-93886/821590,GH789I,Male,93-12-35351480
3,4567890123,Surya Rai,44475489,1993-04-04,30,45475489,Gita Tamang,A-,Shyam Lal Tamang,"Biratnagar, Morang, Nepal",3248,Mina Rai,001.01.04,QRSTU3456M,gitatamang 1993-04-04 shyamlaltamang,"Bagbazar, Kathmandu, Nepal",571-38785-99733/440035,JK012L,Female,65-03-68139881
4,5678901234,Sarita Karki,55562139,1994-05-05,27,56562139,Mohan Lama,A+,Krishna Raj Lama,"Bharatpur, Chitwan, Nepal",3249,Kalpana Karki,001.01.05,VWXYZ7890N,mohanlama 1994-05-05 krishnarajlama,"Balkumari, Lalitpur, Nepal",864-17331-40021/961722,MN345O,Male,14-11-40056582
5,6789012345,Pemba Gurung,66698214,1995-06-06,41,67698214,Radha Magar,B-,Narayan Kumar Magar,"Butwal, Rupandehi, Nepal",3250,Nima Gurung,001.01.06,ABCD1234PQ,radhamagar 1995-06-06 narayankumarmagar,"Chabahil, Kathmandu, Nepal",033-51347-62581/380746,PQ678R,Female,82-09-81734599
6,7890123456,Laxmi Shrestha,77714635,1996-07-07,36,78714635,Krishna Rai,AB+,Govinda Bahadur Rai,"Hetauda, Makwanpur, Nepal",3251,Saru Shrestha,001.01.07,EFGH5678RS,krishnarai 1996-07-07 govindabahadurrai,"Kumaripati, Lalitpur, Nepal",068-37653-84341/852787,ST901U,Male,26-03-50185868
7,8901234567,Raj Maharjan,88847326,1997-08-08,50,89847326,Sarita Sherpa,A-,Shiva Narayan Sherpa,"Janakpur, Dhanusa, Nepal",3252,Nanu Maharjan,001.01.08,TUVW9012XY,saritasherpa 1997-08-08 shivanarayansherpa,"New Baneshwor, Kathmandu, Nepal",323-37869-95909/623481,VW234X,Female,98-03-72394228
8,9012345678,Pema Tamang,99953421,1998-09-09,43,91953421,Bikash Karki,B-,Bhagirath Bahadur Karki,"Nepalgunj, Banke, Nepal",3253,Lhamu Tamang,001.01.09,ZABC3456DE,bikashkarki 1998-09-09 bhagirathbahadurkarki,"Boudha, Kathmandu, Nepal",406-94259-70142/249811,YZ567A,Male,68-01-80388982
9,123456789,Manish KC,10103847,1999-10-10,34,10203847,Nisha Bhandari,A-,Surya Bahadur Bhandari,"Dharan, Sunsari, Nepal",3254,Lila KC,001.01.10,FGHI7890JK,nishabhandari 1999-10-10 suryabahadurbhandari,"Kalanki, Kathmandu, Nepal",272-28301-42325/881177,BC890D,Female,42-08-02614125
