In [139]:
import pandas as pd

def find_id_in_list(dict_list, target_id):
    """Locate the first item whose ``'id'`` equals ``target_id``.

    Args:
        dict_list: Mapping of group name -> iterable of items, where each
            item supports ``item['id']``.
        target_id: The id value to search for.

    Returns:
        A ``(item, group_name)`` tuple for the first match in iteration
        order, or ``(None, None)`` when no item matches.
    """
    # Lazily walk every (item, group) pair and stop at the first hit.
    hits = (
        (entry, label)
        for label, entries in dict_list.items()
        for entry in entries
        if entry['id'] == target_id
    )
    return next(hits, (None, None))

def edit_distance(str1, str2):
    """Return a normalized Levenshtein *similarity* between two strings.

    Despite the name, the returned value is not the raw edit distance: it is
    ``1 - distance / max(len(str1), len(str2))``, so 1.0 means the strings are
    identical and 0.0 means every position must be edited.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in ``[0, 1]``. Two empty strings are treated as identical and
        yield ``1.0`` (previously this case raised ``ZeroDivisionError``).
    """
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Both inputs are empty: nothing to edit, and nothing to divide by.
        return 1.0

    # Only the previous DP row is needed to compute the current one.
    # previous[j] holds the distance between the processed prefix of str1
    # and str2[:j]; it starts as the distance from the empty prefix.
    previous = list(range(len(str2) + 1))
    for i, ch1 in enumerate(str1, start=1):
        current = [i]  # Distance from str1[:i] to the empty string
        for j, ch2 in enumerate(str2, start=1):
            if ch1 == ch2:
                current.append(previous[j - 1])  # No operation needed
            else:
                # 1 + cheapest of deletion, insertion, substitution
                current.append(1 + min(previous[j], current[j - 1], previous[j - 1]))
        previous = current

    # The last cell of the final row is the edit distance
    edit_dist = previous[-1]

    # Normalize into a similarity between 0 and 1
    return 1 - (edit_dist / max_len)

def jaccard_similarity(str1, str2):
    """Jaccard similarity between the element sets of two inputs.

    Each input is turned into a set with ``set(...)``; for strings that means
    the set of distinct characters.

    Args:
        str1: First iterable (typically a string).
        str2: Second iterable (typically a string).

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float, or ``0.0`` when both sets are empty.
    """
    elems_a, elems_b = set(str1), set(str2)
    combined = elems_a | elems_b

    # Guard against dividing by zero when neither input has any element.
    if not combined:
        return 0.0

    shared = elems_a & elems_b
    return len(shared) / len(combined)


In [None]:
# Build `mentions`: cell text found in a table -> list of ground-truth entity URLs.
cea_file = './my-data/Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv'
tables_dir = './my-data/Dataset/Dataset/Round1_T2D/tables/'
mentions = {}
chunk_size = 1000
column_names = ["table_name", "row", "col", "url"]

# Parsed tables keyed by table name, so each table CSV is read from disk once
# instead of once per ground-truth row.
table_cache = {}

# NOTE(review): read_csv is called without header=None, so the first line of
# the CEA file is consumed as a header before the columns are renamed below.
# If the file has no header row, that first annotation is silently dropped —
# confirm against the data and pass header=None, names=column_names if so.
for chunk_cea in pd.read_csv(cea_file, chunksize=chunk_size):
    chunk_cea.columns = column_names
    for _, row in chunk_cea.iterrows():
        table_name = row['table_name']

        # Load the referenced table (from cache when already parsed)
        df = table_cache.get(table_name)
        if df is None:
            df = pd.read_csv(tables_dir + table_name + '.csv')
            table_cache[table_name] = df

        # Skip annotations that point outside the parsed table
        num_rows, num_columns = df.shape
        if row['row'] < num_rows and row['col'] < num_columns:
            cell_value = df.iloc[row['row'], row['col']]
            mentions.setdefault(cell_value, []).append(row['url'])

print(mentions)

In [None]:
# Build `ids`: entity URL -> name and precomputed scores, for rows flagged target == 1.
GT_file = './my-data/dataset_GT/Round1_T2D_f3.csv'

chunk_size = 1000  # Adjust this based on your memory constraints

ids = {}
column_names = ["table_name", "row", "col", "url"] 

# Stream the CSV chunk by chunk and keep only the rows marked as targets
for chunk_GT in pd.read_csv(GT_file, chunksize=chunk_size):
    for _, row in chunk_GT.iterrows():
        if row['target'] != 1:
            continue
        record = {
            "name": row['name'],
            "ed_score": row['ed_score'],
            "jaccard_score": row['jaccard_score'],
        }
        ids['https://www.wikidata.org/entity/' + row['id']] = record


print(ids)

In [140]:
# Build `mapp_def`: entity URL -> list of records pairing the ground-truth name
# with each table mention, scored with edit_distance and jaccard_similarity.
mapp_def = {}
skipped_pairs = 0  # (mention, URL) pairs that could not be scored

for key, value in mentions.items():
    for el in value:
        try:
            name = ids[el]['name']
            record = {
                name: key,
                "ed_score": edit_distance(name, key),
                "jaccard_score": jaccard_similarity(name, key),
            }
        except (KeyError, TypeError, ZeroDivisionError):
            # Best-effort skip, limited to the expected failure modes:
            #   KeyError          - URL has no entry in `ids`
            #   TypeError         - mention or name is not a string (e.g. NaN)
            #   ZeroDivisionError - both strings empty in the scorer
            # Anything else (including KeyboardInterrupt) now propagates.
            skipped_pairs += 1
            continue
        mapp_def.setdefault(el, []).append(record)
            

In [141]:
# Display the whole mapp_def dictionary (entity URL -> list of scored mention records).
mapp_def

{'https://www.wikidata.org/entity/Q219795': [{'need for speed: carbon': 'Need for Speed: Hot Pursuit',
   'ed_score': 0.5555555555555556,
   'jaccard_score': 0.45},
  {'need for speed: carbon': 'Sim City',
   'ed_score': 0.045454545454545414,
   'jaccard_score': 0.05263157894736842},
  {'need for speed: carbon': 'Little Big Planet',
   'ed_score': 0.09090909090909094,
   'jaccard_score': 0.2},
  {'need for speed: carbon': 'Need for Speed ProStreet',
   'ed_score': 0.5416666666666667,
   'jaccard_score': 0.4117647058823529},
  {'need for speed: carbon': 'Wario Ware: Smooth Moves',
   'ed_score': 0.125,
   'jaccard_score': 0.3333333333333333}],
 'https://www.wikidata.org/entity/Q558664': [{'alex rodriguez': 'Alex Ochoa',
   'ed_score': 0.2857142857142857,
   'jaccard_score': 0.375},
  {'alex rodriguez': 'Cal Ripken Jr.',
   'ed_score': 0.0714285714285714,
   'jaccard_score': 0.3157894736842105}],
 'https://www.wikidata.org/entity/Q309048': [{'king kong': 'The King of Comedy',
   'ed_scor

In [102]:
# Preview at most the first ten entries of `mentions`.
count = 0
print("First 10 values of the dictionary:")
for count, (key, value) in enumerate(mentions.items(), start=1):
    print(f"{key}: {value}")
    if count == 10:
        break

First 10 values of the dictionary:
Need for Speed: Hot Pursuit: ['https://www.wikidata.org/entity/Q219795']
Alex Ochoa: ['https://www.wikidata.org/entity/Q558664']
The King of Comedy: ['https://www.wikidata.org/entity/Q309048']
Se7en: ['https://www.wikidata.org/entity/Q820753', 'https://www.wikidata.org/entity/Q5639157', 'https://www.wikidata.org/entity/Q6072725']
Ric Flair: ['https://www.wikidata.org/entity/Q115347']
Twisted Metal: ['https://www.wikidata.org/entity/Q597808', 'https://www.wikidata.org/entity/Q216995']
Avatar: ['https://www.wikidata.org/entity/Q183066', 'https://www.wikidata.org/entity/Q187278']
Mandrill: ['https://www.wikidata.org/entity/Q140']
Lac-a-l'eau-Claire: ['https://www.wikidata.org/entity/Q958928']
Ferris Bueller's Day Off: ['https://www.wikidata.org/entity/Q23395', 'https://www.wikidata.org/entity/Q23395', 'https://www.wikidata.org/entity/Q109767', 'https://www.wikidata.org/entity/Q217220']


In [120]:
# Preview at most the first ten entries of `ids`.
print("First 10 values of the dictionary:")
count = 0
for count, (key, value) in enumerate(ids.items(), start=1):
    print(f"{key}: {value}")
    if count == 10:
        break

First 10 values of the dictionary:
https://www.wikidata.org/entity/Q9626: {'name': 'uk conservative', 'ed_score': 0.8, 'jaccard_score': 0.5}
https://www.wikidata.org/entity/Q9630: {'name': 'labour', 'ed_score': 1.0, 'jaccard_score': 1.0}
https://www.wikidata.org/entity/Q6540612: {'name': 'liberal democrat', 'ed_score': 0.38, 'jaccard_score': 0.0}
https://www.wikidata.org/entity/Q10658: {'name': 'snp', 'ed_score': 1.0, 'jaccard_score': 1.0}
https://www.wikidata.org/entity/Q10691: {'name': 'plaid', 'ed_score': 0.2, 'jaccard_score': 0.0}
https://www.wikidata.org/entity/Q9669: {'name': 'green party', 'ed_score': 0.45, 'jaccard_score': 0.5}
https://www.wikidata.org/entity/Q161269: {'name': 'bnp', 'ed_score': 1.0, 'jaccard_score': 1.0}
https://www.wikidata.org/entity/Q10647: {'name': 'ukip', 'ed_score': 1.0, 'jaccard_score': 1.0}
https://www.wikidata.org/entity/Q6393870: {'name': 'keri hehn', 'ed_score': 0.82, 'jaccard_score': 0.67}
https://www.wikidata.org/entity/Q237: {'name': 'holy see', 

In [68]:
# Display the whole ids dictionary (entity URL -> name, ed_score, jaccard_score).
ids

{'https://www.wikidata.org/entity/Q9626': {'name': 'uk conservative',
  'ed_score': 0.8,
  'jaccard_score': 0.5},
 'https://www.wikidata.org/entity/Q9630': {'name': 'labour',
  'ed_score': 1.0,
  'jaccard_score': 1.0},
 'https://www.wikidata.org/entity/Q6540612': {'name': 'liberal democrat',
  'ed_score': 0.38,
  'jaccard_score': 0.0},
 'https://www.wikidata.org/entity/Q10658': {'name': 'snp',
  'ed_score': 1.0,
  'jaccard_score': 1.0},
 'https://www.wikidata.org/entity/Q10691': {'name': 'plaid',
  'ed_score': 0.2,
  'jaccard_score': 0.0},
 'https://www.wikidata.org/entity/Q9669': {'name': 'green party',
  'ed_score': 0.45,
  'jaccard_score': 0.5},
 'https://www.wikidata.org/entity/Q161269': {'name': 'bnp',
  'ed_score': 1.0,
  'jaccard_score': 1.0},
 'https://www.wikidata.org/entity/Q10647': {'name': 'ukip',
  'ed_score': 1.0,
  'jaccard_score': 1.0},
 'https://www.wikidata.org/entity/Q6393870': {'name': 'keri hehn',
  'ed_score': 0.82,
  'jaccard_score': 0.67},
 'https://www.wikidata

In [None]:
import pandas as pd

# Peek at the first chunk of the CEA ground-truth file and label its columns.
# NOTE(review): this path lacks the './my-data/' prefix used for the same file
# in the cell that builds `mentions` — confirm which location is correct.
csv_file_path = './Dataset/Dataset/Round1_T2D/gt/CEA_Round1_gt_WD.csv'

# Define the chunk size (number of rows per chunk)
chunk_size = 100  # Adjust this based on your memory constraints

# Column meaning: table file name, row index, column index, ground-truth entity URL
column_names = ["table_name", "row", "col", "url"]
result_df = pd.DataFrame()

# Only the first chunk is inspected; the loop form leaves result_df empty
# (and prints nothing) when the file yields no chunks.
for chunk in pd.read_csv(csv_file_path, chunksize=chunk_size):
    # Work on a fresh frame with a 0-based index rather than concatenating
    # onto an empty DataFrame; the chunk's dtypes are kept as parsed.
    result_df = chunk.reset_index(drop=True)
    result_df.columns = column_names
    print(result_df.columns)
    break

# Display the combined DataFrame or perform further operations
#print(combined_df.iloc[:, 2:10])  # Display the first few rows of the combined DataFrame
#print(combined_df[['name','id','target']])
