# Fuzzy Comparison between human-made and AI-made data

In [11]:
import pandas as pd

# Load XLS file with the date column explicitly set to string type
xls_path = 'All_Concerts_1908_filtered.xlsx'

# Load the Excel file, skipping the specified number of rows
xls_df = pd.read_excel(xls_path)

# Display the first few rows of the loaded DataFrame to verify
print(xls_df.head(10))

# Display the DataFrame to ensure dates are read as strings. Basically if you see dates here and not NaT, we're good.
#start_row_xls = 2090
#num_rows_to_display = 3  # Adjust as needed
#print("Excel DataFrame from row 2090:") # reason: the beginning of the data doesn't have dates in the date column.
#print(xls_df.iloc[start_row_xls:start_row_xls + num_rows_to_display])

   konsert_ID konsert_datum                         konsert_namn  \
0        2528    1908-01-02    Brüssel-kvartetten Beethovenafton   
1        2509    1908-01-05  1:sta säsångskonsert: Nordisk afton   
2        2521    1908-01-05                       Afskedskonsert   
3        2516    1908-01-05                         Folk-konsert   
4        2515    1908-01-05                       Populärkonsert   
5        2510    1908-01-06        2:a populära Symfoni-matinéen   
6        2511    1908-01-06                     Jul-fest med Bal   
7        2517    1908-01-06                        Populär soaré   
8        2519    1908-01-06                        Populär soaré   
9        2512    1908-01-07  2:a kammarmusikkonsert: Brahmsafton   

                                     lokal_namn  \
0         Kungliga musikaliska akademiens lokal   
1                    Hotel Continentals festsal   
2                               Immanuelskyrkan   
3  Arbetare-institutet, Klara Norra Kyrkogata 8

In [12]:

# Load JSON file
json_path = 'Datasets/Aftonbladet 1908-01-01-1908-12-31/JSON for Import/Aftonbladet_1908_01-01-03-31.json'
json_df = pd.read_json(json_path)

#Did this load right? If there's dates and whatever in here, we're good.
print(json_df.head(3))

  konsert_datum                                       konsert_namn  \
0      25.01.08                                    Middags-Konsert   
1      25.01.08  Hans Burmeister Orkesters Kaffe- och Aftonkonsert   
2      25.01.08                                       Stor Konsert   

         lokal_namn konsert_biljettpris  konserttyp_namn  \
0    Berns Salonger             UNKNOWN   Middagskonsert   
1       Hôtel Fenix             UNKNOWN  Orkesterkonsert   
2  Stora Eftersalen             UNKNOWN          Konsert   

                    Producer         custom_id  \
0  Etablissementets Orkester  dark-64165-1-2-1   
1           Burmeister, Hans  dark-64165-1-2-2   
2                    UNKNOWN  dark-64165-1-2-3   

                                       url konsert producer konsertar  
0  https://tidningar.kb.se/h1txn0wt1gprhrb     NaN      NaN       NaN  
1  https://tidningar.kb.se/h1txn0wt1gprhrb     NaN      NaN       NaN  
2  https://tidningar.kb.se/h1txn0wt1gprhrb     NaN      NaN  

# Now we normalise the text so that they can be compared

## Removing Columns that are irrelevant

In [13]:
# Identify relevant columns (adjust based on your specific dataset)
relevant_columns = ['konsert_datum','konsert_namn','lokal_namn','konsert_biljettpris','konserttyp_namn','Producer', 'url']

# Remove irrelevant columns from both DataFrames
xls_df = xls_df[relevant_columns]
json_df = json_df[relevant_columns]


KeyError: "['url'] not in index"

In [None]:
import pandas as pd
from fuzzywuzzy import process, fuzz
import re
from datetime import datetime

# Function to normalize text
def normalize(text):
    if pd.isna(text) or text.strip().lower() == "unknown":  # Handle NaN values and "UNKNOWN"
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[-,]', ' ', text)  # Replace hyphens and commas with spaces
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.strip()  # Remove leading and trailing spaces
    return text

# Function to normalize dates
def normalize_date(date):
    try:
        # Try to parse the date from different formats
        if isinstance(date, str):
            if re.match(r'\d{2}\.\d{2}\.\d{2}', date):  # Format: DD.MM.YY
                date_obj = datetime.strptime(date, '%d.%m.%y')
                # Adjust for century
                if date_obj.year >= 2000:
                    date_obj = date_obj.replace(year=date_obj.year - 100)
            elif re.match(r'\d{4}\.\d{2}\.\d{2}', date):  # Format: YYYY.MM.DD
                date_obj = datetime.strptime(date, '%Y.%m.%d')
            elif re.match(r'\d{2}\.\d{2}\.\d{4}', date):  # Format: DD.MM.YYYY
                date_obj = datetime.strptime(date, '%d.%m.%Y')
            else:
                return ""
        elif isinstance(date, datetime):
            # If already a datetime object, just format it
            date_obj = date
        else:
            return ""
        return date_obj.strftime('%Y-%m-%d')  # Convert to "YYYY-MM-DD"
    except ValueError:
        return ""

# Normalize the date columns in both dataframes
xls_df['normalized_konsert_datum'] = xls_df['konsert_datum'].apply(normalize_date)
json_df['normalized_konsert_datum'] = json_df['konsert_datum'].apply(normalize_date)

# Normalize all text columns in both dataframes
for col in xls_df.columns:
    if xls_df[col].dtype == 'object':  # Apply only to text columns
        xls_df[col] = xls_df[col].apply(normalize)

for col in json_df.columns:
    if json_df[col].dtype == 'object':  # Apply only to text columns
        json_df[col] = json_df[col].apply(normalize)

# Function to identify potential duplicates based on date and other criteria
def identify_potential_duplicates(df, date_col, other_cols, threshold=90):
    duplicates = []
    for i, row in df.iterrows():
        for j, other_row in df.iterrows():
            if i != j:
                # Check date similarity
                date_similarity = fuzz.token_sort_ratio(row[date_col], other_row[date_col])
                # Check other columns similarity
                other_similarity = all(fuzz.token_sort_ratio(row[col], other_row[col]) > threshold for col in other_cols)
                if date_similarity > threshold and other_similarity:
                    duplicates.append(i)
                    break  # Once a duplicate is found, move to the next row
    return list(set(duplicates))

# Columns to compare (excluding date)
other_columns_to_compare = ['konsert_namn', 'lokal_namn', 'konsert_biljettpris', 'konserttyp_namn', 'Producer']

# Identify potential duplicates in LLM data
duplicate_indices = identify_potential_duplicates(json_df, 'normalized_konsert_datum', other_columns_to_compare)

# Extract duplicates and export to CSV
today = datetime.today().strftime('%Y-%m-%d')
excluded_concerts = json_df.loc[duplicate_indices]
excluded_concerts.to_csv(f'{today}_excluded_concerts_fuzzy_comparison.csv', index=False)

# Remove duplicates from json_df
json_df = json_df.drop(duplicate_indices)



KeyError: 'Arrangör'

# This version has a toggle between exact and fuzzy matches

In [None]:
# Toggle variable for match type
match_type = "exact"  # Set to "exact" for exact match, "fuzzy" for fuzzy match

# Exact Match Function
def exact_match(df1, df2):
    # Merge on normalized date
    merged_df = pd.merge(df1, df2, on='normalized_konsert_datum', suffixes=('_human', '_llm'))
    
    # Calculate similarities for other columns
    for col in other_columns_to_compare:
        merged_df[f'score_{col}'] = merged_df.apply(lambda row: fuzz.token_sort_ratio(row[f'{col}_human'], row[f'{col}_llm']), axis=1)
    
    # Print analytics
    for col in other_columns_to_compare:
        avg_score = merged_df[f'score_{col}'].mean()
        print(f"Average match score for {col}: {avg_score:.2f}")
    
    return merged_df

# Fuzzy Match Function
def fuzzy_match(df1, df2):
    # Function to find best matches for the 'normalized_konsert_datum' column
    def find_best_matches_for_dates(df1, df2, column_name='normalized_konsert_datum', threshold=80):
        matches = df2[column_name].apply(lambda x: process.extractOne(x, df1[column_name].dropna(), scorer=fuzz.token_sort_ratio) if x else ('No match', 0))
        df2[f'best_match_{column_name}'] = matches.apply(lambda x: x[0] if x else 'No match')
        df2[f'score_{column_name}'] = matches.apply(lambda x: x[1] if x else 0)
    
    # Apply fuzzy matching for 'normalized_konsert_datum'
    find_best_matches_for_dates(df1, df2)

    # Analyze results for 'normalized_konsert_datum'
    score_col = 'score_normalized_konsert_datum'
    avg_score = df2[score_col].mean()
    print(f"Average match score for konsert_datum: {avg_score:.2f}")

    # Identify rows with low match scores (below threshold)
    low_match_rows = df2[df2[score_col] < 80]
    print(f"Rows with low match scores for konsert_datum (total {len(low_match_rows)}):")
    print(low_match_rows[['konsert_datum', 'best_match_normalized_konsert_datum', score_col]])

# Toggle between exact and fuzzy match
if match_type == "exact":
    result_df = exact_match(xls_df, json_df)
elif match_type == "fuzzy":
    fuzzy_match(xls_df, json_df)

Average match score for konsert_namn: 36.31
Average match score for lokal_namn: 44.60
Average match score for konsert_biljettpris: 32.63
Average match score for Arrangör: 28.93
Average match score for konserttyp_namn: 61.05
Average match score for Producer: 26.04


In [None]:
# Print to verify that normalisation went ok
start_row_xls = 2090
num_rows_to_display = 3  # Adjust as needed
#print("Excel DataFrame from row 2090:") # reason: the beginning of the data doesn't have dates in the date column.
#print(xls_df.iloc[start_row_xls:start_row_xls + num_rows_to_display])
print(json_df.head(3))

  konsert_datum                      konsert_namn  \
0      15.02.08                  symfonikonserten   
1      14.02.08                populär sång afton   
3    21.02.1908  vivien cliartrcs afskeds konsert   

                                  lokal_namn konsert_biljettpris  \
0                           kungliga teatern                       
1        f.u.m:s hörsal birger jarlsgatan 35      1kr 50 öre 1kr   
3  kungliga musikaliska akademiens stora sal   2:50kr 2kr 1:50kr   

        Arrangör konserttyp_namn       Producer normalized_konsert_datum  
0   järnefelt a.  symfonikonsert  gellin eduard               1908 02 15  
1  lundberg anna     sångkonsert  lundberg anna               1908 02 14  
3  gellin eduard     solokonsert  gellin eduard                           


# Fuzzywuzzy comparison

In [None]:
from fuzzywuzzy import process, fuzz

# Assume the DataFrames xls_df and json_df are already loaded and normalized
# Columns to compare
columns_to_compare = [
    'normalized_konsert_datum', 'konsert_namn', 'lokal_namn', 
    'konsert_biljettpris', 'Arrangör', 'konserttyp_namn', 'Producer'
]

# Normalize all text columns in both dataframes
for col in columns_to_compare:
    if xls_df[col].dtype == 'object':  # Apply only to text columns
        xls_df[col] = xls_df[col].apply(normalize)

for col in columns_to_compare:
    if json_df[col].dtype == 'object':  # Apply only to text columns
        json_df[col] = json_df[col].apply(normalize)

# Function to find best matches for a single column, with handling for empty strings
def find_best_matches(df1, df2, column_name, threshold=80):
    matches = df2[column_name].apply(lambda x: process.extractOne(x, df1[column_name].dropna(), scorer=fuzz.token_sort_ratio) if x else ('No match', 0))
    df2[f'best_match_{column_name}'] = matches.apply(lambda x: x[0] if x else 'No match')
    df2[f'score_{column_name}'] = matches.apply(lambda x: x[1] if x else 0)

# Apply fuzzy matching for each column
for col in columns_to_compare:
    if col in xls_df.columns and col in json_df.columns:
        find_best_matches(xls_df, json_df, col)

# Analyze results
for col in columns_to_compare:
    score_col = f'score_{col}'
    avg_score = json_df[score_col].mean()
    print(f"Average match score for {col}: {avg_score:.2f}")
    
    # Identifying low match scores
    low_match_rows = json_df[json_df[score_col] < 80]
    print(f"Rows with low match scores for {col}:")
    print(low_match_rows[[col, f'best_match_{col}', score_col]])

# Optionally, you can save the results to a CSV for further analysis
json_df.to_csv('comparison_results.csv', index=False)


Average match score for normalized_konsert_datum: 84.31
Rows with low match scores for normalized_konsert_datum:
   normalized_konsert_datum best_match_normalized_konsert_datum  \
3                                                      No match   
6                                                      No match   
9                                                      No match   
25                                                     No match   
36                                                     No match   
43                                                     No match   
46                                                     No match   
54                                                     No match   

    score_normalized_konsert_datum  
3                                0  
6                                0  
9                                0  
25                               0  
36                               0  
43                               0  
46                      

## Find unique concerts for LLM

In [None]:
# Function to find unique concerts based on dates
def find_unique_dates(df1, df2, column_name='normalized_konsert_datum', threshold=80):
    # Concerts found by LLM but not by human
    unique_to_llm = df2[df2[f'score_{column_name}'] < threshold]
    # Concerts found by human but not matched by LLM
    matched_indices = df2[df2[f'score_{column_name}'] >= threshold][f'best_match_{column_name}'].index
    matched_entries = df1.loc[matched_indices]
    unique_to_human = df1[~df1.index.isin(matched_entries.index)]
    return unique_to_llm, unique_to_human

# Find unique concerts based on dates
unique_llm_dates, unique_human_dates = find_unique_dates(xls_df, json_df)

print(f"\nUnique concerts found by LLM but not by human (total {len(unique_llm_dates)}):")
print(unique_llm_dates[['konsert_datum', 'best_match_normalized_konsert_datum', score_col]])

print(f"\nUnique concerts found by human but not by LLM (total {len(unique_human_dates)}):")
print(unique_human_dates[['konsert_datum']])

# Optionally, save the unique concert results to separate CSV files for further analysis
unique_llm_dates.to_csv('unique_to_llm_konsert_datum.csv', index=False)
unique_human_dates.to_csv('unique_to_human_konsert_datum.csv', index=False)


Unique concerts found by LLM but not by human (total 8):
        konsert_datum best_match_normalized_konsert_datum  score_Producer
3          21.02.1908                            No match             100
6                                                No match              46
9                                                No match              73
25                                               No match              53
36                                               No match             100
43  11.02.08 15.02.08                            No match              50
46                                               No match              48
54                                               No match              58

Unique concerts found by human but not by LLM (total 471):
    konsert_datum
2      1908-01-05
3      1908-01-05
4      1908-01-05
6      1908-01-06
9      1908-01-07
..            ...
509    1908-12-26
510    1908-12-26
511    1908-12-27
512    1908-12-31
513    1908-12-31
