In [5]:
import pandas as pd
from fuzzywuzzy import fuzz, process
import re
from datetime import datetime

# Locations of the two datasets that are compared against each other
xls_path = 'All_Concerts_1908_filtered.xlsx'
json_path = 'Datasets/SD_DN_AB_1908_concerts_LLM.json'

# Columns that take part in the comparison; every other column is dropped
relevant_columns = [
    'konsert_datum',
    'konsert_namn',
    'lokal_namn',
    'konsert_biljettpris',
    'konserttyp_namn',
    'Producer',
]

# Read each source and narrow it to the shared columns in one step
xls_df = pd.read_excel(xls_path)[relevant_columns]
json_df = pd.read_json(json_path)[relevant_columns]

# Function to normalize text
def normalize(text):
    """Return a canonical form of ``text`` suitable for fuzzy comparison.

    The value is lowercased, hyphens and commas become spaces, runs of
    whitespace collapse to a single space and the ends are trimmed.

    Missing values (``None``/NaN/NaT) and the placeholder "unknown" (any case,
    surrounding whitespace ignored) normalize to the empty string.  Values
    that are not strings, such as a numeric ticket price stored in an object
    column, are converted with ``str()`` first instead of raising
    ``AttributeError``.
    """
    if not isinstance(text, str):
        # pd.isna() on a list/array returns an array, so only test scalars
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)

    text = text.strip().lower()
    if text == "unknown":
        return ""

    text = re.sub(r'[-,]', ' ', text)  # Hyphens and commas act as separators
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs
    return text.strip()

# Function to normalize dates
def normalize_date(date):
    """Convert a date value to the string ``YYYY-MM-DD``; return "" if it cannot be parsed.

    Accepted inputs:
      * ``datetime`` instances (used as-is),
      * strings in the form ``DD.MM.YY``, ``YYYY.MM.DD`` or ``DD.MM.YYYY``
        (surrounding whitespace is ignored).

    Two-digit years that ``strptime`` places in 2000 or later are moved back
    one century.  Any other type, any other string layout and any invalid
    calendar date yield "".
    """
    try:
        if isinstance(date, str):
            date = date.strip()
            # fullmatch is required: with a prefix match the DD.MM.YY pattern
            # also accepts DD.MM.YYYY, strptime then fails on the trailing
            # digits and the four-digit-year branch is never reached.
            if re.fullmatch(r'\d{2}\.\d{2}\.\d{2}', date):  # Format: DD.MM.YY
                date_obj = datetime.strptime(date, '%d.%m.%y')
                if date_obj.year >= 2000:
                    # An invalid result (e.g. 29 Feb in a non-leap year)
                    # raises ValueError and is reported as "" below.
                    date_obj = date_obj.replace(year=date_obj.year - 100)
            elif re.fullmatch(r'\d{4}\.\d{2}\.\d{2}', date):  # Format: YYYY.MM.DD
                date_obj = datetime.strptime(date, '%Y.%m.%d')
            elif re.fullmatch(r'\d{2}\.\d{2}\.\d{4}', date):  # Format: DD.MM.YYYY
                date_obj = datetime.strptime(date, '%d.%m.%Y')
            else:
                return ""
        elif isinstance(date, datetime):
            date_obj = date
        else:
            return ""
        return date_obj.strftime('%Y-%m-%d')  # Convert to "YYYY-MM-DD"
    except ValueError:
        return ""

# Blank out the 'UNKNOWN' placeholder, then derive a canonical date column
# that is used to group rows from both datasets.
xls_df = xls_df.replace('UNKNOWN', '')
json_df = json_df.replace('UNKNOWN', '')
xls_df['normalized_konsert_datum'] = xls_df['konsert_datum'].apply(normalize_date)
json_df['normalized_konsert_datum'] = json_df['konsert_datum'].apply(normalize_date)

# Run the text normalizer over every object-typed column of both frames.
# This includes 'normalized_konsert_datum', whose hyphens therefore become
# spaces in both frames alike.
for frame in (xls_df, json_df):
    for col in frame.columns:
        if frame[col].dtype == 'object':
            frame[col] = frame[col].apply(normalize)

# Fields that are fuzzy-compared (the date is only used for grouping)
columns_to_compare = [
    'konsert_namn',
    'lokal_namn',
    'konsert_biljettpris',
    'konserttyp_namn',
    'Producer',
]

# Relative importance of each compared field in the overall score.
# NOTE(review): these weights add up to 1.2 rather than 1.0, so the overall
# score is not on a 0-100 scale - confirm that this is intended.
weights = {
    'konsert_namn': 0.4,
    'lokal_namn': 0.4,
    'konsert_biljettpris': 0.1,
    'konserttyp_namn': 0.1,
    'Producer': 0.2,
}

def compare_rows_on_date(date, df1, df2, columns, weights):
    """Find, for every ``df2`` row on ``date``, the best-scoring ``df1`` row on that date.

    Both frames are filtered on ``normalized_konsert_datum == date``.  Each
    pair of rows is scored per column with ``fuzz.token_sort_ratio`` and the
    per-column scores are combined as ``sum(score * weights[col])``.

    Parameters:
        date: value compared against the ``normalized_konsert_datum`` column.
        df1: candidate frame (its index ends up in ``xls_row_index``).
        df2: frame whose rows are being matched (index in ``json_row_index``).
        columns: names of the columns to compare.
        weights: mapping of column name to weight.

    Returns:
        A list with exactly one dict per ``df2`` row on ``date``, holding
        ``original_date``, ``json_row_index``, ``xls_row_index``,
        ``overall_match_score`` and the raw score of every compared column.
        When ``df1`` has no row on ``date`` the entry has ``xls_row_index``
        and ``original_date`` set to ``None`` and all scores set to 0, so the
        row is still visible to later "unmatched" reporting instead of being
        dropped silently.
    """
    rows1 = df1[df1['normalized_konsert_datum'] == date]
    rows2 = df2[df2['normalized_konsert_datum'] == date]
    comparison_results = []

    for idx2, row2 in rows2.iterrows():
        best = None

        for idx1, row1 in rows1.iterrows():
            column_scores = {
                col: fuzz.token_sort_ratio(row1[col], row2[col]) for col in columns
            }
            total_weighted_score = sum(
                column_scores[col] * weights[col] for col in columns
            )

            # Accept the first candidate unconditionally (even with score 0);
            # afterwards only a strictly better score replaces it.
            if best is None or total_weighted_score > best['overall_match_score']:
                best = {
                    'original_date': row1['konsert_datum'],
                    'json_row_index': idx2,
                    'xls_row_index': idx1,
                    'overall_match_score': total_weighted_score,
                    **column_scores,
                }

        if best is None:
            # No candidate on this date at all: record an explicit non-match
            best = {
                'original_date': None,
                'json_row_index': idx2,
                'xls_row_index': None,
                'overall_match_score': 0,
                **{col: 0 for col in columns},
            }

        comparison_results.append(best)

    return comparison_results


# Every normalized date that occurs in either dataset
unique_dates = pd.concat(
    [xls_df['normalized_konsert_datum'], json_df['normalized_konsert_datum']]
).unique()

# Compare the two datasets date by date and collect all best-match records
all_comparison_results = []
for date in unique_dates:
    if not date:  # Dates that could not be parsed were normalized to ""
        continue
    comparison_results = compare_rows_on_date(date, xls_df, json_df, columns_to_compare, weights)
    all_comparison_results.extend(comparison_results)

# Print one block per match: both row indices, the overall score, then the
# raw score of every compared column
for result in all_comparison_results:
    print(f"Row index in LLM data: {result['json_row_index']}")
    print(f"Best match row index in human data: {result['xls_row_index']}")
    print(f"Best match score: {result['overall_match_score']}")
    for col, score in result.items():
        # Identifying keys are not per-column scores
        if col not in ('json_row_index', 'xls_row_index', 'overall_match_score', 'original_date'):
            print(f"  {col}: {score}")
    print()  # Blank line between entries


# Today's date is used as a suffix for all report files
report_date = datetime.now().strftime('%Y-%m-%d')
comparison_file_name = f'detailed_comparison_results_{report_date}.csv'

# Tabulate the results and write them to the dated CSV report
comparison_df = pd.DataFrame(all_comparison_results)
comparison_df.to_csv(comparison_file_name, index=False)

# Split the comparison results into LLM-only and human-only concerts
def find_unique_dates(df1, df2, threshold=80, comparison_results=None):
    """Return the concerts that have no sufficiently good counterpart.

    Parameters:
        df1: frame whose index values appear as ``xls_row_index`` in the results.
        df2: frame whose index values appear as ``json_row_index`` in the results.
        threshold: minimum ``overall_match_score`` for a pair to count as matched.
        comparison_results: list of result dicts as produced by
            ``compare_rows_on_date``.  Defaults to the module-level
            ``all_comparison_results``.

    Returns:
        ``(unique_to_llm, unique_to_human)``:
        * ``unique_to_llm`` - result rows scoring below ``threshold``, plus one
          row (score 0, no ``xls_row_index``) for every ``df2`` row that is
          missing from the results altogether; such rows were never compared
          and so cannot have a match.
        * ``unique_to_human`` - the rows of ``df1`` whose index is not the
          ``xls_row_index`` of any result scoring at least ``threshold``.
    """
    if comparison_results is None:
        comparison_results = all_comparison_results

    records = list(comparison_results)
    compared = {record['json_row_index'] for record in records}
    records.extend(
        {'json_row_index': idx, 'xls_row_index': None, 'overall_match_score': 0}
        for idx in df2.index
        if idx not in compared
    )

    comparison_df = pd.DataFrame(records)
    if comparison_df.empty:
        # Without any record the frame has no columns; create them so the
        # lookups below work and both results come back empty/complete.
        comparison_df = pd.DataFrame(
            columns=['json_row_index', 'xls_row_index', 'overall_match_score']
        )

    is_matched = comparison_df['overall_match_score'] >= threshold
    unique_to_llm = comparison_df[~is_matched]
    matched_indices = comparison_df[is_matched]['xls_row_index'].unique()
    unique_to_human = df1[~df1.index.isin(matched_indices)]
    return unique_to_llm, unique_to_human

# Split the comparison results into LLM-only and human-only concerts
# (default threshold)
unique_llm_dates, unique_human_dates = find_unique_dates(xls_df, json_df)

# Dated output files for the two lists
unique_llm_file_name = f'unique_to_llm_concerts_{report_date}.csv'
unique_human_file_name = f'unique_to_human_concerts_{report_date}.csv'

# Show the row index and score of every unmatched LLM concert
print(f"\nUnique concerts found by LLM but not by human (total {len(unique_llm_dates)}):")
print(unique_llm_dates[['json_row_index', 'overall_match_score']])

# Show the date of every unmatched human concert
print(f"\nUnique concerts found by human but not by LLM (total {len(unique_human_dates)}):")
print(unique_human_dates[['konsert_datum']])

# Persist both lists
unique_llm_dates.to_csv(unique_llm_file_name, index=False)
unique_human_dates.to_csv(unique_human_file_name, index=False)


Row index in LLM data: 666
Best match row index in human data: 0
Best match score: 31.6
  konsert_namn: 23
  lokal_namn: 17
  konsert_biljettpris: 100
  konserttyp_namn: 56
  Producer: 0

Row index in LLM data: 668
Best match row index in human data: 0
Best match score: 37.2
  konsert_namn: 38
  lokal_namn: 17
  konsert_biljettpris: 100
  konserttyp_namn: 52
  Producer: 0

Row index in LLM data: 671
Best match row index in human data: 0
Best match score: 52.7
  konsert_namn: 30
  lokal_namn: 79
  konsert_biljettpris: 0
  konserttyp_namn: 55
  Producer: 18

Row index in LLM data: 433
Best match row index in human data: 1
Best match score: 75.30000000000001
  konsert_namn: 53
  lokal_namn: 100
  konsert_biljettpris: 55
  konserttyp_namn: 22
  Producer: 32

Row index in LLM data: 434
Best match row index in human data: 1
Best match score: 88.30000000000001
  konsert_namn: 66
  lokal_namn: 100
  konsert_biljettpris: 55
  konserttyp_namn: 100
  Producer: 32

Row index in LLM data: 488
Best 

In [6]:
import pandas as pd
from fuzzywuzzy import fuzz, process
import re
from datetime import datetime

# Locations of the two datasets that are compared against each other
xls_path = 'All_Concerts_1908_filtered.xlsx'
json_path = 'Datasets/SD_DN_AB_1908_concerts_LLM.json'

# Columns that take part in the comparison; every other column is dropped
relevant_columns = [
    'konsert_datum',
    'konsert_namn',
    'lokal_namn',
    'konsert_biljettpris',
    'konserttyp_namn',
    'Producer',
]

# Read each source and narrow it to the shared columns in one step
xls_df = pd.read_excel(xls_path)[relevant_columns]
json_df = pd.read_json(json_path)[relevant_columns]

# Function to normalize text
def normalize(text):
    """Return a canonical form of ``text`` suitable for fuzzy comparison.

    The value is lowercased, hyphens and commas become spaces, runs of
    whitespace collapse to a single space and the ends are trimmed.

    Missing values (``None``/NaN/NaT) and the placeholder "unknown" (any case,
    surrounding whitespace ignored) normalize to the empty string.  Values
    that are not strings, such as a numeric ticket price stored in an object
    column, are converted with ``str()`` first instead of raising
    ``AttributeError``.
    """
    if not isinstance(text, str):
        # pd.isna() on a list/array returns an array, so only test scalars
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)

    text = text.strip().lower()
    if text == "unknown":
        return ""

    text = re.sub(r'[-,]', ' ', text)  # Hyphens and commas act as separators
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs
    return text.strip()

# Function to normalize dates
def normalize_date(date):
    """Convert a date value to the string ``YYYY-MM-DD``; return "" if it cannot be parsed.

    Accepted inputs:
      * ``datetime`` instances (used as-is),
      * strings in the form ``DD.MM.YY``, ``YYYY.MM.DD`` or ``DD.MM.YYYY``
        (surrounding whitespace is ignored).

    Two-digit years that ``strptime`` places in 2000 or later are moved back
    one century.  Any other type, any other string layout and any invalid
    calendar date yield "".
    """
    try:
        if isinstance(date, str):
            date = date.strip()
            # fullmatch is required: with a prefix match the DD.MM.YY pattern
            # also accepts DD.MM.YYYY, strptime then fails on the trailing
            # digits and the four-digit-year branch is never reached.
            if re.fullmatch(r'\d{2}\.\d{2}\.\d{2}', date):  # Format: DD.MM.YY
                date_obj = datetime.strptime(date, '%d.%m.%y')
                if date_obj.year >= 2000:
                    # An invalid result (e.g. 29 Feb in a non-leap year)
                    # raises ValueError and is reported as "" below.
                    date_obj = date_obj.replace(year=date_obj.year - 100)
            elif re.fullmatch(r'\d{4}\.\d{2}\.\d{2}', date):  # Format: YYYY.MM.DD
                date_obj = datetime.strptime(date, '%Y.%m.%d')
            elif re.fullmatch(r'\d{2}\.\d{2}\.\d{4}', date):  # Format: DD.MM.YYYY
                date_obj = datetime.strptime(date, '%d.%m.%Y')
            else:
                return ""
        elif isinstance(date, datetime):
            date_obj = date
        else:
            return ""
        return date_obj.strftime('%Y-%m-%d')  # Convert to "YYYY-MM-DD"
    except ValueError:
        return ""

# Blank out the 'UNKNOWN' placeholder, then derive a canonical date column
# that is used to group rows from both datasets.
xls_df = xls_df.replace('UNKNOWN', '')
json_df = json_df.replace('UNKNOWN', '')
xls_df['normalized_konsert_datum'] = xls_df['konsert_datum'].apply(normalize_date)
json_df['normalized_konsert_datum'] = json_df['konsert_datum'].apply(normalize_date)

# Run the text normalizer over every object-typed column of both frames.
# This includes 'normalized_konsert_datum', whose hyphens therefore become
# spaces in both frames alike.
for frame in (xls_df, json_df):
    for col in frame.columns:
        if frame[col].dtype == 'object':
            frame[col] = frame[col].apply(normalize)

# Fields that are fuzzy-compared (the date is only used for grouping)
columns_to_compare = [
    'konsert_namn',
    'lokal_namn',
    'konsert_biljettpris',
    'konserttyp_namn',
    'Producer',
]

# Relative importance of each compared field in the overall score.
# NOTE(review): these weights add up to 1.2 rather than 1.0, so the overall
# score is not on a 0-100 scale - confirm that this is intended.
weights = {
    'konsert_namn': 0.4,
    'lokal_namn': 0.4,
    'konsert_biljettpris': 0.1,
    'konserttyp_namn': 0.1,
    'Producer': 0.2,
}

def compare_rows_on_date(date, df1, df2, columns, weights):
    """Find, for every ``df2`` row on ``date``, the best-scoring ``df1`` row on that date.

    Both frames are filtered on ``normalized_konsert_datum == date``.  Each
    pair of rows is scored per column with ``fuzz.token_sort_ratio`` and the
    per-column scores are combined as ``sum(score * weights[col])``.

    Parameters:
        date: value compared against the ``normalized_konsert_datum`` column.
        df1: candidate frame (its index ends up in ``xls_row_index``).
        df2: frame whose rows are being matched (index in ``json_row_index``).
        columns: names of the columns to compare.
        weights: mapping of column name to weight.

    Returns:
        A list with exactly one dict per ``df2`` row on ``date``, holding
        ``original_date``, ``json_row_index``, ``xls_row_index``,
        ``overall_match_score`` and the raw score of every compared column.
        When ``df1`` has no row on ``date`` the entry has ``xls_row_index``
        and ``original_date`` set to ``None`` and all scores set to 0, so the
        row is still visible to later "unmatched" reporting instead of being
        dropped silently.
    """
    rows1 = df1[df1['normalized_konsert_datum'] == date]
    rows2 = df2[df2['normalized_konsert_datum'] == date]
    comparison_results = []

    for idx2, row2 in rows2.iterrows():
        best = None

        for idx1, row1 in rows1.iterrows():
            column_scores = {
                col: fuzz.token_sort_ratio(row1[col], row2[col]) for col in columns
            }
            total_weighted_score = sum(
                column_scores[col] * weights[col] for col in columns
            )

            # Accept the first candidate unconditionally (even with score 0);
            # afterwards only a strictly better score replaces it.
            if best is None or total_weighted_score > best['overall_match_score']:
                best = {
                    'original_date': row1['konsert_datum'],
                    'json_row_index': idx2,
                    'xls_row_index': idx1,
                    'overall_match_score': total_weighted_score,
                    **column_scores,
                }

        if best is None:
            # No candidate on this date at all: record an explicit non-match
            best = {
                'original_date': None,
                'json_row_index': idx2,
                'xls_row_index': None,
                'overall_match_score': 0,
                **{col: 0 for col in columns},
            }

        comparison_results.append(best)

    return comparison_results


# Every normalized date that occurs in either dataset
unique_dates = pd.concat(
    [xls_df['normalized_konsert_datum'], json_df['normalized_konsert_datum']]
).unique()

# Compare the two datasets date by date and collect all best-match records
all_comparison_results = []
for date in unique_dates:
    if not date:  # Dates that could not be parsed were normalized to ""
        continue
    comparison_results = compare_rows_on_date(date, xls_df, json_df, columns_to_compare, weights)
    all_comparison_results.extend(comparison_results)

# Print one block per match: both row indices, the overall score, then the
# raw score of every compared column
for result in all_comparison_results:
    print(f"Row index in LLM data: {result['json_row_index']}")
    print(f"Best match row index in human data: {result['xls_row_index']}")
    print(f"Best match score: {result['overall_match_score']}")
    for col, score in result.items():
        # Identifying keys are not per-column scores
        if col not in ('json_row_index', 'xls_row_index', 'overall_match_score', 'original_date'):
            print(f"  {col}: {score}")
    print()  # Blank line between entries


# Today's date is used as a suffix for all report files
report_date = datetime.now().strftime('%Y-%m-%d')
comparison_file_name = f'detailed_comparison_results_{report_date}.csv'

# Tabulate the results and write them to the dated CSV report
comparison_df = pd.DataFrame(all_comparison_results)
comparison_df.to_csv(comparison_file_name, index=False)

# Split the comparison results into LLM-only and human-only concerts
def find_unique_dates(df1, df2, threshold=80, comparison_results=None):
    """Return the concerts that have no sufficiently good counterpart.

    Parameters:
        df1: frame whose index values appear as ``xls_row_index`` in the results.
        df2: frame whose index values appear as ``json_row_index`` in the results.
        threshold: minimum ``overall_match_score`` for a pair to count as matched.
        comparison_results: list of result dicts as produced by
            ``compare_rows_on_date``.  Defaults to the module-level
            ``all_comparison_results``.

    Returns:
        ``(unique_to_llm, unique_to_human)``:
        * ``unique_to_llm`` - result rows scoring below ``threshold``, plus one
          row (score 0, no ``xls_row_index``) for every ``df2`` row that is
          missing from the results altogether; such rows were never compared
          and so cannot have a match.
        * ``unique_to_human`` - the rows of ``df1`` whose index is not the
          ``xls_row_index`` of any result scoring at least ``threshold``.
    """
    if comparison_results is None:
        comparison_results = all_comparison_results

    records = list(comparison_results)
    compared = {record['json_row_index'] for record in records}
    records.extend(
        {'json_row_index': idx, 'xls_row_index': None, 'overall_match_score': 0}
        for idx in df2.index
        if idx not in compared
    )

    comparison_df = pd.DataFrame(records)
    if comparison_df.empty:
        # Without any record the frame has no columns; create them so the
        # lookups below work and both results come back empty/complete.
        comparison_df = pd.DataFrame(
            columns=['json_row_index', 'xls_row_index', 'overall_match_score']
        )

    is_matched = comparison_df['overall_match_score'] >= threshold
    unique_to_llm = comparison_df[~is_matched]
    matched_indices = comparison_df[is_matched]['xls_row_index'].unique()
    unique_to_human = df1[~df1.index.isin(matched_indices)]
    return unique_to_llm, unique_to_human

# Split the comparison results into LLM-only and human-only concerts
# (default threshold)
unique_llm_dates, unique_human_dates = find_unique_dates(xls_df, json_df)

# Dated output files for the two lists
unique_llm_file_name = f'unique_to_llm_concerts_{report_date}.csv'
unique_human_file_name = f'unique_to_human_concerts_{report_date}.csv'

# Show the row index and score of every unmatched LLM concert
print(f"\nUnique concerts found by LLM but not by human (total {len(unique_llm_dates)}):")
print(unique_llm_dates[['json_row_index', 'overall_match_score']])

# Show the date of every unmatched human concert
print(f"\nUnique concerts found by human but not by LLM (total {len(unique_human_dates)}):")
print(unique_human_dates[['konsert_datum']])

# Persist both lists
unique_llm_dates.to_csv(unique_llm_file_name, index=False)
unique_human_dates.to_csv(unique_human_file_name, index=False)


Row index in LLM data: 666
Best match row index in human data: 0
Best match score: 31.6
  konsert_namn: 23
  lokal_namn: 17
  konsert_biljettpris: 100
  konserttyp_namn: 56
  Producer: 0

Row index in LLM data: 668
Best match row index in human data: 0
Best match score: 37.2
  konsert_namn: 38
  lokal_namn: 17
  konsert_biljettpris: 100
  konserttyp_namn: 52
  Producer: 0

Row index in LLM data: 671
Best match row index in human data: 0
Best match score: 52.7
  konsert_namn: 30
  lokal_namn: 79
  konsert_biljettpris: 0
  konserttyp_namn: 55
  Producer: 18

Row index in LLM data: 433
Best match row index in human data: 1
Best match score: 75.30000000000001
  konsert_namn: 53
  lokal_namn: 100
  konsert_biljettpris: 55
  konserttyp_namn: 22
  Producer: 32

Row index in LLM data: 434
Best match row index in human data: 1
Best match score: 88.30000000000001
  konsert_namn: 66
  lokal_namn: 100
  konsert_biljettpris: 55
  konserttyp_namn: 100
  Producer: 32

Row index in LLM data: 488
Best 

In [10]:
# Turn the comparison results into a table of per-column scores, report the
# top-scoring match(es) and save both to dated CSV files.

# One dict per match: the two row indices, the overall score and the raw
# score of every compared column
detailed_matching_info = []

for result in all_comparison_results:
    print(f"Row index in LLM data: {result['json_row_index']}")
    print(f"Best match row index in human data: {result['xls_row_index']}")
    print(f"Best match score: {result['overall_match_score']}")

    # Start with the identifying fields of this match
    match_details = {
        'json_row_index': result['json_row_index'],
        'xls_row_index': result['xls_row_index'],
        'overall_match_score': result['overall_match_score']
    }

    # Every remaining key except 'original_date' is a per-column score
    for col, score in result.items():
        if col in ['json_row_index', 'xls_row_index', 'overall_match_score', 'original_date']:
            continue
        print(f"  {col}: {score}")
        match_details[col] = score

    detailed_matching_info.append(match_details)
    print()

# Convert the list to a DataFrame
detailed_matching_df = pd.DataFrame(detailed_matching_info)

# Display the first few rows of the detailed matching DataFrame
print(detailed_matching_df.head())

# Select the row(s) holding the maximum 'overall_match_score'.  With no
# comparison results the frame has no columns at all, so the score column
# must not be touched; the empty frame itself then stands in for "no match".
if detailed_matching_df.empty:
    best_match = detailed_matching_df
else:
    top_score = detailed_matching_df['overall_match_score'].max()
    best_match = detailed_matching_df[detailed_matching_df['overall_match_score'] == top_score]

# Display the best match details
if not best_match.empty:
    print("Best Match Details:")
    display(best_match)  # Using 'display' for better formatting in Jupyter Notebook
else:
    print("No matches found or no scores available.")

# Save the full score table
detailed_matching_file_name = f'detailed_matching_info_{report_date}.csv'
detailed_matching_df.to_csv(detailed_matching_file_name, index=False)

# Save the best match(es) separately
highlighted_best_match_file_name = f'best_match_details_{report_date}.csv'
best_match.to_csv(highlighted_best_match_file_name, index=False)

Row index in LLM data: 666
Best match row index in human data: 0
Best match score: 31.6
  konsert_namn: 23
  lokal_namn: 17
  konsert_biljettpris: 100
  konserttyp_namn: 56
  Producer: 0

Row index in LLM data: 668
Best match row index in human data: 0
Best match score: 37.2
  konsert_namn: 38
  lokal_namn: 17
  konsert_biljettpris: 100
  konserttyp_namn: 52
  Producer: 0

Row index in LLM data: 671
Best match row index in human data: 0
Best match score: 52.7
  konsert_namn: 30
  lokal_namn: 79
  konsert_biljettpris: 0
  konserttyp_namn: 55
  Producer: 18

Row index in LLM data: 433
Best match row index in human data: 1
Best match score: 75.30000000000001
  konsert_namn: 53
  lokal_namn: 100
  konsert_biljettpris: 55
  konserttyp_namn: 22
  Producer: 32

Row index in LLM data: 434
Best match row index in human data: 1
Best match score: 88.30000000000001
  konsert_namn: 66
  lokal_namn: 100
  konsert_biljettpris: 55
  konserttyp_namn: 100
  Producer: 32

Row index in LLM data: 488
Best 

Unnamed: 0,json_row_index,xls_row_index,overall_match_score,konsert_namn,lokal_namn,konsert_biljettpris,konserttyp_namn,Producer
136,432,59,116.4,100,100,64,100,100
138,487,59,116.4,100,100,64,100,100


In [8]:
# Turn the comparison results into a table of per-column scores and save it
# to a dated CSV file.

# One dict per match: the two row indices, the overall score and the raw
# score of every compared column
detailed_matching_info = []

for result in all_comparison_results:
    print(f"Row index in LLM data: {result['json_row_index']}")
    print(f"Best match row index in human data: {result['xls_row_index']}")
    print(f"Best match score: {result['overall_match_score']}")

    # Start with the identifying fields of this match
    match_details = {
        'json_row_index': result['json_row_index'],
        'xls_row_index': result['xls_row_index'],
        'overall_match_score': result['overall_match_score']
    }

    # Every remaining key except 'original_date' is a per-column score
    for col, score in result.items():
        if col in ['json_row_index', 'xls_row_index', 'overall_match_score', 'original_date']:
            continue
        print(f"  {col}: {score}")
        match_details[col] = score

    # Without this append the table and the CSV below would always be empty
    detailed_matching_info.append(match_details)
    print()

# Convert the list to a DataFrame
detailed_matching_df = pd.DataFrame(detailed_matching_info)

# Display the first few rows of the detailed matching DataFrame
print(detailed_matching_df.head())

# Optionally, save the detailed matching info to a CSV file for further analysis
detailed_matching_file_name = f'detailed_matching_info_{report_date}.csv'
detailed_matching_df.to_csv(detailed_matching_file_name, index=False)

# Display the table in the notebook
detailed_matching_df


Row index in LLM data: 666
Best match row index in human data: 0
Best match score: 31.6
  konsert_namn: 23
  lokal_namn: 17
  konsert_biljettpris: 100
  konserttyp_namn: 56
  Producer: 0

Row index in LLM data: 668
Best match row index in human data: 0
Best match score: 37.2
  konsert_namn: 38
  lokal_namn: 17
  konsert_biljettpris: 100
  konserttyp_namn: 52
  Producer: 0

Row index in LLM data: 671
Best match row index in human data: 0
Best match score: 52.7
  konsert_namn: 30
  lokal_namn: 79
  konsert_biljettpris: 0
  konserttyp_namn: 55
  Producer: 18

Row index in LLM data: 433
Best match row index in human data: 1
Best match score: 75.30000000000001
  konsert_namn: 53
  lokal_namn: 100
  konsert_biljettpris: 55
  konserttyp_namn: 22
  Producer: 32

Row index in LLM data: 434
Best match row index in human data: 1
Best match score: 88.30000000000001
  konsert_namn: 66
  lokal_namn: 100
  konsert_biljettpris: 55
  konserttyp_namn: 100
  Producer: 32

Row index in LLM data: 488
Best 

# Trying a heatmap recommended by gpt4