# Notebook for working and clean functions / loops

### Identified **Next Steps** from this:
- For the chunks, use the min function and:
    - store indexes with no match at end of loop
    - create new chunks with only those indexes
    - call the function again to find new pairs
    - repeat until no indexes left
- Find how to create pairs of Portraits when dealing with mixed files (Ps and Ls)
    - The logic is probably: check the type, and if it is P, it has to be followed by another P only if type(i-1) != P
- Find how sorting the data can help to go faster or create better pairs
- Try to create an ML model based on a new table built by extracting the first 10k pairs

In [3]:
# The code below prints how many cores your CPU has.
# With mine (11 cores) the function runs in 3-4 mins.
# If you are on Windows, check whether you can leverage the power of your GPU.
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count
# CPU count as reported by multiprocessing.cpu_count(); echoed so it shows
# up in the cell output.
num_threads = cpu_count()
print(f"{num_threads}")

11


### Open the file

In [1]:
# Parse the input file: the first line holds the number of rows, and every
# following line is "<picture type> <tag count> <tag> <tag> ...".
with open("../../data/1_binary_landscapes.txt", "r") as file:
    num_rows = int(file.readline())
    data = []

    for _ in range(num_rows):
        fields = file.readline().strip().split()
        # Record layout: (picture_type, num_tags, tags)
        data.append((fields[0], int(fields[1]), fields[2:]))

# Prefix every record with its position in the file:
# (index, picture_type, num_tags, tags)
data_with_index = [(position,) + record for position, record in enumerate(data)]

# Order by tag count (field 2). list.sort is stable, so records with the same
# tag count keep their file order.
sorted_data = list(data_with_index)
sorted_data.sort(key=lambda record: record[2])

### Create chunks

In [2]:
chunk_size = 100

# Cut sorted_data into consecutive slices of at most chunk_size rows;
# the last slice may be shorter.
chunks = []
for start in range(0, len(sorted_data), chunk_size):
    chunks.append(sorted_data[start:start + chunk_size])

### Base loop

In [15]:
# For each of the first 1000 rows of sorted_data, find the later row that
# shares the most tags with it and record the pairing as best_order[i] = j.
best_order = {}
processed_indices = set()

# Build every row's tag set once so the membership tests in the innermost
# loop are O(1) instead of a linear scan of the other row's tag list.
tag_sets = [set(row[3]) for row in sorted_data]

for i in range(min(1000, len(sorted_data))):
    # Rows already chosen as someone's partner do not start a pair themselves.
    if i in processed_indices:
        continue

    tags_i = sorted_data[i][3]
    common_tags = {}
    for j in range(i + 1, len(sorted_data)):
        other_tags = tag_sets[j]
        # Count per element of row i's tag list (a repeated tag counts twice,
        # exactly like the element-by-element loop it replaces).
        counter = sum(1 for element in tags_i if element in other_tags)
        if counter:
            common_tags[j] = counter

    # Only rows sharing at least one tag are stored, so a non-empty dict means
    # a partner exists. max() returns the first (lowest j) of equal counts.
    if common_tags:
        best_order[i] = max(common_tags, key=common_tags.get)
        # NOTE(review): this only stops j from starting its own pair later;
        # nothing prevents another i from picking the same j again. Confirm
        # whether partners are meant to be unique.
        processed_indices.add(best_order[i])

print(best_order)

{0: 28521, 1: 31633, 2: 18356, 3: 27091, 4: 76, 5: 56707, 6: 7482, 7: 18741, 8: 12396, 9: 6049, 10: 52640, 11: 51930, 12: 29539, 13: 54702, 14: 4348, 15: 7595, 16: 3186, 17: 1789, 18: 40358, 19: 3353, 20: 30442, 21: 9150, 22: 16154, 23: 22684, 24: 17309, 25: 26146, 26: 35510, 27: 52263, 28: 65050, 29: 27533, 30: 18011, 31: 46924, 32: 20946, 33: 53168, 34: 22793, 35: 40423, 36: 12017, 37: 47000, 38: 19569, 39: 699, 40: 29828, 41: 57617, 42: 12500, 43: 16866, 44: 2206, 45: 15492, 46: 40244, 47: 49419, 48: 1160, 49: 24867, 50: 29035, 51: 49513, 52: 55499, 53: 5980, 54: 895, 55: 26742, 56: 8754, 57: 13153, 58: 58725, 59: 19195, 60: 69016, 61: 26072, 62: 16878, 63: 16226, 64: 47736, 65: 46724, 66: 5753, 67: 3294, 68: 8103, 69: 8971, 70: 14017, 71: 24920, 72: 27154, 73: 16871, 74: 12375, 75: 28372, 77: 18997, 78: 1467, 79: 22666, 80: 17792, 81: 27289, 82: 24687, 83: 30729, 84: 7594, 85: 53305, 86: 31315, 87: 18408, 88: 29678, 89: 7592, 90: 54812, 91: 26739, 92: 1124, 93: 39535, 94: 31042, 95

### Process data to get the max tags in common

In [None]:
#process the max
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count

# Define a function to process a chunk of data within a specified range
def process_chunk_range(data_range, rows=None):
    """Pair each row of a slice with the later row sharing the MOST tags.

    Args:
        data_range: ``(chunk_start, chunk_end)`` half-open index range into
            the row list.
        rows: optional list of ``(picture_type, num_tags, tags)`` records.
            Defaults to the module-level ``data`` so existing
            ``process_chunk_range(data_range)`` calls keep working.

    Returns:
        dict mapping the global index of a row to the global index of its
        partner. Rows that share no tag with any later row of the same slice
        are left out.
    """
    chunk_start, chunk_end = data_range
    source = data if rows is None else rows
    chunk = source[chunk_start:chunk_end]

    # Build each tag set once instead of once per (i, j) comparison.
    tag_sets = [set(row[2]) for row in chunk]

    chunk_best_order = {}
    for i, current_tags in enumerate(tag_sets):
        best_j = None
        best_count = 0

        # Only later rows of the same slice are candidates.
        for j in range(i + 1, len(tag_sets)):
            count = len(current_tags & tag_sets[j])
            # Zero overlaps are ignored; ">=" keeps the LAST row among ties.
            if count and count >= best_count:
                best_j, best_count = j, count

        if best_j is None:
            continue

        # i and best_j are offsets inside the slice: shift both by chunk_start
        # so key and value live in the same (global) index space.
        chunk_best_order[chunk_start + i] = chunk_start + best_j
    return chunk_best_order

# Slice size and thread-pool size
chunk_size = 8000
num_threads = cpu_count()

# Half-open (start, end) index ranges covering all of data; the last range is
# clipped to len(data).
data_ranges = []
for range_start in range(0, len(data), chunk_size):
    data_ranges.append((range_start, min(range_start + chunk_size, len(data))))

# Run every range through process_chunk_range on the thread pool and merge the
# per-range dictionaries as they finish.
best_order = {}
import time
st = time.time()

with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = [executor.submit(process_chunk_range, span) for span in data_ranges]
    for finished in as_completed(futures):
        best_order.update(finished.result())

et = time.time()
# Elapsed wall-clock time converted from seconds to minutes
elapsed_time = round((et - st)/60,2)
print('Execution time:', elapsed_time, 'minutes')

### Process tags to have pairs with least in common

In [None]:
#process the min
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count

# Define a function to process a chunk of data within a specified range
def process_chunk_range(data_range, rows=None):
    """Pair each row of a slice with the later row sharing the FEWEST tags.

    Only rows with at least one tag in common are candidates, so the chosen
    partner has the smallest non-zero overlap.

    Args:
        data_range: ``(chunk_start, chunk_end)`` half-open index range into
            the row list.
        rows: optional list of ``(picture_type, num_tags, tags)`` records.
            Defaults to the module-level ``data`` so existing
            ``process_chunk_range(data_range)`` calls keep working.

    Returns:
        dict mapping the global index of a row to the global index of its
        partner. Rows that share no tag with any later row of the same slice
        are left out.
    """
    chunk_start, chunk_end = data_range
    source = data if rows is None else rows
    chunk = source[chunk_start:chunk_end]

    # Build each tag set once instead of once per (i, j) comparison.
    tag_sets = [set(row[2]) for row in chunk]

    chunk_best_order = {}
    for i, current_tags in enumerate(tag_sets):
        best_j = None
        best_count = 0

        # Only later rows of the same slice are candidates.
        for j in range(i + 1, len(tag_sets)):
            count = len(current_tags & tag_sets[j])
            # Skip zero overlaps; strict "<" keeps the FIRST row among ties.
            if count and (best_j is None or count < best_count):
                best_j, best_count = j, count

        if best_j is None:
            continue

        # i and best_j are offsets inside the slice: shift both by chunk_start
        # so key and value live in the same (global) index space.
        chunk_best_order[chunk_start + i] = chunk_start + best_j
    return chunk_best_order

# Define the chunk size and the number of worker threads
chunk_size = 8000
num_threads = cpu_count()

# Divide the data into half-open (start, end) index ranges; the last range is
# clipped to len(data).
data_ranges = [(i, min(i + chunk_size, len(data))) for i in range(0, len(data), chunk_size)]

# Process each range on the thread pool and merge the per-range results as
# they finish.
best_order = {}
import time
st = time.time()

with ThreadPoolExecutor(max_workers=num_threads) as executor:
    futures = [executor.submit(process_chunk_range, data_range) for data_range in data_ranges]
    for future in as_completed(futures):
        chunk_best_order = future.result()
        best_order.update(chunk_best_order)

et = time.time()
# (et - st) is in seconds; dividing by 60 yields minutes, so label it as such.
elapsed_time = round((et - st)/60,2)
print('Execution time:', elapsed_time, 'minutes')

### Process tags to have min in common, iterating until either the data is empty or no more pairs are created

This code can process up to 60k rows out of 80k in 10 mins; after 60k, fewer and fewer pairs are found, so it takes a lot of time.

For now we can create as many pairs as we can, then shuffle the rest and add them in a random order to see the score.

In [None]:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from multiprocessing import cpu_count

# df starts as another name for the same list object as sorted_data (no copy
# is made). The main loop further down rebinds df to a new filtered list on
# every pass.
df = sorted_data

# Define the function to process each chunk range
def process_chunk_range(data_range, data):
    """Pair each row of a slice with the later row sharing the FEWEST tags.

    Only rows with at least one tag in common are candidates, so the chosen
    partner has the smallest non-zero overlap.

    Args:
        data_range: ``(chunk_start, chunk_end)`` half-open index range into
            ``data``.
        data: list of ``(index, picture_type, num_tags, tags)`` records; field
            0 is the row's identifier and field 3 its tag list.

    Returns:
        dict mapping a row's identifier (field 0) to the identifier of its
        partner. Rows that share no tag with any later row of the same slice
        are left out.
    """
    chunk_start, chunk_end = data_range
    chunk = data[chunk_start:chunk_end]

    # Build each tag set once instead of once per (i, j) comparison.
    tag_sets = [set(row[3]) for row in chunk]

    chunk_best_order = {}
    for i, current_tags in enumerate(tag_sets):
        best_j = None
        best_count = 0

        # Only later rows of the same slice are candidates.
        for j in range(i + 1, len(tag_sets)):
            count = len(current_tags & tag_sets[j])
            # Skip zero overlaps; strict "<" keeps the FIRST row among ties.
            if count and (best_j is None or count < best_count):
                best_j, best_count = j, count

        if best_j is None:
            continue

        # Store identifiers (field 0), not slice offsets, so results from
        # different slices can be merged into one dictionary.
        chunk_best_order[chunk[i][0]] = chunk[best_j][0]

    return chunk_best_order

# Keep pairing the rows that are still unpaired until a pass finds no pair.
total_time = 0
processed_indexes = set()
all_best_order = {}

# Slice size and thread-pool size are the same for every pass
chunk_size = 8000
num_threads = cpu_count()

while True:
    # Half-open index ranges over the rows that are still in df
    data_ranges = []
    for range_start in range(0, len(df), chunk_size):
        data_ranges.append((range_start, min(range_start + chunk_size, len(df))))

    # Pair rows inside each range on the thread pool and merge the results
    best_order = {}
    st = time.time()

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_chunk_range, span, df) for span in data_ranges]
        for finished in as_completed(futures):
            best_order.update(finished.result())

    et = time.time()
    elapsed_time = round((et - st)/60, 2)
    print('Execution time:', elapsed_time, 'minutes')
    total_time += elapsed_time

    # Both members of every pair count as processed
    processed_indexes.update(best_order.keys(), best_order.values())

    # Drop processed rows; the remaining rows keep their relative order
    df = [row for row in df if row[0] not in processed_indexes]

    # Accumulate this pass's pairs
    all_best_order.update(best_order)

    # A pass that produced no pair means nothing more can be matched
    if not best_order:
        break

print("All data processed.")
print(f"Total Time needed: {total_time}")


### Write results to a file

In [None]:
import random

# Paired rows plus the rows that never found a partner
total_length = len(processed_indexes) + len(df)

# Put the leftover rows in random order (in place)
random.shuffle(df)

# position in the shuffled list -> row identifier (field 0)
shuffled_frames = {position: row[0] for position, row in enumerate(df)}

In [None]:
# Write the final order to disk: a header line with the number of ids,
# followed by one row identifier per line.
output_path = "../data/binary_landscapes_output_2.txt"

ordered_ids = []
written = set()  # O(1) "already emitted?" checks

# 1) Follow the pair chains: emit a key, then jump to its partner, and keep
#    going until the chain reaches an id that is already written or that has
#    no partner of its own. The last id of a chain is emitted too, even when
#    it never appears as a key in all_best_order.
for start_key in all_best_order:
    current_key = start_key
    while current_key is not None and current_key not in written:
        ordered_ids.append(current_key)
        written.add(current_key)
        current_key = all_best_order.get(current_key)

# 2) Append the unpaired rows in their shuffled order. The dictionary VALUES
#    are the row identifiers; the keys are only positions in the shuffle.
for frame_id in shuffled_frames.values():
    if frame_id not in written:
        ordered_ids.append(frame_id)
        written.add(frame_id)

with open(output_path, "w") as f:
    # The header is the number of ids actually written, so it always matches
    # the body of the file.
    f.write(str(len(ordered_ids)) + "\n")
    f.writelines(f"{picture_id}\n" for picture_id in ordered_ids)

print(f"Wrote {len(ordered_ids)} ids to {output_path}")

### remove duplicates from the file

In [None]:
# Remove duplicate ids from the output file while keeping the order of first
# appearance, then rewrite the header with the new count.
file_path = "/Users/julien/Documents/EPITA/S2/kaggle_week/data/binary_landscapes_output_2.txt"
with open(file_path, "r") as f:
    lines = f.readlines()

# The first line is the count header, not an id: leave it out of the
# de-duplication and regenerate it below.
body = lines[1:]

# Make sure every entry ends with a newline so a final unterminated line still
# compares equal to an earlier occurrence of the same id.
body = [line if line.endswith("\n") else line + "\n" for line in body]

# dict keys keep insertion order, so this drops repeats without reordering
# (a set would scramble the sequence).
unique_lines = list(dict.fromkeys(body))

# Overwrite the file with the new header followed by the unique ids
with open(file_path, "w") as f:
    f.write(str(len(unique_lines)) + "\n")
    f.writelines(unique_lines)

## /!\ STOP HERE /!\

In [21]:
# Legacy base loop - do not execute (original note: "runs for ever").
# Each outer pass pairs the first 1000 rows of sorted_data against every row
# after them, then deletes those 1000 rows from sorted_data IN PLACE, until
# the list is empty. Writes into the best_order dict created in an earlier cell.
while len(sorted_data)>0:
    for i in range(len(sorted_data[:1000])):

        # j -> number of row i's tags that also occur in row j (only stored
        # when at least one tag matches)
        common_tags = {}
        for j in range(i+1, len(sorted_data)):
            counter = 0
            for element in sorted_data[i][3]:
                if element in sorted_data[j][3]:
                    counter += 1
                    common_tags[j] = counter
        
        # NOTE(review): the name says "max" but this takes min(); because only
        # non-zero counters are stored, the check below is true whenever any
        # row shared a tag with row i.
        max_common_tags = min(common_tags.values(), default=0)
        if max_common_tags > 0:
            # NOTE(review): i and j are positions in the current (shrinking)
            # list, so keys 0..999 are overwritten on every outer pass -
            # confirm whether original row ids were intended here.
            best_order[i] = max(common_tags, key=common_tags.get)
            
    print(f"this length of sorted_data before deletion: {len(sorted_data)}")
    # Drop the rows handled in this pass; this mutates sorted_data itself
    del sorted_data[:1000]
    print(f"this length of sorted_data after deletion: {len(sorted_data)}")

this length of sorted_data before deletion: 75000
this length of sorted_data after deletion: 74000
this length of sorted_data before deletion: 74000
this length of sorted_data after deletion: 73000
this length of sorted_data before deletion: 73000
this length of sorted_data after deletion: 72000
this length of sorted_data before deletion: 72000
this length of sorted_data after deletion: 71000
this length of sorted_data before deletion: 71000
this length of sorted_data after deletion: 70000
this length of sorted_data before deletion: 70000
this length of sorted_data after deletion: 69000
this length of sorted_data before deletion: 69000
this length of sorted_data after deletion: 68000
this length of sorted_data before deletion: 68000
this length of sorted_data after deletion: 67000
this length of sorted_data before deletion: 67000
this length of sorted_data after deletion: 66000
this length of sorted_data before deletion: 66000
this length of sorted_data after deletion: 65000
this lengt