In [128]:
import pandas as pd
import numpy as np
import os


In [129]:
def process_textfile(file_path: str) -> pd.DataFrame:
    """Parse one fog-gate list text file into a DataFrame.

    The file is read line by line (surrounding whitespace stripped):

    * a line starting with a digit is a heading such as ``"3. Liurnia"``;
      the text after the first ``". "`` becomes the current area,
    * a line starting with ``-`` is an entry belonging to the current area,
    * blank lines and any other lines are ignored.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    pd.DataFrame
        One row per ``-`` line with the columns ``"Area"``, ``"Fog"`` and
        ``"Source"`` (``file_path`` without its extension). Entries that
        appear before the first heading get an empty ``"Area"``.

    Raises
    ------
    IndexError
        If a line starts with a digit but contains no ``". "`` separator.
    """
    # The source label is the path minus its extension (same result as the
    # previous ``file_path[:-4]`` for ".txt" files, but safe for other names).
    source = os.path.splitext(file_path)[0]

    data = {"Area": [], "Fog": [], "Source": []}
    area = ""

    with open(file_path, "r") as f:
        for raw_line in f:
            line = raw_line.strip()

            # Blank lines carry no information; skipping them also avoids
            # indexing into an empty string below.
            if not line:
                continue

            if line[0].isdigit():
                # Heading: drop the leading "N. " and remember the area name
                area = line.split(". ", 1)[1].strip()
            elif line[0] == '-':
                # Entry: drop the leading '-' and record it under the current area
                data["Area"].append(area)
                data["Fog"].append(line[1:].strip())
                data["Source"].append(source)

    return pd.DataFrame(data)

In [130]:
# List everything in the current working directory
all_files = os.listdir()

# Keep only the ".txt" files. os.listdir() returns entries in arbitrary,
# platform-dependent order, so sort them to make the order in which files are
# processed (and therefore the row order of the combined data) reproducible.
text_files = sorted(file for file in all_files if file.endswith(".txt"))

# Show which files will be processed
print(text_files)

['Belfries.txt', 'Boss evergaols.txt', 'Colosseum entrances.txt', 'Critical path.txt', 'Divine towers.txt', 'Legacy dungeons.txt', 'Major bosses.txt', 'Mini dungeons.txt', 'Minor sending gates.txt', 'Stranded graveyard.txt', 'Underground cities.txt']


In [131]:
# Parse every text file into its own DataFrame, then stack them into a single
# table with a fresh 0..n-1 index
dataframes = list(map(process_textfile, text_files))
combined_df = pd.concat(dataframes, ignore_index=True)

# Preview the first rows of the combined table
combined_df.head()

Unnamed: 0,Area,Fog,Source
0,Chapel of Anticipation - Grafted Scion,at the front of Grafted Scion's arena,Belfries
1,Chapel of Anticipation - Grafted Scion,at the side exit of Grafted Scion's arena,Belfries
2,Chapel of Anticipation - after Grafted Scion,after Grafted Scion's arena,Belfries
3,Liurnia,using the 2nd Belfry labeled Precipice of Anti...,Belfries
4,Liurnia,using the 3rd Belfry labeled Night Sky Unceasing,Belfries


In [132]:
# Order the rows by area name and renumber the index from 0
sorted_df = combined_df.sort_values("Area", ignore_index=True)

# Preview the first rows of the sorted table
sorted_df.head()

Unnamed: 0,Area,Fog,Source
0,Above Ainsel River Downstream,after the tunnel from Ainsel River Main,Underground cities
1,Academy of Raya Lucaria,getting abducted at the bottom of the elevator...,Legacy dungeons
2,Academy of Raya Lucaria,before Red Wolf's arena,Major bosses
3,Academy of Raya Lucaria - After Academy Crysta...,after Academy Crystal Cave boss,Mini dungeons
4,Academy of Raya Lucaria - Grand Library,at the front of Rennala's arena,Major bosses


In [133]:
# Save the full, area-sorted table to one Excel workbook (row index omitted)
sorted_df.to_excel("sorted_data.xlsx", engine='openpyxl', index=False)

In [134]:
# Maps each output region (its key is used as the Excel file name) to a list of
# substrings that identify the areas belonging to it. A later cell tests these
# with a case-sensitive `keyword in area` check against the "Area" column and
# then replaces each list with the matching area names.
#
# Key order matters: regions are exported in dict order, and rows claimed by an
# earlier region are removed before later regions are filtered, so an area that
# matches keywords of several regions goes to the first one listed
# (presumably why 'Leyndell, Ashen Capital' precedes 'Leyndell, Royal Capital').
area_keywords = {
    'Limgrave': ['Limgrave', 'Stormhill', 'Knowledge', 'Chapel of Anticipation', 'Stranded Graveyard', 'Beastman', 'Godrick', 'Stormveil', 'Margit'], 
    'Weeping Peninsula': ['Weeping Peninsula', 'Leonine Misbegotten'], 
    'Liurnia of the Lakes': ['Liurnia', 'Academy of Raya Lucaria', 'Makar', 'Bellum', 'Carian', 'Moonlight', 'Red Wolf of Radagon', 'Royal Knight', 'Ruin-Strewn'], 
    'Caelid': ['Redmane', 'Radahn', 'Caelid', 'Dragonbarrow'], 
    'Altus Plateau': ['Altus Plateau', 'Altus Tunnel', 'Capital Outskirts', 'Shaded Castle'], 
    'Mt. Gelmir': ['Mt. Gelmir', 'Volcano Manor', 'Rykard'], 
    'Leyndell, Ashen Capital': ['Ashen'],
    'Leyndell, Royal Capital': ['Royal Capital', 'Leyndell', 'Subterranean Shunning-Grounds', 'Divine Tower of East Altus', 'Divine Tower of West Altus', 'the Omen'],
    'Mountaintops of the Giants': ['Mountaintops', 'Niall', 'Sol', 'Fire Giant', 'Flame Peak', 'Forbidden'],
    'Consecrated Snowfield': ['Consecrated Snowfield'],
    "Miquella's Haligtree": ['Haligtree', 'Malenia'],
    'Crumbling Farum Azula': ['Farum Azula', 'Dragon Temple', 'Godskin Duo', 'Maliketh'],
    'Ainsel River': ['Ainsel', 'Nokstella'],
    'Nokron, Eternal City': ['Nokron', 'Regal', 'Valiant Gargoyles', 'Mimic Tear'],
    'Siofra River': ['Siofra', 'Ancestor Spirit'],
    'Lake of Rot': ["Lake of Rot", 'Astel, Naturalborn'],
    'Mohgwyn Palace': ['Lord of Blood', 'Mohgwyn'],
    'Deeproot Depths': ['Deeproot Depths'],
    'Roundtable': ['Roundtable']
}

# Prefix for the per-region Excel files. It is concatenated directly with the
# file name when exporting, so the trailing slash is required.
output_dir = 'output/'

In [135]:
# Turn each region's keyword list into the list of concrete "Area" values that
# contain at least one of its keywords. The lists in area_keywords are replaced
# in place, so after this cell the dict maps region -> matching area names.
unique_areas = sorted_df["Area"].unique()

for region in list(area_keywords):
    needles = area_keywords[region]
    area_keywords[region] = [
        name
        for name in unique_areas
        if any(needle in name for needle in needles)
    ]

print(area_keywords)

{'Limgrave': ['Cave of Knowledge', 'Cave of Knowledge - After Soldier of Godrick', 'Cave of Knowledge - From Seaside Ruins', 'Cave of Knowledge - Soldier of Godrick', 'Chapel of Anticipation', 'Chapel of Anticipation - Grafted Scion', 'Chapel of Anticipation - after Grafted Scion', 'Divine Tower of Limgrave', 'Godrick the Grafted', 'Limgrave', 'Limgrave - Church of Dragon Communion', 'Limgrave - Coastal Cave', 'Limgrave - Coastal Cave - Demi-Human Chiefs', "Limgrave - Fringefolk Hero's Grave", "Limgrave - Fringefolk Hero's Grave - Ulcerated Tree Spirit", 'Limgrave - Groveside Cave', 'Limgrave - Groveside Cave - Beastman of Farum Azula', 'Limgrave - Highroad Cave', 'Limgrave - Highroad Cave - Guardian Golem', 'Limgrave - Murkwater Catacombs', 'Limgrave - Murkwater Catacombs - Grave Warden Duelist', 'Limgrave - Murkwater Cave', 'Limgrave - Murkwater Cave - Patches', 'Limgrave - Stormfoot Catacombs', 'Limgrave - Stormfoot Catacombs - Erdtree Burial Watchdog', 'Limgrave - Waypoint Ruins - 

In [136]:
# Make sure the export folder exists before writing: to_excel() does not
# create missing directories. An empty directory part means "current folder".
target_dir = os.path.dirname(output_dir)
if target_dir:
    os.makedirs(target_dir, exist_ok=True)

# Track the not-yet-exported rows in a separate frame so that sorted_df is left
# untouched. Shrinking sorted_df itself made this cell non-repeatable: a second
# run would overwrite every region file with an empty sheet.
remaining_df = sorted_df

for key, areas_to_keep in area_keywords.items():
    # Rows that are still unassigned and whose area belongs to this region
    filtered_df = remaining_df[remaining_df["Area"].isin(areas_to_keep)]

    # One workbook per region, named after the region
    filtered_df.to_excel(f"{output_dir}{key}.xlsx", index=False, engine='openpyxl')

    # Remove the exported rows so an area matched by several regions is only
    # written once (the first region in dict order wins)
    remaining_df = remaining_df.drop(filtered_df.index)

# Anything left over was not matched by any region's areas
print(f"\nShould be empty:")
print(remaining_df)



Should be empty:
Empty DataFrame
Columns: [Area, Fog, Source]
Index: []
