In [None]:
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import process, fuzz

In [None]:
# Load the raw SVSim texture export.
# 'latin1' maps every possible byte value to a character, so this read can never
# raise UnicodeDecodeError; the former `except UnicodeDecodeError` fallback to
# 'cp1252' was therefore unreachable and has been removed.
# NOTE(review): if the file was produced on Windows, 'cp1252' may be the truer
# encoding for bytes 0x80-0x9F (curly quotes, dashes) - confirm with the data owner.
df_svsim = pd.read_csv("svsim_texture_data.csv", encoding='latin1')

# Rename the SVSim column names to the names used by the rest of this notebook
df_svsim.rename(columns={'X': 'UTMX','Y': 'UTMY', 'TOP_BGS':'INTERVALSTART',
              'BASE_BGS':'INTERVALEND', 'LITH_DESC': 'DESCRIPTION' }, inplace=True)

# Display the first few rows to confirm the renaming
df_svsim.head()

In [None]:
# Fail fast when the renamed 'DESCRIPTION' column is absent
# (`in` on a DataFrame tests its column labels).
if 'DESCRIPTION' not in df_svsim:
    raise KeyError("The 'DESCRIPTION' column does not exist in the DataFrame.")

In [None]:
# Correct known misspellings inside 'DESCRIPTION'.
# Each key is applied as a regular expression, so it is replaced wherever it
# occurs within a cell (case-sensitively), not only when it equals the whole cell.
df_svsim['DESCRIPTION'] = df_svsim['DESCRIPTION'].replace(
    regex={
        'mudstrone': 'mudstone',
        'sandsttone': 'sandstone',
        'Hard Pan': 'hardpan',
    }
)


In [None]:
# USCS codes that are accepted as-is, written in their parenthesised form
uscs_keywords_keep = [
    '(ML)', '(GP)', '(SP-SM)', '(GC)', '(CL)', '(CH)', '(MH)', '(GP-GM)', '(CL/ML)', '(GW/GM)',
    '(GP-GC)', '(ML-SM)', '(SP-SM)', '(SC-SM)', '(SC)', '(SW/GW)', '(SM/GM)', '(GP-SP)',
    '(CL/SM)', '(ML/GW)', '(SP & GP)', '(GW/GC)', '(GW-GM/GW)', '(SP-SC)','(SP/SM)', '(SM/SW)','(SP/GP/COBL)',
]

# NOTE: a later cell of this notebook defines another `extract_uscs` with a
# different signature; once that cell has run, it replaces this one.
def extract_uscs(text):
    """Pull a known USCS code out of a free-text description.

    Parameters
    ----------
    text : object
        Description text. Anything that is not a ``str`` is returned untouched
        with the code ``'unknown'``.

    Returns
    -------
    tuple
        ``(code, remaining_text)``. ``code`` is the parenthesised, upper-case
        entry from ``uscs_keywords_keep`` or ``'unknown'``. When the code was
        found inside parentheses, ``remaining_text`` is ``text`` with that
        parenthesised part removed; otherwise it is ``text`` unchanged.
    """
    if not isinstance(text, str):
        return 'unknown', text

    # Search the original text (not an upper-cased copy) so the match offsets
    # are guaranteed to be valid for slicing `text` below.
    match = re.search(r'\((.*?)\)', text)
    if match:
        code_in_parentheses = f"({match.group(1).strip().upper()})"
        # The original compared against an undefined name `uscs_codes`,
        # which raised NameError for every description containing parentheses.
        if code_in_parentheses in uscs_keywords_keep:
            cleaned_text = text[:match.start()].strip() + " " + text[match.end():].strip()
            return code_in_parentheses, cleaned_text.strip()

    # The text itself may be a bare code such as "ML" or "sp-sm". The entries of
    # uscs_keywords_keep all carry parentheses, so wrap the text before comparing.
    bare_code = f"({text.strip().upper()})"
    if bare_code in uscs_keywords_keep:
        return bare_code, text

    return 'unknown', text

In [None]:
# Lookup used to normalise free-text values found in the 'USCS' column:
# each key is the code to store, each list holds the spellings that map to it.
# NOTE(review): 'other-fine' -> 'GW/SW' and 'other-coarse' -> 'CH/ML' look inverted
# (GW/SW are gravel/sand codes, CH/ML are clay/silt codes) - confirm before relying on them.
USCS_conversions = {
    'SHLE': ['shale'],
    'CLSN': ['claystone', 'mudstone'],
    'STST': ['slst', 'siltstone'],
    'TPSL': ['topsoil', 'soil/organic', 'tp'],
    'VFRG': ['volcanic frags'],
    'CONG': ['conglomerate'],
    'COBL': ['cobbles'],
    'SDST': ['sandstone'],
    'TUFF': ['tuff'],
    'ASH': ['ash'],
    'GW/SW': ['other-fine'],
    'CH/ML': ['other-coarse'],
    'FRAC': ['FRCT'],
}

# Normalise one USCS value through a conversion table (case-insensitive)
def convert_uscs(uscs_value, conversions):
    """Return the conversion key whose alias list contains ``uscs_value``.

    The comparison ignores case. Values that are not strings, and strings that
    match no alias, are returned unchanged. When several keys could match, the
    first one in ``conversions`` iteration order wins.
    """
    if not isinstance(uscs_value, str):
        return uscs_value

    needle = uscs_value.lower()
    for code, aliases in conversions.items():
        if any(needle == alias.lower() for alias in aliases):
            return code
    return uscs_value

# Normalise every entry of the 'USCS' column through USCS_conversions
df_svsim['USCS'] = df_svsim['USCS'].map(lambda code: convert_uscs(code, USCS_conversions))

In [None]:
# Words that describe colour rather than material
color_qualifiers = [
    'red', 'green', 'black', 'brown', 'gray', 'grayish', 'white', 'greenish',
    'reddish', 'yellow', 'dark', 'light', 'tan', 'colored', 'blue', 'brownblack',
    'yellowish', 'purple', 'orange', 'brw.', 'grey', '(blue)', 'red,',
]

# Words and phrases that describe texture, consistency or condition
# (a few entries are repeated; the list is kept exactly as it was)
texture_qualifiers = [
    'loose', 'hard', 'coarse', 'fine', 'compacted', 'cemented', 'crushed',
    'salt & pepper', 'Minor', 'medium', 'large', 'firm', 'small', 'fracture', 'frac', 'little',
    'fractured', 'soft', 'minor', 'eroded', 'tight', 'broken', 'brittle', 'chunky', 'crusty',
    'med.', 'packed', 'brittle', 'porous', 'pea', 'welded', 'mixed', 'softer', 'joint',
    'chunky', 'large', 'big', 'solid', 'firm', 'hard', 'heavy', 'very stiff', '(solid)', '(cement)',
    'laminated', 'poorly graded', 'well graded', 'sticky', 'grained', 'graded', 'med', 'dry',
    '(cemented)', '(set)', '(water)', 'soft-med', 'stiff', 'crumbly', 'granulated', 'streaky',
    'tough', '(varied)', 'gritty', 'holey', 'impervious', 'rubbery', 'rough', 'stringers',
    'ashy', 'porous', '(balls)', '(tough)', '(hard drilling)', 'mottled', 'poorly',
    'no cementation', 'is grained', 'subrounded', 'no staining', 'no odor', 'with holes',
    '( water)', 'water',
]

# Copy data from 'DESCRIPTION' to a new column 'TEXTURE'.
# The qualifier extraction and removal steps later in this cell read from 'TEXTURE',
# and 'TEXTURE' is dropped again at the end of the cell.
df_svsim['TEXTURE'] = df_svsim['DESCRIPTION']

# Function to extract qualifiers from a string
def extract_qualifiers(description, qualifiers):
    """Return the qualifiers that occur in ``description``, in order of appearance.

    The description is lower-cased and split on whitespace. Qualifiers are
    compared case-insensitively and may span several words ('very stiff',
    'salt & pepper'); at each position the longest matching qualifier wins.

    Parameters
    ----------
    description : object
        Text to scan. Missing values (``None``/NaN) yield an empty list; other
        non-string values are converted with ``str``.
    qualifiers : iterable of str
        Qualifier words or phrases to look for.

    Returns
    -------
    list of str
        Lower-cased qualifiers found, one entry per match.
    """
    if pd.isna(description):
        return []

    words = str(description).lower().split()
    # Normalise case and inner whitespace so phrases compare against joined tokens.
    lookup = {' '.join(q.lower().split()) for q in qualifiers}
    lookup.discard('')
    max_len = max((len(q.split()) for q in lookup), default=0)

    found = []
    i = 0
    while i < len(words):
        # Try the longest candidate phrase first, then shorter ones.
        for size in range(min(max_len, len(words) - i), 0, -1):
            phrase = ' '.join(words[i:i + size])
            if phrase in lookup:
                found.append(phrase)
                i += size
                break
        else:
            i += 1
    return found

# Build the colour and texture qualifier strings from 'TEXTURE'.
# The qualifiers found in each row are joined with single spaces; rows with no
# qualifier yield an empty join, which `or` turns into NaN.
df_svsim['COLORQUALIFIER_EXTRACTED'] = df_svsim['TEXTURE'].map(
    lambda text: ' '.join(extract_qualifiers(text, color_qualifiers)) or np.nan
)
df_svsim['TEXTUREQUALIFIER_EXTRACTED'] = df_svsim['TEXTURE'].map(
    lambda text: ' '.join(extract_qualifiers(text, texture_qualifiers)) or np.nan
)

# Function to remove qualifiers from a string
def remove_qualifiers(description, qualifiers):
    """Return ``description`` lower-cased with every qualifier removed.

    The description is lower-cased and split on whitespace. Qualifiers are
    compared case-insensitively and may span several words ('very stiff',
    'salt & pepper'); at each position the longest matching qualifier is
    removed. The remaining words are joined with single spaces.

    Parameters
    ----------
    description : object
        Text to clean. Missing values (``None``/NaN) are returned unchanged;
        other non-string values are converted with ``str``.
    qualifiers : iterable of str
        Qualifier words or phrases to strip.

    Returns
    -------
    str or missing value
        The cleaned text, or the original missing value.
    """
    if pd.isna(description):
        return description

    words = str(description).lower().split()
    # Normalise case and inner whitespace so phrases compare against joined tokens.
    lookup = {' '.join(q.lower().split()) for q in qualifiers}
    lookup.discard('')
    max_len = max((len(q.split()) for q in lookup), default=0)

    kept = []
    i = 0
    while i < len(words):
        # Try the longest candidate phrase first, then shorter ones.
        for size in range(min(max_len, len(words) - i), 0, -1):
            if ' '.join(words[i:i + size]) in lookup:
                i += size
                break
        else:
            kept.append(words[i])
            i += 1
    return ' '.join(kept)

# Create the NEW_DESCRIPTION column: 'TEXTURE' with all colour and texture qualifiers removed.
# The combined qualifier list is built once instead of once per row, and the function is
# applied to the single column it needs rather than to whole rows.
all_qualifiers = color_qualifiers + texture_qualifiers
df_svsim['NEW_DESCRIPTION'] = df_svsim['TEXTURE'].apply(
    lambda text: remove_qualifiers(text, all_qualifiers)
)

# Update original COLORQUALIFIER and TEXTUREQUALIFIER columns
df_svsim['COLORQUALIFIER'] = df_svsim['COLORQUALIFIER_EXTRACTED']
df_svsim['TEXTUREQUALIFIER'] = df_svsim['TEXTUREQUALIFIER_EXTRACTED']

# Drop the intermediate columns.
# 'TEXTURE_MODIFIED' used to be computed here and dropped immediately without being
# used, so that computation has been removed. 'TEXTUREMODIFIER1' is still dropped when
# the input file provides it, exactly as before; errors='ignore' covers the case where
# either of those two columns is absent.
df_svsim.drop(
    columns=['COLORQUALIFIER_EXTRACTED', 'TEXTUREQUALIFIER_EXTRACTED', 'TEXTURE',
             'TEXTUREMODIFIER1', 'TEXTURE_MODIFIED'],
    errors='ignore',
    inplace=True,
)

In [None]:
# Lithology terms that are kept when scanning descriptions.
# This is a set, so literals that were written twice have been listed once.
keywords_keep = {
    # Compound soil types
    'gravelly clay', 'sandy loam', 'silty clay', 'pebbly loam', 'cobbly sand', 'sandy mud',
    'clayey loam', 'sandy clay', 'silty loam', 'gravelly sand', 'pebbley sand', 'clayey wood',
    'cobbley clay', 'loamy sand', 'clayey gravel', 'gravelly loam', 'pebbley clay', 'gravelly wood',
    'sandy gravel', 'clayey sand', 'silty gravel', 'loamy gravel', 'silty sand', 'mucky mud',
    'gravelly silt', 'pebbley gravel', 'cobbley loam', 'clayey silt', 'gravelly clayey sand',
    'loamy clay', 'pebbley loamy', 'sandy silty', 'cobbley gravel', 'clayey sandy',
    'cobbley sandy pebbles',
    'silty cobbly', 'gravelly pebbly', 'sandy cobble', 'gravelly sandy', 'gravelly cobbles',
    'rocky clay', 'rocky loam', 'rocky gravel', 'rocky silt', 'rocky sand', 'gravelly boulders',
    'cobbley gravelly wood',
    'cobbley gravelly pebbles',

    # Sedimentary rocks
    'sandstone', 'conglomerate', 'shale', 'siltstone', 'limestone', 'cobblestone', 'mudstone',

    # Soil classifications
    'silt', 'sand', 'gravel', 'clay', 'boulder', 'loam', 'cobble', 'gravels', 'cobbles',
    'boulders', 'clays', 'mud', 'wood',
    'pebbles',

    # Soil descriptors
    'sticky clay', 'fat clay', 'lean clay', 'hardpan', 'pan', 'organic', 'adobe', 'weathered',
    'poorly graded sand', 'well graded sand',
    'poorly graded gravel', 'well graded gravel',

    # Rocks and minerals
    'basalt', 'basaltic', 'pumice', 'latite', 'volcanics', 'volcanic', 'cinder', 'ash', 'lime',
    'tufa', 'tuff', 'lava', 'rhyolite', 'granite', 'diorite', 'quartz', 'gabbro', 'quartzite',
    'granodiorite', 'igneous', 'andesite', 'greenstone', 'slate', 'schist', 'serpentine',
    'metasediment', 'phyllite', 'argillite', 'bluestone', 'soapstone', 'chert', 'fractured',
    'fractured rock', 'gouge', 'hardrock', 'rock', 'bedrock', 'frac', 'tuscan', 'lapilli',

    # Other geological terms
    'sediment', 'alluvium', 'peat', 'organics', 'topsoil', 'soil',
}

In [None]:
# Select only the key words to convert into USCS
descrip = df_svsim['DESCRIPTION']

# Build the regex pattern.
# A regex alternation takes the first alternative that matches, not the longest, and a
# set has no stable iteration order. Sorting longest-first (ties alphabetical) makes the
# result reproducible and lets 'fractured rock' win over 'fractured'.
ordered_keywords = sorted(keywords_keep, key=lambda kw: (-len(kw), kw))
pattern = r'\b(?:' + '|'.join(map(re.escape, ordered_keywords)) + r')\b'
keyword_regex = re.compile(pattern, re.IGNORECASE)

# Apply the regex to every description; str() turns missing values into text
# that contains no keyword, so they produce an empty list.
extracted_words = [keyword_regex.findall(str(d)) for d in descrip]

# Join the keywords of each row into a single lower-case string
separator = ' ,'
df_svsim['KEYWORDS'] = [separator.join(words).lower() for words in extracted_words]

# Display the updated DataFrame
df_svsim.head()

In [None]:
classification_mapping = {
    # USCS Soil Classifications
    'CH': ['clay','clays', 'adobe clay', 'sticky clay', 'fat clay', 'adobe'],
    'CH-SC': ['clay sand'],
    'SC-GP': ['gravelly clayey sand'],
    'CL': ['lean clay', 'silty clay', 'sandy clay', 'gravelly clay',],
    'OH': ['organic clay'], 
    'TPSL': ['soil', 'top soil', 'topsoil', 'silty soil'],
    'GP': ['poorly graded gravel', 'gravel','cobbley gravelly pebbles','gravels'],
    'COBL': ['cobble', 'boulder','cobbles', 'boulders', 'pebbles'],
    'GW': ['well graded gravel', 'alluvium', 'pebbles', 'pebbley gravel', 'gravely cobbles', 'decomposed granite'],
    'SP': ['poorly graded sand', 'sand'],
    'SP-GP': ['gravelly sand', 'sandy gravel','sediment'],
    'SW': ['well graded sand', 'sandy'],
    'SW-GW': ['pebbley sand'],
    'SM-SC': ['sandy shale and sand'],
    'SM': ['silty sand'],
    'SC': ['clayey sand'],
    'ML': ['loam', 'clayey loam', 'hardpan','pan','silt', 'sandy silt', 'clayey silt', 'sandy shale'],
    'PT': ['muck', 'peat', 'organics', 'mud', 'mucky mud', 'wood'],
    'CL-PT': ['woody clay'],
    'GC': ['gravelly clayey'],
    'GM':['silty pebbles'],

    # Volcanic and Igneous Rocks
    'BSLT': ['basalt','andesite','latite', 'basaltic',],
    'VOLC': [ 'volcanics','volcanic'],
    'ASH': ['ash'],
    'LAVA': ['lava' ],
    'TUFF': ['tuff'],
    'VFRG': ['pumice'],    
    'IGNS': ['diorite', 'gabbro' ],
    'GRNT': ['granite', 'quartzite', 'granodiorite','quartz'],
    
    # Metamorphic Rocks
    'SCHT': ['slate', 'schist'],
    'META': ['greenstone',  'serpentine', 'phyllite', 'argillite', 'soapstone'],

    # Sedimentary Rocks
    'SDST': ['sandstone'],
    'CONG': ['conglomerate', 'cobblestone'],
    'SHLE': ['shale'], 
    'STST': ['siltstone'],
    'CLSN': ['mudstone'],
    'LMST': ['limestone','lime'],
    'LMST-CL': ['clayey lime'],

    # Double Porosity Rocks (fractured rocks)
    'FRAC': ['fractured', 'fracture', 'fractured rock'],

    # Miscellaneous Classifications
    'GP-OH': ['dirty gravel'],
    'SP-OH': ['dirty sand'],
    'OH': ['dirty top soil','mucky mud', 'organic'],
    'CL-GRNT': ['granitic clay'],
    'SP-GRNT': ['granitic sand'],
    'ROCK': ['rock' , 'chert','bedrock',],
}

In [None]:
# Updated function to derive USCS codes from extracted keyword text
def extract_uscs(text, current_uscs):
    """Derive a USCS classification string for one row.

    Parameters
    ----------
    text : object
        Keyword text for the row; comma-separated keyword phrases are expected.
        Non-string values are treated as having no keywords.
    current_uscs : object
        Value already present in the row's USCS field. Any string other than
        'unknown' (case-insensitive) is returned unchanged.

    Returns
    -------
    str
        ``current_uscs`` when it is already set; otherwise the matched codes
        joined with ',' in order of first appearance, or ``'unknown'``.

    Matching order: (1) every comma-separated phrase is looked up as a whole
    in ``classification_mapping``; a phrase with no match falls back to its
    individual words. (2) If nothing matched, each word is fuzzy-matched
    against the mapping's codes. (3) As a last resort, a code written inside
    parentheses in the raw text is used.
    """
    # If there is already a value in the USCS column, return it
    if isinstance(current_uscs, str) and current_uscs.lower() != 'unknown':
        return current_uscs

    if not isinstance(text, str):
        return 'unknown'

    # Look for "(CODE)" on the raw text: the cleaning below removes parentheses,
    # so searching afterwards (as the original did) could never find one.
    paren_match = re.search(r'\((.*?)\)', text)

    # Keep letters, digits, hyphens, spaces, commas and percent signs only
    cleaned = re.sub(r'[^a-zA-Z0-9\- ,%]', '', text.lower())

    # Commas separate keyword phrases. Splitting on whitespace alone left the
    # commas glued to the words (',sand'), which then never matched a synonym.
    phrases = [' '.join(part.split()) for part in cleaned.split(',')]
    phrases = [phrase for phrase in phrases if phrase]

    matched_soils = []

    # Exact lookup: whole phrase first so multi-word synonyms such as
    # 'silty clay' are reachable, then the phrase's individual words.
    for phrase in phrases:
        phrase_codes = [key.upper() for key, synonyms in classification_mapping.items()
                        if phrase in synonyms]
        if phrase_codes:
            matched_soils.extend(phrase_codes)
            continue
        for word in phrase.split():
            matched_soils.extend(key.upper() for key, synonyms in classification_mapping.items()
                                 if word in synonyms)

    if not matched_soils:
        # Approximate matching with fuzzywuzzy.
        # NOTE(review): candidates are the mapping's codes, not its synonyms -
        # confirm that comparing description words to codes is what is wanted.
        for phrase in phrases:
            for word in phrase.split():
                best_match = process.extractOne(word, classification_mapping.keys(),
                                                scorer=fuzz.token_sort_ratio)
                if best_match and best_match[1] > 75:  # Threshold can be adjusted
                    matched_soils.append(best_match[0].upper())

    if matched_soils:
        # dict.fromkeys removes duplicates while keeping first-seen order, so the
        # joined string is the same on every run (a set has no stable order).
        return ','.join(dict.fromkeys(matched_soils))

    # Last resort: a code given in parentheses in the raw text
    if paren_match and paren_match.group(1).strip():
        return paren_match.group(1).strip().upper()

    return 'unknown'

# Fill the 'USCS' column row by row: extract_uscs receives the row's keywords
# together with the USCS value the row currently holds.
df_svsim['USCS'] = [
    extract_uscs(keywords, current_code)
    for keywords, current_code in zip(df_svsim['KEYWORDS'], df_svsim['USCS'])
]

# Show the first rows of the updated DataFrame
df_svsim.head()

In [None]:
# Read the lookup workbook with hydraulic conductivity and coarse fraction per type.
# NOTE(review): this is an absolute path on one user's machine; the notebook will
# not run elsewhere until it is pointed at a shared or relative location.
file_path = r'C:\Users\betebari\Documents\C2VSim_Texture\OSWCR\USCS-averageKxy-CoarseFractions.xlsx'
excel_data = pd.read_excel(file_path)

# Normalise the lookup key: trim surrounding whitespace, then lower-case
excel_data['Sediment/Rock Type'] = excel_data['Sediment/Rock Type'].str.strip().str.lower()

# Lower-cased 'Sediment/Rock Type' -> 'Average Hydraulic Conductivity (ft/day)'
hydraulic_conductivity_mapping = {
    rock_type: conductivity
    for rock_type, conductivity in zip(
        excel_data['Sediment/Rock Type'],
        excel_data['Average Hydraulic Conductivity (ft/day)'],
    )
}

# Lower-cased 'Sediment/Rock Type' -> 'Average Coarse Fraction (%)'
coarse_fraction_mapping = {
    rock_type: coarse_fraction
    for rock_type, coarse_fraction in zip(
        excel_data['Sediment/Rock Type'],
        excel_data['Average Coarse Fraction (%)'],
    )
}

# Lower-case the 'USCS' column so it compares against the lower-cased lookup keys
df_svsim['USCS'] = df_svsim['USCS'].str.lower()

# Function to handle the comma (list), slash (50/50) and dash (sequential rule) logic
def aggregate_uscs_values(uscs_value, mapping, agg_func='average'):
    """Look up a numeric property for a (possibly compound) USCS string.

    The string is split on commas first. Each comma-separated component is then
    resolved on its own:

    * ``a/b``  - average of the parts found in ``mapping`` (50/50 mixture);
    * ``a-b``  - value of the first part found in ``mapping``;
    * ``a``    - value of ``a`` if it is in ``mapping``.

    The component values are combined with ``agg_func``.

    Parameters
    ----------
    uscs_value : object
        USCS string such as ``'sp'``, ``'sp/gp'``, ``'sp-gp'`` or ``'ch,sp-gp'``.
        Matching is case-insensitive against ``mapping`` keys, which are
        expected in lower case. Non-string values return ``None``.
    mapping : dict
        Code -> numeric value.
    agg_func : {'average', 'max'}
        How to combine the values of several components.

    Returns
    -------
    number or None
        The aggregated value; ``12`` when nothing matched but one of the codes
        is gc, sc, gm or sm; otherwise ``None`` (also for an unknown ``agg_func``).
    """
    if not isinstance(uscs_value, str):
        return None

    secondary_codes = {'gc', 'sc', 'gm', 'sm'}
    seen_codes = []  # every individual code encountered, for the fallback rule
    values = []      # one resolved value per comma-separated component

    for component in uscs_value.split(','):
        component = component.strip().lower()
        if '/' in component:
            # For slash, treat as 50/50: average whatever parts are known
            parts = [part.strip() for part in component.split('/')]
            hits = [mapping[part] for part in parts if part in mapping]
            if hits:
                values.append(sum(hits) / len(hits))
        elif '-' in component:
            # For dash, follow the first part that matches
            parts = [part.strip() for part in component.split('-')]
            for part in parts:
                if part in mapping:
                    values.append(mapping[part])
                    break
        else:
            parts = [component]
            if component in mapping:
                values.append(mapping[component])
        seen_codes.extend(parts)

    if not values:
        # Apply 12 when a secondary classification is present and nothing matched.
        # (The original raised UnboundLocalError here for unmatched dashed codes.)
        # NOTE(review): the 12 is described as a coarse-fraction percentage but is
        # returned for whatever mapping is passed in - confirm for other properties.
        return 12 if secondary_codes.intersection(seen_codes) else None

    if agg_func == 'average':
        return sum(values) / len(values)
    if agg_func == 'max':
        return max(values)
    return None

# Look up hydraulic conductivity and coarse fraction for every 'USCS' value
df_svsim['HydraulicConductivity'] = df_svsim['USCS'].map(
    lambda code: aggregate_uscs_values(code, hydraulic_conductivity_mapping, agg_func='average')
)
df_svsim['AverageCoarseFraction'] = df_svsim['USCS'].map(
    lambda code: aggregate_uscs_values(code, coarse_fraction_mapping, agg_func='average')
)

# Report the distinct 'USCS' values that received no hydraulic conductivity
unmatched_values = df_svsim.loc[df_svsim['HydraulicConductivity'].isna(), 'USCS'].unique()
print("Unmatched 'USCS' values:", unmatched_values)

# Remove the 'Unnamed: 0' column when the input carried one
if 'Unnamed: 0' in df_svsim.columns:
    df_svsim = df_svsim.drop(columns=['Unnamed: 0'])

# Store 'USCS' in upper case again
df_svsim['USCS'] = df_svsim['USCS'].str.upper()

df_svsim.rename(columns={'SVSIM_NAME': 'WCRNUMBER'}, inplace=True)

# Count the distinct 'WCRNUMBER' values
unique_wcrnumber_count = df_svsim['WCRNUMBER'].nunique()
print(f"Number of unique WCRNUMBER values: {unique_wcrnumber_count}")

# Show the first rows of the resulting DataFrame
df_svsim.head()

In [None]:
# Write the processed table to CSV (the DataFrame index is not written)
output_file = '11-updated_SVSim.csv'
df_svsim.to_csv(path_or_buf=output_file, index=False)

# Confirm where the result was written
print(f"Final updated CSV file saved as '{output_file}'")