In [9]:
import pandas as pd
import re

# import csv files as Pandas dataframes

In [10]:
# Source table: geologiclog_generalizedlithology, released by Ben Brezing on
# OpenData/OSWCR 09/26/2024.
# The 'utf-8-sig' codec decodes UTF-8 and drops a leading byte-order mark if
# the file has one.
df = pd.read_csv(
    filepath_or_buffer="geologiclog_generalizedlithology.csv",
    encoding="utf-8-sig",
)

# Preview the first rows of the raw table
df.head()

Unnamed: 0,WCRNUMBER,INTERVALSTART,INTERVALEND,TEXTURE,TEXTUREQUALIFIER,TEXTUREMODIFIER1,TEXTUREMODIFIER2,COLOR1,COLOR2,COLORQUALIFIER,CLASSIFICATION
0,WCR2002-009795,280,310,Siltstone,,Muddy,Sandy,,,,Fine
1,WCR2002-009795,310,320,Sand,,Clayey,,,,,Coarse
2,WCR2002-009795,320,350,Sand,,Gravelly,,,,,Coarse
3,WCR2002-009795,350,360,Siltstone,,Muddy,Sandy,,,,Fine
4,WCR2002-009795,360,400,Gravel,,,,Grey,,Light,Coarse


In [11]:
# Function to combine the primary texture modifier with the texture
def concat_if_not_equal(row):
    """Build a single texture label from one row of the lithology table.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'TEXTUREMODIFIER1' and 'TEXTURE'
        (for example the row Series that ``df.apply(..., axis=1)`` passes in).

    Returns
    -------
    str
        * '<modifier> <texture>' when both are present and differ;
        * the texture alone when the modifier is missing or equal to it;
        * the modifier alone when the texture is missing (no trailing space);
        * '' when both are missing.
    """
    # Missing values (NaN/None) become '' so they drop out of the label
    texture_modifier = str(row['TEXTUREMODIFIER1']) if pd.notna(row['TEXTUREMODIFIER1']) else ''
    texture = str(row['TEXTURE']) if pd.notna(row['TEXTURE']) else ''

    # A repeated word ("Sand Sand") carries no information: keep it once
    if texture_modifier == texture:
        return texture

    # Join only the parts that exist, so a missing texture does not leave a
    # dangling separator behind the modifier
    return ' '.join(part for part in (texture_modifier, texture) if part != '')

# Create TEXTURE_MODIFIED from TEXTUREMODIFIER1 + TEXTURE.
# The guard makes this cell safe to re-run: once the source columns have been
# dropped, calling df.apply(concat_if_not_equal, ...) again would raise a
# KeyError. On a fresh DataFrame the column does not exist yet, so the work
# runs (and still fails loudly if the source columns are absent).
# To rebuild the column, reload df first.
if 'TEXTURE_MODIFIED' not in df.columns:
    df['TEXTURE_MODIFIED'] = df.apply(concat_if_not_equal, axis=1)

    # The individual columns are folded into TEXTURE_MODIFIED and not needed anymore
    df = df.drop(columns=['TEXTUREMODIFIER1', 'TEXTURE'])

# Display the first few rows to verify
df.head()

Unnamed: 0,WCRNUMBER,INTERVALSTART,INTERVALEND,TEXTUREQUALIFIER,TEXTUREMODIFIER2,COLOR1,COLOR2,COLORQUALIFIER,CLASSIFICATION,TEXTURE_MODIFIED
0,WCR2002-009795,280,310,,Sandy,,,,Fine,Muddy Siltstone
1,WCR2002-009795,310,320,,,,,,Coarse,Clayey Sand
2,WCR2002-009795,320,350,,,,,,Coarse,Gravelly Sand
3,WCR2002-009795,350,360,,Sandy,,,,Fine,Muddy Siltstone
4,WCR2002-009795,360,400,,,Grey,,Light,Coarse,Gravel


In [12]:
# Dictionary to map specific soil classifications to USCS or rock categories.
#
# Keys are lower-case texture descriptions ("<modifier> <texture>" or a bare
# texture). Values are code strings; some hold several codes separated by ','.
#
# Every key is listed exactly once. Python keeps only the LAST value when a
# key is repeated inside a dict literal, so the repeated entries that used to
# be here were silently discarded. The values below are the ones that were
# actually in effect:
#   * 'gravelly sand' was listed as both 'SW' and 'SP-GW'  -> 'SP-GW' applied
#   * 'sandy gravel'  was listed as both 'GW' and 'GP-SP'  -> 'GP-SP' applied
#   * 'clay', 'sandy clay', 'sand', 'clayey shale' were exact repeats
# NOTE(review): confirm that 'SP-GW' / 'GP-SP' are the intended codes.
classification_mapping = {
    'gravelly sand': 'SP-GW',
    'top soil': 'OH',
    'shale': 'SHLE',
    'clayey gravel': 'GC',
    'gravelly shale': 'CL',
    'clay': 'CH',
    'sandy clay': 'CL',
    'sand': 'SW',
    'silty clay': 'CL',
    'bouldery gravel': 'GP',
    'gravel': 'GP',
    'clayey sand': 'SC',
    'clayey top soil': 'OH-CH',
    'sandstone': 'SDST',
    'sandy silt': 'ML',
    'rock': 'ROCK',
    'hard pan': 'ML',
    'muddy sand': 'SC',
    'sandy top soil': 'OH-SP',
    'sandy gravel': 'GP-SP',
    'gravelly cobbles': 'GP',
    'gravelly sandstone': 'SDST,GP',
    'rocky clay': 'GC',
    'pebbley gravel': 'GW',
    'pebbley sand': 'SW-GW',
    'silt': 'ML',
    # NOTE(review): this key contains two spaces. normalize_and_map collapses
    # runs of whitespace before the lookup, so it is not reachable through
    # that function; kept so the mapping's contents stay unchanged.
    'gravelly  sand': 'SP-GW',
    'rocky sand': 'SP-GP',
    'sandy hard pan': 'ML',
    'pebbley clay': 'GC',
    'shaley silt': 'ML',
    'sandy rock': 'ROCK, SP',
    'sticky clay': 'CH',
    'rocky gravel': 'GW',
    'shaley sand': 'SC',
    'fill': 'unknown',

    'clayey shale': 'SHLE,CL',
    'sandy shale': 'SHLE,SC',
    'pebbles': 'PEBL',
    'silty sand': 'SM',
    'silty top soil': 'OH',
    'wood': 'PT',
    'bouldery clay': 'GC',
    'conglomerate': 'CONG',
    'bouldery sand': 'SP-GP',
    'boulders': 'GW',
    'gravelly mud': 'GC',
    'bouldery rock': 'CONG',
    'shells': 'GP-SP',

    'shelly sand': 'SW-SC',
    'shaley clay': 'CL',
    'cobbles': 'GW',
    'other': 'unknown',

    'bouldery hard pan': 'ML-GP',
    'gravelly boulders': 'GP',
    'cobbley sand': 'SP-GP',
    'shaley gravel': 'GC',
    'sandy siltstone': 'STST, SP',
    'siltstone': 'STST',
    'gravelly siltstone': 'STST,GM',
    'clayey siltstone': 'STST,CL',
    'clayey silt': 'ML',
    'claystone/sand': 'CLSN,SC',
    'silty claystone': 'CLSN,ML',
    'sandy claystone': 'CLSN,SC',
    'silty sandstone': 'SDST,ML',
    'silty ash': 'ASH,ML',

    'cobbley clay': 'GC',
    'clayey rock': 'ROCK,CL',
    'silty gravel': 'GM',
    'gravelly rock': 'CONG',
    'gravelly silt': 'ML',
    'gravelly clay': 'GC',
    'sandy ash': 'SP',

    'coarse sand': 'SP',
    'pebbley cobbles': 'GP',
    'lava': 'LAVA',
    'soil': 'OH',
    'mud': 'CH',
    'peat': 'PT',
    'shaley rock': 'SHLE',
    'sandstone clay': 'SDST,CLSN',
    'rocky sandstone': 'SDST,CONG',
    'silty cobbles': 'GM',
    'ash': 'ASH',
    'sandy sand': 'SW',
    'rocky conglomerate': 'CONG',
    'gravelly gravel': 'GW',
    'sandy conglomerate': 'CONG,SDST',
    'gravelly conglomerate': 'CONG',
    'shaley sandstone': 'SDST,SHLE',
    'rocky shale': 'SHLE,ROCK',
    'cobbley gravel': 'GW',
    'muddy siltstone': 'STST,CLSN',
    'clayey sandstone': 'SDST,SC',
    'clayey ash': 'ASH,CL',
    # 'sanstone' (sic) in the next two keys is kept as written
    'clayey sanstone': 'SDST,CLSN',
    'gravelly sanstone': 'SDST,CONG',
    'sandy boulders': 'GP-SP',
    'claystone': 'CLSN',
    'clayey clay': 'CH',
    # NOTE(review): two spaces in this key as well (see 'gravelly  sand')
    'silty  sand': 'SM',
    'gravelly claystone': 'CLSN,GC',
}

In [13]:
# Normalize a 'TEXTURE_MODIFIED' value and translate it to its USCS code
def normalize_and_map(texture):
    """Return the code that ``classification_mapping`` holds for a texture label.

    The label is lower-cased and its whitespace is collapsed (leading and
    trailing whitespace removed, inner runs reduced to one space) before the
    lookup.

    Returns the string 'unknown' when the input is not a string, when it is
    blank, or when the normalized label has no entry in the mapping.
    """
    # Non-string cells (e.g. NaN) cannot be classified
    if not isinstance(texture, str):
        return 'unknown'

    # split() with no argument discards all surrounding/repeated whitespace
    tokens = texture.lower().split()
    if not tokens:
        return 'unknown'

    return classification_mapping.get(' '.join(tokens), 'unknown')

# Derive the 'USCS' column from 'TEXTURE_MODIFIED'; when that column is absent,
# report it instead of raising
if 'TEXTURE_MODIFIED' not in df.columns:
    print("Column 'TEXTURE_MODIFIED' is missing from the DataFrame.")
else:
    df['USCS'] = df['TEXTURE_MODIFIED'].apply(normalize_and_map)

# Preview the result
df.head()

Unnamed: 0,WCRNUMBER,INTERVALSTART,INTERVALEND,TEXTUREQUALIFIER,TEXTUREMODIFIER2,COLOR1,COLOR2,COLORQUALIFIER,CLASSIFICATION,TEXTURE_MODIFIED,USCS
0,WCR2002-009795,280,310,,Sandy,,,,Fine,Muddy Siltstone,"STST,CLSN"
1,WCR2002-009795,310,320,,,,,,Coarse,Clayey Sand,SC
2,WCR2002-009795,320,350,,,,,,Coarse,Gravelly Sand,SP-GW
3,WCR2002-009795,350,360,,Sandy,,,,Fine,Muddy Siltstone,"STST,CLSN"
4,WCR2002-009795,360,400,,,Grey,,Light,Coarse,Gravel,GP


In [14]:
# Read the lookup workbook.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists.
file_path = r'C:\Users\betebari\Documents\C2VSim_Texture\OSWCR\USCS-averageKxy-CoarseFractions.xlsx'
excel_data = pd.read_excel(file_path)

# Normalize the lookup key column: trim surrounding whitespace, then
# lower-case, so matching against 'USCS' is case-insensitive
excel_data['Sediment/Rock Type'] = (
    excel_data['Sediment/Rock Type']
    .str.strip()
    .str.lower()
)

# Two lookups keyed by the normalized 'Sediment/Rock Type':
#   -> 'Average Hydraulic Conductivity (ft/day)'
#   -> 'Average Coarse Fraction (%)'
hydraulic_conductivity_mapping = dict(
    zip(
        excel_data['Sediment/Rock Type'],
        excel_data['Average Hydraulic Conductivity (ft/day)'],
    )
)
coarse_fraction_mapping = dict(
    zip(
        excel_data['Sediment/Rock Type'],
        excel_data['Average Coarse Fraction (%)'],
    )
)

# Bring 'USCS' into the same normalized form as the lookup keys:
# trim, remove any parentheses, lower-case
df['USCS'] = (
    df['USCS']
    .str.strip()
    .str.replace(r'[\(\)]', '', regex=True)
    .str.lower()
)

# Function to aggregate (average or maximum) the mapped values of one or more USCS codes
def aggregate_uscs_values(uscs_value, mapping, agg_func='average', secondary_default=12):
    """Aggregate the ``mapping`` values of the codes listed in ``uscs_value``.

    Parameters
    ----------
    uscs_value : str
        One code, or several codes separated by ','. Each code is stripped and
        lower-cased before the lookup, so ``mapping`` keys are expected to be
        lower-case. Hyphenated codes (e.g. 'sp-gw') are looked up as a whole.
    mapping : dict
        Code -> numeric value.
    agg_func : {'average', 'max'}
        How to combine the values of the codes that were found in ``mapping``.
    secondary_default : number or None, default 12
        Returned when none of the codes is in ``mapping`` but the list
        contains 'gc', 'sc', 'gm' or 'sm'. The default 12 is described in the
        original notebook as a 12% coarse fraction.
        NOTE(review): the same default is applied whatever ``mapping``
        is passed (e.g. a hydraulic-conductivity lookup); pass
        ``secondary_default=None`` where 12 is not meaningful.

    Returns
    -------
    number or None
        The aggregated value; ``secondary_default`` in the fallback case
        described above; ``None`` when nothing matches, when ``uscs_value`` is
        not a string (e.g. NaN), or when ``agg_func`` is not one of the two
        supported names.
    """
    # Non-string cells (NaN/None) have nothing to split or look up
    if not isinstance(uscs_value, str):
        return None

    # Unsupported aggregation names yield None rather than raising
    if agg_func not in ('average', 'max'):
        return None

    secondary_codes = ('gc', 'sc', 'gm', 'sm')

    uscs_list = [item.strip().lower() for item in uscs_value.split(',')]
    # Codes absent from the mapping are skipped, not treated as zero
    values = [mapping[uscs] for uscs in uscs_list if uscs in mapping]

    if values:
        if agg_func == 'max':
            return max(values)
        return sum(values) / len(values)

    # Nothing matched: fall back only for the listed secondary classifications
    if any(code in secondary_codes for code in uscs_list):
        return secondary_default
    return None

# Look up hydraulic conductivity and coarse fraction for every row's 'USCS'
# value, averaging when a row lists several codes
df['HydraulicConductivity'] = df['USCS'].apply(
    lambda code: aggregate_uscs_values(code, hydraulic_conductivity_mapping, agg_func='average')
)
df['AverageCoarseFraction'] = df['USCS'].apply(
    lambda code: aggregate_uscs_values(code, coarse_fraction_mapping, agg_func='average')
)

# Report the distinct 'USCS' values that produced no hydraulic conductivity
unmatched_values = df.loc[df['HydraulicConductivity'].isna(), 'USCS'].unique()
print("Unmatched 'USCS' values:", unmatched_values)

# Remove the 'Unnamed: 0' column when it is present
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# Preview the enriched table
df.head()

Unmatched 'USCS' values: ['unknown']


Unnamed: 0,WCRNUMBER,INTERVALSTART,INTERVALEND,TEXTUREQUALIFIER,TEXTUREMODIFIER2,COLOR1,COLOR2,COLORQUALIFIER,CLASSIFICATION,TEXTURE_MODIFIED,USCS,HydraulicConductivity,AverageCoarseFraction
0,WCR2002-009795,280,310,,Sandy,,,,Fine,Muddy Siltstone,"stst,clsn",0.0275,12.5
1,WCR2002-009795,310,320,,,,,,Coarse,Clayey Sand,sc,1.6564,40.0
2,WCR2002-009795,320,350,,,,,,Coarse,Gravelly Sand,sp-gw,225.0,57.5
3,WCR2002-009795,350,360,,Sandy,,,,Fine,Muddy Siltstone,"stst,clsn",0.0275,12.5
4,WCR2002-009795,360,400,,,Grey,,Light,Coarse,Gravel,gp,16.564,90.0


In [15]:
# Read the well completion reports table (per the file name)
df1 = pd.read_csv("UPDATED_wellcompletionreports.csv")

# Inner-join the reports (df1) with the lithology intervals (df) on
# 'WCRNUMBER'; rows without a match on both sides are dropped
merged_df = pd.merge(df1, df, on='WCRNUMBER', how='inner')

# Keep only rows whose 'INTERVALSTART' is present (not NaN and not an empty
# string). .copy() makes the filtered result an independent DataFrame, so the
# column assignment below acts on merged_df itself and does not trigger
# pandas' SettingWithCopyWarning.
merged_df = merged_df[
    merged_df['INTERVALSTART'].notna() & (merged_df['INTERVALSTART'] != '')
].copy()

# Convert the 'USCS' column to uppercase
merged_df['USCS'] = merged_df['USCS'].str.upper()

# Display the DataFrame
merged_df.head()

Unnamed: 0.1,Unnamed: 0,WCRNUMBER,DECIMALLAT,DECIMALLON,UTMX_y,UTMY_y,INTERVALSTART,INTERVALEND,TEXTUREQUALIFIER,TEXTUREMODIFIER2,COLOR1,COLOR2,COLORQUALIFIER,CLASSIFICATION,TEXTURE_MODIFIED,USCS,HydraulicConductivity,AverageCoarseFraction
0,2919,WCR0163017,35.2453,-119.119,853178.496514,3907155.0,0,6,,,,,,Fine,Sandy,UNKNOWN,,
1,2919,WCR0163017,35.2453,-119.119,853178.496514,3907155.0,6,35,,,Yellow,,,Fine,Clay,CH,0.016402,2.5
2,2919,WCR0163017,35.2453,-119.119,853178.496514,3907155.0,35,45,Coarse,,,,,Coarse,Sand,SW,4.51,60.0
3,2919,WCR0163017,35.2453,-119.119,853178.496514,3907155.0,45,60,,,Yellow,,,Fine,Clay,CH,0.016402,2.5
4,2919,WCR0163017,35.2453,-119.119,853178.496514,3907155.0,60,75,Coarse,,,,,Coarse,Sand,SW,4.51,60.0


In [16]:
# Write the merged table to a new CSV file, omitting the index column,
# then print a confirmation
output_file = '4-updated_geologiclog_generalizedlithology.csv'
merged_df.to_csv(path_or_buf=output_file, index=False)

print(f"Updated CSV file saved as '{output_file}'")

Updated CSV file saved as '4-updated_geologiclog_generalizedlithology.csv'
