In [1]:
import pandas as pd
import re

# import csv files as Pandas dataframes

In [2]:
# geologiclog_USCS ; released by Ben Brezing on OpenData/OSWCR 09/26/2024
df = pd.read_csv("geologiclog_uscs.csv", encoding='utf-8-sig')

df.head()

Unnamed: 0,WCRNUMBER,INTERVALSTART,INTERVALEND,SOILCLASSIFICATION,SOILCOLOR,SOILDESCRIPTION
0,WCR2019-015573,7.0,10.0,SW Well-graded SAND,,
1,WCR2019-015573,10.0,13.0,SW Well-graded SAND,,
2,WCR2019-015573,13.0,25.0,SC Clayey SAND,,
3,WCR2019-015573,25.0,26.0,ML Inorganic SILT with low plasticity,,
4,WCR2019-015573,26.0,28.0,SC Clayey SAND,,


In [3]:
# Function to extract text within parentheses
def extract_uscs(text):
    if isinstance(text, str):
        match = re.search(r'\((.*?)\)', text)
        if match:
            return match.group(1)
    return 'unknown'

# Apply the function to the 'SOILCLASSIFICATION' column and create a new 'USCS' column
df['USCS'] = df['SOILCLASSIFICATION'].apply(extract_uscs)

In [4]:
# Display the DataFrame to verify the new column
df.head()

Unnamed: 0,WCRNUMBER,INTERVALSTART,INTERVALEND,SOILCLASSIFICATION,SOILCOLOR,SOILDESCRIPTION,USCS
0,WCR2019-015573,7.0,10.0,SW Well-graded SAND,,,unknown
1,WCR2019-015573,10.0,13.0,SW Well-graded SAND,,,unknown
2,WCR2019-015573,13.0,25.0,SC Clayey SAND,,,unknown
3,WCR2019-015573,25.0,26.0,ML Inorganic SILT with low plasticity,,,unknown
4,WCR2019-015573,26.0,28.0,SC Clayey SAND,,,unknown


In [5]:
# Set of known USCS classifications
known_uscs = {
    'ML', 'CH', 'CL', 'MH', 'SC', 'SM', 'SP', 'SW', 'GW', 'GP', 'GM', 'GC', 'OH',
    'SM-SC', 'SM-ML', 'SM-GP', 'SP-GP', 'SW-GW', 'SC-GC', 'SC-ML', 'SP/GP/CL', 'CL/SP',
    'GW-GM', 'OH/CH', 'CL/CH', 'Cl', 'SW-SM', 'SW-SC', 'SP-SM', 'SP-SC', 'OL', 'GW-GC', 'GP-SP',
    'GP-GM', 'GP-GC', 'GP-CG', 'CL-ML', 'CH+GP',
    # New USCS classifications to be added
    'ASH', 'SM-SW', 'ML/GW', 'SP/GP', 'CH/ML', 'CL/ML', 'SC', 'CL/SC', 'SM/SC', 'GP/SP/CL',
    'GP/GC', 'SP/SC', 'SP/CH', 'GP/SP', 'CH/SP', 'GP/CL'
}

In [6]:
# Dictionary to map specific soil classifications to USCS or rock categories
classification_mapping = {
    'Rock - Sedimentary': 'ROCK',
    'Rock - Igneous': 'IGNS',
    'Rock - Metamorphic': 'META',
    'Siltstone': 'STST',
    'Sandstone': 'SDST',
    
    'Bedrock': 'ROCK',
    'SILTSTONE/MUDSTONE': 'STST,CLSN',
    'Topsoil': 'OH',
    'FILL': 'unknown',
    'Claystone/hardpan': 'CLSN,ML',
    'Siltstone-Claystone': 'STST,CLSN',
    'Silty-Sandstone': 'SDST,STST',
    'Ishi': 'VOLC',
    'Ash': 'ASH',
    'Tuff': 'TUFF',
    'sltst': 'STST',
    'clyst': 'CLSN',
    'Mudstone': 'CLSN',
    'sltst and clyst': 'STST,CLSN',
    'CLAYSTONE/SAND': 'CLSN,SC',
    'CLAYSTONE': 'CLSN',
    'TUFF or SILTSTONE/CLAYSTONE': 'TUFF,STST,CLSN',
    'SILTSTONE/SANDSTONE': 'STST,SDST',
    'SIlTSTONE/CLAYSTONE': 'STST,CLSN',
    'Sand/Sandstone': 'SDST,SW',
    'Volcanic': 'volcanic rock',
    'PT PEAT soils with high organic contents': 'PT',
    'Claystone/hardpan,': 'CLSN,ML',
    'Siltstone-Claystone,': 'STST,CLSN',
    'Silty-Sandstone,': 'SDST, ML',
    'Basalt': 'BSLT',
    'Sltst': 'STST',
    'SILTSTONE':'STST',
    'SANDSTONE':'SDST',
    'Sandstone/Siltstone':'SDST,STST',
    'SANDSTONE/SC':'SDST,SC',
    'MUDSTONE':'CLSN',
    'BASALT':'BSLT',
    'CONGLOMERATE':'CONG',
    'GP/CH':'GP,CH',
    'Claystone':'CLSN',
    'CLAYSTONE/MUDSTONE':'CLSN',
    'TUFF':'TUFF',
    'Top soil':'TPSL',
    'SILTSTONE/SANDSTONE,':'STST,SDST',
    'SilTSTONE/CLAYSTONE':'STST,CLSN',
    'GM/SP':'GM,SP',
    'Tuff/ASH':'TUFF, ASH',

        # Including the ones with BOM characters manually removed
    'ï»¿GC': 'GC',
    'ï»¿SP': 'SP',
    'ï»¿CL': 'CL',

}

In [7]:
# Function to extract USCS classification
def extract_uscs(text):
    if isinstance(text, str):
        text = text.strip()  # Remove leading/trailing whitespace
        # Remove BOM characters if present
        text = text.encode('utf-8').decode('utf-8-sig')
        # Check for direct USCS classification
        for uscs in known_uscs:
            if uscs in text.split():
                return uscs
        # Check for known classifications in the mapping
        if text in classification_mapping:
            return classification_mapping[text]
        # If no direct classification, look for text within parentheses
        match = re.search(r'\((.*?)\)', text)
        if match:
            return match.group(1)
    return 'unknown'

# Apply the function to the 'SOILCLASSIFICATION' column and create a new 'USCS' column
df['USCS'] = df['SOILCLASSIFICATION'].apply(extract_uscs)

# Display the DataFrame to verify the new column
df.head()

Unnamed: 0,WCRNUMBER,INTERVALSTART,INTERVALEND,SOILCLASSIFICATION,SOILCOLOR,SOILDESCRIPTION,USCS
0,WCR2019-015573,7.0,10.0,SW Well-graded SAND,,,SW
1,WCR2019-015573,10.0,13.0,SW Well-graded SAND,,,SW
2,WCR2019-015573,13.0,25.0,SC Clayey SAND,,,SC
3,WCR2019-015573,25.0,26.0,ML Inorganic SILT with low plasticity,,,ML
4,WCR2019-015573,26.0,28.0,SC Clayey SAND,,,SC


In [8]:
# Load the provided Excel file
file_path = r'C:\Users\betebari\Documents\C2VSim_Texture\OSWCR\USCS-averageKxy-CoarseFractions.xlsx'
excel_data = pd.read_excel(file_path)

# Strip any leading/trailing spaces from 'Sediment/Rock Type' column in the Excel data
excel_data['Sediment/Rock Type'] = excel_data['Sediment/Rock Type'].str.strip()

# Convert 'Sediment/Rock Type' to lowercase for case-insensitive matching
excel_data['Sediment/Rock Type'] = excel_data['Sediment/Rock Type'].str.lower()

# Create a dictionary mapping Soil Classification to Average Hydraulic Conductivity (case-insensitive)
hydraulic_conductivity_mapping = dict(zip(excel_data['Sediment/Rock Type'], excel_data['Average Hydraulic Conductivity (ft/day)']))

# Create a dictionary mapping Soil Classification to Average Coarse Fraction (case-insensitive)
coarse_fraction_mapping = dict(zip(excel_data['Sediment/Rock Type'], excel_data['Average Coarse Fraction (%)']))

# Strip any leading/trailing spaces from 'USCS' column in the merged DataFrame
df['USCS'] = df['USCS'].str.strip()

# Remove parentheses from USCS values
df['USCS'] = df['USCS'].str.replace(r'[\(\)]', '', regex=True)

# Convert 'USCS' column to lowercase for case-insensitive matching
df['USCS'] = df['USCS'].str.lower()

# Function to calculate the average or maximum for multiple USCS classifications, with special handling for secondary classifications
def aggregate_uscs_values(uscs_value, mapping, agg_func='average'):
    # Convert the list of USCS values to lowercase for case-insensitive matching
    uscs_list = [item.strip().lower() for item in uscs_value.split(',')]
    
    values = [mapping.get(uscs) for uscs in uscs_list if uscs in mapping]
    
    # Apply 12% coarse fraction if secondary USCS classification is present and the 'coarse_fraction_mapping' is missing
    if 'gc' in uscs_list or 'sc' in uscs_list or 'gm' in uscs_list or 'sm' in uscs_list:
        if agg_func == 'average':
            # Assign 12% when coarse fraction data is not found for the given classification
            return sum(values) / len(values) if values else 12
        elif agg_func == 'max':
            return max(values) if values else 12
    
    if values:
        if agg_func == 'average':
            return sum(values) / len(values)
        elif agg_func == 'max':
            return max(values)
    return None

# Apply the aggregation function for Hydraulic Conductivity and Coarse Fraction (case-insensitive)
df['HydraulicConductivity'] = df['USCS'].apply(lambda x: aggregate_uscs_values(x, hydraulic_conductivity_mapping, agg_func='average'))
df['AverageCoarseFraction'] = df['USCS'].apply(lambda x: aggregate_uscs_values(x, coarse_fraction_mapping, agg_func='average'))
df['Avg Specific Yield (%)'] = df['USCS'].apply(lambda x: aggregate_uscs_values(x, coarse_fraction_mapping, agg_func='average'))
df['Avg Ss (1/L)'] = df['USCS'].apply(lambda x: aggregate_uscs_values(x, coarse_fraction_mapping, agg_func='average'))
df['Avg Kv (ft/day)'] = df['USCS'].apply(lambda x: aggregate_uscs_values(x, coarse_fraction_mapping, agg_func='average'))

# Identify and display any unmatched values
unmatched_values = df[df['HydraulicConductivity'].isna()]['USCS'].unique()
print("Unmatched 'USCS' values:", unmatched_values)

# Clean data
if 'Unnamed: 0' in df.columns:
    df = df.drop(['Unnamed: 0'], axis=1)

# Display the first few rows of the merged dataframe
df.head()

Unmatched 'USCS' values: ['volcanic rock' 'unknown' 'ml/gw' 'sm-sw' 'ch/ml' 'sm/sc' 'sp/ch' 'ch/sp']


Unnamed: 0,WCRNUMBER,INTERVALSTART,INTERVALEND,SOILCLASSIFICATION,SOILCOLOR,SOILDESCRIPTION,USCS,HydraulicConductivity,AverageCoarseFraction,Avg Specific Yield (%),Avg Ss (1/L),Avg Kv (ft/day)
0,WCR2019-015573,7.0,10.0,SW Well-graded SAND,,,sw,4.51,60.0,60.0,60.0,60.0
1,WCR2019-015573,10.0,13.0,SW Well-graded SAND,,,sw,4.51,60.0,60.0,60.0,60.0
2,WCR2019-015573,13.0,25.0,SC Clayey SAND,,,sc,1.6564,40.0,40.0,40.0,40.0
3,WCR2019-015573,25.0,26.0,ML Inorganic SILT with low plasticity,,,ml,0.164016,15.0,15.0,15.0,15.0
4,WCR2019-015573,26.0,28.0,SC Clayey SAND,,,sc,1.6564,40.0,40.0,40.0,40.0


In [9]:
df1 = pd.read_csv("UPDATED_wellcompletionreports.csv")
# Merge df1 and df2 on 'WCRNUMBER'
merged_df = pd.merge(df1, df, on='WCRNUMBER', how='inner')

# Drop rows where 'DESCRIPTION' is empty (NaN or empty string)
merged_df = merged_df[merged_df['INTERVALSTART'].notna() & (merged_df['INTERVALSTART'] != '')]

# Convert the 'USCS' column to uppercase
merged_df['USCS'] = merged_df['USCS'].str.upper()

# Display the DataFrame
merged_df.head()

Unnamed: 0.1,Unnamed: 0,WCRNUMBER,DECIMALLAT,DECIMALLON,UTMX_y,UTMY_y,INTERVALSTART,INTERVALEND,SOILCLASSIFICATION,SOILCOLOR,SOILDESCRIPTION,USCS,HydraulicConductivity,AverageCoarseFraction,Avg Specific Yield (%),Avg Ss (1/L),Avg Kv (ft/day)
0,1871,WCR2018-004813,35.2014,-118.923,871225.039877,3902999.0,0.0,7.0,SM Silty SAND,med to dk yellow brown,"Silty Sand - vfn gr sand, damp",SM,9.1102,45.0,45.0,45.0,45.0
1,1871,WCR2018-004813,35.2014,-118.923,871225.039877,3902999.0,7.0,30.0,SW Well-graded SAND,lt-med yellow brown,"Sand - fn to v crs gr, trace gravel to 1/8""",SW,4.51,60.0,60.0,60.0,60.0
2,1871,WCR2018-004813,35.2014,-118.923,871225.039877,3902999.0,30.0,37.0,CL Lean inorganic CLAY with low plasticity,olive brown,"Clay - moist, tr sand, low plast",CL,0.016402,5.0,5.0,5.0,5.0
3,1871,WCR2018-004813,35.2014,-118.923,871225.039877,3902999.0,37.0,69.0,SW Well-graded SAND,lt - med yellow brown,"Sand - lt-med yellow brown, fn - v crs gr, som...",SW,4.51,60.0,60.0,60.0,60.0
4,1871,WCR2018-004813,35.2014,-118.923,871225.039877,3902999.0,69.0,71.0,GW-GM Well-graded GRAVEL with silt,lt-med brown,Gravelly interval,GW-GM,4.9692,62.5,62.5,62.5,62.5


In [10]:
# Save the updated DataFrame to a new CSV file
output_file = '3-updated_geologiclog_USCS.csv'
merged_df.to_csv(output_file, index=False)

print(f"Updated CSV file saved as '{output_file}'")

Updated CSV file saved as '3-updated_geologiclog_USCS.csv'
