In [1]:
import pandas as pd
import re

# import csv files as Pandas dataframes

In [2]:
# Load the geologic-log CSV into a DataFrame (original note: "geologiclog_freeform by 2023-04-28").
# NOTE(review): that note names "geologiclog_freeform", but the file read below is
# geologiclog_uscs.csv — confirm which export is intended.
# 'utf-8-sig' decodes UTF-8 and drops a byte-order mark at the start of the file.
df = pd.read_csv("geologiclog_uscs.csv", encoding='utf-8-sig')

# Print the frame's plain-text representation as a quick sanity check of the load.
print(df)

            WCRNUMBER  INTERVALSTART  INTERVALEND  \
0      WCR2022-013212          159.0        160.0   
1      WCR2022-013212          160.0        165.0   
2      WCR2022-013212          165.0        173.0   
3      WCR2022-013212          173.0        175.0   
4      WCR2022-013212          175.0        180.0   
...               ...            ...          ...   
19340  WCR2019-003540           15.5         21.5   
19341  WCR2019-003540           21.5         30.0   
19342  WCR2019-003540           30.0         33.5   
19343  WCR2019-003540           33.5         37.5   
19344  WCR2019-003540           37.5         41.5   

                          SOILCLASSIFICATION                       SOILCOLOR  \
0                            (ML) Sandy silt  light brownish gray (10YR 6/2)   
1                              (CH) Fat clay               brown (7.5YR 4/4)   
2                    (CH) Fat clay with sand                brown (10YR 4/3)   
3                            (SM) Silty san

In [3]:
# Function to extract text within parentheses
def extract_uscs(text):
    """Return the text enclosed by the first pair of parentheses in ``text``.

    Parameters
    ----------
    text : object
        A cell value; only ``str`` values are inspected.

    Returns
    -------
    str
        The content of the first ``(...)`` group, or ``'unknown'`` when
        ``text`` is not a string or contains no parenthesised group.
    """
    # Non-string cells (e.g. missing values) cannot carry a code.
    if not isinstance(text, str):
        return 'unknown'
    # Non-greedy match so only the first "(...)" group is captured.
    found = re.search(r'\((.*?)\)', text)
    return found.group(1) if found else 'unknown'

# Apply the function to the 'SOILCLASSIFICATION' column and create a new 'USCS' column.
# First pass: keeps only the text inside the first "(...)" of each value; anything
# else (including non-string cells) becomes 'unknown'. A later cell redefines
# extract_uscs and recomputes this column.
df['USCS'] = df['SOILCLASSIFICATION'].apply(extract_uscs)

In [4]:
# Display the first rows of the DataFrame to verify the new 'USCS' column
print(df.head())

        WCRNUMBER  INTERVALSTART  INTERVALEND       SOILCLASSIFICATION  \
0  WCR2022-013212          159.0        160.0          (ML) Sandy silt   
1  WCR2022-013212          160.0        165.0            (CH) Fat clay   
2  WCR2022-013212          165.0        173.0  (CH) Fat clay with sand   
3  WCR2022-013212          173.0        175.0          (SM) Silty sand   
4  WCR2022-013212          175.0        180.0            (CH) Fat clay   

                        SOILCOLOR  \
0  light brownish gray (10YR 6/2)   
1               brown (7.5YR 4/4)   
2                brown (10YR 4/3)   
3                brown (10YR 4/3)   
4                brown (10YR 4/3)   

                                     SOILDESCRIPTION USCS  
0  80% non-plastic fines, 20% fine sand, hard con...   ML  
1  High-plasticity, very hard consistency, dry to...   CH  
2  80% high-plasticity fines, 20% fine sand, medi...   CH  
3  SILTY SAND (SM) - 50% fine sand, 20% medium sa...   SM  
4  FAT CLAY (CH) - High-plastici

In [5]:
# Set of known USCS classifications.
# Membership is tested against whole whitespace-separated tokens, so every
# spelling that should be accepted has to be listed exactly as it appears.
known_uscs = {
    # plain two-letter codes
    'ML', 'MH', 'CL', 'CH', 'OL', 'OH',
    'SW', 'SP', 'SM', 'SC',
    'GW', 'GP', 'GM', 'GC',
    # hyphenated pairs
    'SM-SC', 'SM-ML', 'SM-GP', 'SP-GP', 'SW-GW', 'SC-GC', 'SC-ML',
    'GW-GM', 'GW-GC', 'GP-SP', 'GP-GM', 'GP-GC', 'GP-CG',
    'SW-SM', 'SW-SC', 'SP-SM', 'SP-SC', 'CL-ML',
    # combinations written with "/" or "+"
    'SP/GP/CL', 'CL/SP', 'OH/CH', 'CL/CH', 'CH+GP',
    # NOTE(review): 'Cl' (and 'GP-CG' above) look like spellings taken verbatim
    # from the data rather than standard codes — confirm they are intentional.
    'Cl',
}

In [6]:
# Dictionary to map specific soil classifications to USCS or rock categories.
# Keys are compared against the whole (stripped) cell text with an exact,
# case- and punctuation-sensitive match, which is why near-duplicate spellings
# (different casing, trailing commas) are listed separately.
# Values are kept free of stray whitespace/typos so that equal categories
# really compare equal downstream.
classification_mapping = {
    'Rock - Sedimentary': 'Sedimentary Rock',
    'Rock - Igneous': 'Igneous Rock',
    'Rock - Metamorphic': 'Metamorphic Rock',
    'Siltstone': 'Sedimentary Rock',
    # 'Sandstone' used to be listed twice; the second entry (value with a
    # leading space) silently replaced this one. Only one entry is kept.
    'Sandstone': 'Sedimentary Rock',
    'Bedrock': 'Sedimentary Rock',
    'SILTSTONE/MUDSTONE': 'Sedimentary Rock',
    'Topsoil': 'OH',
    'TUFF': 'Volcanic Rock',  # was misspelled 'Vocanic Rock'
    # Lower-case to match the 'unknown' sentinel returned by extract_uscs.
    'FILL': 'unknown',
    'Claystone/hardpan': 'Sedimentary Rock',
    'Siltstone-Claystone': 'Sedimentary Rock',
    'Silty-Sandstone': 'Sedimentary Rock',
    'Ishi': 'Volcanic Rock',
    'Tuff/ASH': 'Volcanic Rock',
    'sltst': 'Sedimentary Rock',
    'clyst': 'Sedimentary Rock',
    'sltst and clyst': 'Sedimentary Rock',
    'CLAYSTONE/SAND': 'Sedimentary Rock',
    'CLAYSTONE': 'Sedimentary Rock',
    'TUFF or SILTSTONE/CLAYSTONE': 'Metasedimentary Rock',
    'SILTSTONE/SANDSTONE': 'Sedimentary Rock',
    'SILTSTONE/SANDSTONE,': 'Sedimentary Rock',
    'SilTSTONE/CLAYSTONE': 'Sedimentary Rock',
    'Ash': 'Volcanic Rock',
    'Sand/Sandstone': 'Sedimentary Rock',
    'Volcanic': 'Volcanic Rock',
    'PT PEAT soils with high organic contents': 'PT',
    'SILTSTONE': 'Sedimentary Rock',
    'SANDSTONE': 'Sedimentary Rock',
    # NOTE(review): the comma-less key 'Claystone/hardpan' maps to
    # 'Sedimentary Rock' while this one maps to 'CL' — confirm which is intended.
    'Claystone/hardpan,': 'CL',
    'Siltstone-Claystone,': 'Sedimentary Rock',
    'Silty-Sandstone,': 'Sedimentary Rock',
    'Tuff': 'Volcanic Rock',
    'Sltst': 'Sedimentary Rock',
    # Codes prefixed with a UTF-8 byte-order mark that was mis-decoded into the
    # three characters 'ï»¿'; mapped back to the bare code.
    'ï»¿GC': 'GC',
    'ï»¿SP': 'SP',
    'ï»¿CL': 'CL',

}

In [7]:
# Function to extract USCS classification
def extract_uscs(text, known_codes=None, mapping=None):
    """Derive a USCS code or rock category from a soil-classification string.

    The lookup proceeds in this order:

    1. the first whitespace-separated token of ``text`` that is a known code;
    2. an exact match of the whole (cleaned) text in the mapping table;
    3. the content of the first ``(...)`` group in the text;
    4. otherwise ``'unknown'``.

    Parameters
    ----------
    text : object
        Cell value to classify; only ``str`` values are inspected.
    known_codes : collection of str, optional
        Codes accepted in step 1. Defaults to the module-level ``known_uscs``.
    mapping : dict, optional
        Exact-text lookup used in step 2. Defaults to the module-level
        ``classification_mapping``.

    Returns
    -------
    str
        The extracted code/category, or ``'unknown'`` for non-string input
        or when nothing matches.
    """
    if not isinstance(text, str):
        return 'unknown'
    if known_codes is None:
        known_codes = known_uscs
    if mapping is None:
        mapping = classification_mapping

    # Remove surrounding whitespace, then a leading byte-order mark: either the
    # real U+FEFF character or its mis-decoded three-character form
    # (U+00EF U+00BB U+00BF). Strip again afterwards so whitespace that
    # followed the mark cannot break the exact-match lookup below.
    text = text.strip()
    for bom in ('\ufeff', '\u00ef\u00bb\u00bf'):
        if text.startswith(bom):
            text = text[len(bom):].strip()

    # Walk the tokens in text order instead of iterating the code set: set
    # iteration order for strings can change between interpreter runs, which
    # made the result arbitrary when a value contained several known codes.
    for token in text.split():
        if token in known_codes:
            return token

    # Whole-text match against the hand-maintained mapping.
    if text in mapping:
        return mapping[text]

    # Fall back to whatever sits inside the first pair of parentheses.
    match = re.search(r'\((.*?)\)', text)
    if match:
        return match.group(1)
    return 'unknown'

# Apply the function to the 'SOILCLASSIFICATION' column and create a new 'USCS' column.
# This overwrites the 'USCS' column produced earlier by the parentheses-only
# version of extract_uscs.
df['USCS'] = df['SOILCLASSIFICATION'].apply(extract_uscs)

In [8]:
# Display the first rows of the DataFrame to verify the recomputed 'USCS' column
print(df.head())

# Save the updated DataFrame to a new CSV file
# (index=False leaves the row index out of the file; the path is relative to
# the current working directory)
output_file = 'updated_geologiclog_USCS.csv'
df.to_csv(output_file, index=False)

# Confirm where the file was written
print(f"Updated CSV file saved as '{output_file}'")

        WCRNUMBER  INTERVALSTART  INTERVALEND       SOILCLASSIFICATION  \
0  WCR2022-013212          159.0        160.0          (ML) Sandy silt   
1  WCR2022-013212          160.0        165.0            (CH) Fat clay   
2  WCR2022-013212          165.0        173.0  (CH) Fat clay with sand   
3  WCR2022-013212          173.0        175.0          (SM) Silty sand   
4  WCR2022-013212          175.0        180.0            (CH) Fat clay   

                        SOILCOLOR  \
0  light brownish gray (10YR 6/2)   
1               brown (7.5YR 4/4)   
2                brown (10YR 4/3)   
3                brown (10YR 4/3)   
4                brown (10YR 4/3)   

                                     SOILDESCRIPTION USCS  
0  80% non-plastic fines, 20% fine sand, hard con...   ML  
1  High-plasticity, very hard consistency, dry to...   CH  
2  80% high-plasticity fines, 20% fine sand, medi...   CH  
3  SILTY SAND (SM) - 50% fine sand, 20% medium sa...   SM  
4  FAT CLAY (CH) - High-plastici