In [1]:
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import process, fuzz



In [2]:
# Load the input CSV file
input_file = 'geologiclog_freeform.csv'  # Update with your file path

# Read the CSV file into a DataFrame
df = pd.read_csv(input_file)

  df = pd.read_csv(input_file)


In [3]:
# Check if the 'DESCRIPTION' column exists
if 'DESCRIPTION' not in df.columns:
    raise KeyError("The 'DESCRIPTION' column does not exist in the DataFrame.")


In [4]:
# Copy the existing column 'DESCRIPTION2' to a new column 'NEW_DESCRIPTION'
df['OLD_DESCRIPTION'] = df['DESCRIPTION']

In [5]:
# Define the USCS codes and known USCS names globally
uscs_codes = [
    '(ML)', '(GP)', '(SP-SM)', '(GC)', '(CL)', '(CH)', '(MH)', '(GP-GM)',
    '(GP-GC)', 'GM', '(ml)', 'SP-SM', '(SC-SM)', 'CH', '(SC)', 'SC', 'MH', 
    'SM-SC','SM-ML','SM-GP','SP-GP', 'SW-GW','SC-GC','SC-ML','SP/GP/CL','CL/SP',
    'GW-GM','OH/CH','CL/CH', 'Cl','SW-SM','SW-SC','SP-SM','SP-SC','OL','GW-GC','GP-SP',
    'GP-GM','GP-GC','GP-CG','CL-ML','CH+GP','SM/GM', 'sm/ml/sm/cl', 'sm/ml/sm/ml', 'sm/ml/sp/ml',
    'SM>ML>CL', 'SM>ML>SP>CL>SM', 'SM>ML,SP>SC>CL',
]

# Function to extract USCS codes from text within parentheses and direct codes
def extract_uscs(text):
    if isinstance(text, str):
        text = text.upper()  # Convert text to uppercase for case insensitivity

        # Check for USCS code within parentheses
        match = re.search(r'\((.*?)\)', text)
        if match:
            code_in_parentheses = match.group(1).strip()
            if code_in_parentheses in uscs_codes:
                return code_in_parentheses

        # Check if the text itself is a valid USCS code
        if text in uscs_codes:
            return text
    
    return 'unknown'

# Apply the function to the DESCRIPTION column to create the USCS column
df['USCS'] = df['DESCRIPTION'].apply(extract_uscs)

# Print the updated DataFrame
print(df.head())

        WCRNUMBER INTERVALSTART INTERVALEND    DESCRIPTION OLD_DESCRIPTION  \
0  WCR2023-007668         450.0       490.0        granite         granite   
1  WCR2023-007668         490.0       520.0           rock            rock   
2  WCR2023-007668         520.0       540.0  clay and rock   clay and rock   
3  WCR2023-007668         540.0       570.0           rock            rock   
4  WCR2023-007668         570.0       600.0        granite         granite   

      USCS  
0  unknown  
1  unknown  
2  unknown  
3  unknown  
4  unknown  


In [6]:
# Define a dictionary for category conversions
conversions = {
    'dg': 'decomposed granite',
    'd,g.': 'decomposed granite',
    'D/G': 'decomposed granite',
    'd.g': 'decomposed granite',
    'd & g': 'decomposed granite',
    'd,g.': 'decomposed granite',
    'D.G.': 'decomposed granite',
    'DG': 'decomposed granite',
    'dgf': 'decomposed granite',
    'quartz': 'decomposed granite',
    'cemented gravel': 'conglomerate',
    'sand stone': 'sandstone',
    'overburden': 'soil',
    'sand clay': 'clayey sand',
    'sand gravel' : 'gravely sand',
    'sand and clay': 'clayey sand',
    'sand and gravel': 'gravely sand',
    'gravel clay': 'clayey gravel',
    'clay gravel': 'gravely clay',
    'clay sand': 'sandy clay',
    'clay sandy': 'sandy clay',
    'gravel.sandy.':'sandy gravel',
    'sand gravel': 'gravely sand',
    'gravel and sand': 'sandy gravel',
    'sand & gravel': 'gravely sand',
    'mixed sands': 'poorly graded sand',
    'blacksand': 'black sand',
    'sand-coarse': 'coarse sand',
    'gravel with clay': 'clayey gravel',
    'sand-brown': 'sand brown',
    'clay-brown': 'clay brown',
    'sand-black': 'sand black',
    'course': 'coarse',
    'clay loam': 'clayey sand',
    'dirt': 'soil',
    'Sand stone': 'sandstone',
    'clay-blue': 'clay',
    'blue clay': 'clay',
    'channel': 'coarse',
    'grit': 'gravel',
    'cobble': 'gravel',
    'ashÂ': 'ash',
    'SERP.': 'serpentine',
    'SERP': 'serpentine',
    'Adobe': 'clay',
    'gray ash': 'ash',
    'mud': 'clay',
    'Sand-Clay': 'clayey sand',
    'sand,clay': 'clayey sand',
    'Sand M': 'sand medium',
    'Sand M-C': 'sand medium coarse',
    'Hard Pen': 'hardpan',
    'Birds Eye': 'gravel',
    'Birds eye': 'gravel',
    'Crystalized Minerals': 'gravel',
    'sand&clay': 'clayey sand',
    'Hard Clay': 'claystone',
    'IONE': 'sandstone',
    'Marsh Deposits': 'organic soil',
    'GrayWackey': 'sandstone',
    'Gray Wackey': 'sandstone',
    'Greywacke': 'sandstone',
    'SILTYCLAY': 'silty clay',
    'mudflow': 'clay',
    'MERTON': 'Mehrton frm',
    'Ground': 'soil',
    'Overberdon': 'soil',
    'PEAT (PT)': 'peat',
    'stopped drilling': 'claystone',
    'TOP SOIL': 'soil',
    'grey': 'gray',
    'boulders': 'boulder',
    'tuft': 'tuff',
    'bluish': 'blue',
    'quarts': 'quartz',
    'Over burden': 'soil',
    'colluvium': 'gravel',
    'alluvium': 'sandy gravel',
    'silty.sandy':'silty.sand',
    'shail' : 'shale',
    
    
    # Add more conversions as needed
}

In [7]:
# Function to apply conversions
def apply_conversions(text, conversions):
    if not isinstance(text, str):
        text = str(text)
    for key, value in conversions.items():
        text = re.sub(r'\b{}\b'.format(re.escape(key)), value, text, flags=re.IGNORECASE)
    return text

# Apply conversions to the DESCRIPTION column to create DESCRIPTION2
df['DESCRIPTION'] = df['DESCRIPTION'].apply(lambda x: apply_conversions(x, conversions))

# Convert all contents of DESCRIPTION2 column to lower case
df['DESCRIPTION'] = df['DESCRIPTION'].str.lower()

In [8]:
# Define the words to remove
words_to_remove = [
     'iron', 'H2O','mix', '@', '&', '*', 'grey', 'dry', 'B&W', 'blind','off','samples', 'no',
    'see', 'attached.', 'S&P', 'semi', 'please', 'log', 'geologic', 'log.', 'for', 'details.', 
    'find', 'below.', 'logging', 'by', 'core', 'rubble', 'bored', 'drilling', 'LLC.', 'old',
    'sample', 'too', 'small.', 'only', 'show', 'comp', 'sheet', 'attached', 'numbers', 'XX', 
    'pieces', 'rock.', 'as', 'above.', 'PT.', 'test', 'permit.', 'per','olive',  'off-', 'note', 
    'documents.', 'pile', 'cap', 'elev.', '@-', '&-', '*-', 'pothole', 'open', 'void','control', 
     'same', 'from', 'start', 'sample.', 'same.', 'all', 'piece', 'fill', 'N/A', 'and', 'to',
     'in','boring', 'nan', 
]

In [9]:
from rapidfuzz import process, fuzz

# Define a list of correct words
correct_words = [
    'red', 'green', 'black', 'brown', 'yellow', 'sand', 'gravel', 'clay', 'silt', 'greenstone',
    'sandy', 'gravely', 'clayey', 'silty', 'granite', 'basalt', 'volcanics', 'volcanic',
    'soil', 'organic', 'peat', 'sandstone', 'conglomerate', 'siltstone', 'diorite', 'laminated',
    'decomposed', 'fractured', 'rock', 'shale', 'claystone', 'fine', 'medium', 'pumice',
    'coarse', 'grained', 'sticky', 'cobble', 'latite', 'graded', 'loam', 'rock', 'poorly graded',
    'cemented', 'shell', 'quartz', 'quartzite', 'metasediment', 'slate', 'schist', 'solid',
    'granodiorite', 'hardpan', 'chert', 'igneous', 'serpentine', 'phyllite', 'limestone',
    'mudstone', 'andesite', 'hardpan', 'graywacke','boulder', 'weathered', 
]

def fix_typos(text, choices):
    # Check if the value is a string
    if isinstance(text, str):
        fixed_words = []
        # Split the text into words and iterate over them
        for word in text.split():
            # Clean the word from non-alphabetic characters except numbers, commas, and percentage signs
            cleaned_word = re.sub(r'[^a-zA-Z0-9,%%]', '', word)
            # Check if the cleaned word is not empty
            if cleaned_word.strip():
                # Find the best match for the cleaned word among the choices
                best_match = process.extractOne(cleaned_word, choices, scorer=fuzz.token_sort_ratio)
                # Append the best match to the list of fixed words
                if best_match:
                    fixed_words.append(best_match[0])
                else:
                    fixed_words.append(word)
        # Join the fixed words into a single string and return it
        return ' '.join(fixed_words)
    else:
        return text

# Assuming df is your DataFrame and it has a column named 'DESCRIPTION'
df['DESCRIPTION'] = df['DESCRIPTION'].apply(lambda x: fix_typos(x, correct_words))

In [10]:
# Set of known USCS classifications for direct mapping
known_uscs = {'ML', 'CH', 'CL', 'MH', 'SC', 'SM', 'SP', 'SW', 'GW', 'GP', 'GM', 'GC', 'OH','PT',}

In [11]:
def remove_words(text, words):
    # Escape special characters in words
    escaped_words = [re.escape(word) for word in words]
    # Join escaped words with '|', which represents 'or' in regex
    pattern = r'\b(?:' + '|'.join(escaped_words) + r'|[/&|:+?>\-;])\b'
    if isinstance(text, str):
        return re.sub(pattern, '', text, flags=re.IGNORECASE)
    else:
        return ''
    
# Function to convert characters
def convert_chars(text):
    return text.translate(str.maketrans('/&-|:+>?', '        '))

# Function to apply conversions
def apply_conversions(text, conversions):
    return ' '.join(conversions.get(word.lower(), word) for word in text.lower().split())

# Apply conversions and transformations
df['MODIFIED_DESCRIPTION'] = df['DESCRIPTION'].astype(str).apply(lambda x: apply_conversions(convert_chars(remove_words(x, words_to_remove)), conversions))

In [12]:
# Define qualifiers
color_qualifiers = ['red', 'green', 'black', 'brown', 'gray', 'grayish', 'white','greenish',
                    'reddish', 'yellow', 'dark', 'light', 'tan', 'colored', 'blue','brownblack',
                   'yellowish','purple', 'orange','brw.',   ]

texture_qualifiers = ['loose', 'hard', 'coarse', 'fine', 'compacted', 'cemented','crushed',
           'salt & pepper','Minor','medium', 'large','firm', 'small', 'fracture', 'frac',
           'fractured', 'soft', 'minor', 'eroded','tight','broken',
           'med.','packed', 'brittle', 'porous', 'pea', 'welded', 'mixed', 'softer', 'joint',
           'chunky','large', 'big','solid','firm','hard','heavy','very stiff','crystallized minerals',
           'laminated','poorly graded', 'well graded', 'sticky', 'grained','graded', 'shell',   

                     
                     ]

# Copy data from 'DESCRIPTION' to a new column 'TEXTURE'
df['TEXTURE'] = df['MODIFIED_DESCRIPTION']

# Function to extract qualifiers from a string
def extract_qualifiers(description, qualifiers):
    if pd.isna(description):
        return []
    words = description.lower().split()
    return [word for word in words if word in qualifiers]

# Extract COLORQUALIFIER and TEXTUREQUALIFIER
df['COLORQUALIFIER_EXTRACTED'] = df.apply(lambda row: extract_qualifiers(row['TEXTURE'], color_qualifiers), axis=1)
df['TEXTUREQUALIFIER_EXTRACTED'] = df.apply(lambda row: extract_qualifiers(row['TEXTURE'], texture_qualifiers), axis=1)

# Convert lists to strings
df['COLORQUALIFIER_EXTRACTED'] = df['COLORQUALIFIER_EXTRACTED'].apply(lambda x: ' '.join(x) if x else np.nan)
df['TEXTUREQUALIFIER_EXTRACTED'] = df['TEXTUREQUALIFIER_EXTRACTED'].apply(lambda x: ' '.join(x) if x else np.nan)

# Function to remove qualifiers from a string
def remove_qualifiers(description, qualifiers):
    if pd.isna(description):
        return description
    words = description.lower().split()
    return ' '.join([word for word in words if word not in qualifiers])

# Create DESCRIPTION2 column
df['NEW_DESCRIPTION'] = df.apply(lambda row: remove_qualifiers(row['TEXTURE'], color_qualifiers + texture_qualifiers), axis=1)

# Handle missing 'TEXTUREMODIFIER1' column
if 'TEXTUREMODIFIER1' not in df.columns:
    df['TEXTUREMODIFIER1'] = np.nan

# Concatenate TEXTUREMODIFIER1 and DESCRIPTION if they are not equal, and include new columns
df['TEXTURE_MODIFIED'] = df.apply(
    lambda row: (row['TEXTUREMODIFIER1'] + ' ' if pd.notna(row['TEXTUREMODIFIER1']) and row['TEXTUREMODIFIER1'] != row['TEXTURE'] else '') + (row['DESCRIPTION'] if pd.notna(row['DESCRIPTION']) else ''),
    axis=1
)

# Update original COLORQUALIFIER and TEXTUREQUALIFIER columns
df['COLORQUALIFIER'] = df['COLORQUALIFIER_EXTRACTED']
df['TEXTUREQUALIFIER'] = df['TEXTUREQUALIFIER_EXTRACTED']

# Drop the intermediate columns if needed
df.drop(columns=['COLORQUALIFIER_EXTRACTED', 'TEXTUREQUALIFIER_EXTRACTED','TEXTURE','TEXTUREMODIFIER1','TEXTURE_MODIFIED','MODIFIED_DESCRIPTION','DESCRIPTION', ],inplace=True)

print(df.head())

        WCRNUMBER INTERVALSTART INTERVALEND OLD_DESCRIPTION     USCS  \
0  WCR2023-007668         450.0       490.0         granite  unknown   
1  WCR2023-007668         490.0       520.0            rock  unknown   
2  WCR2023-007668         520.0       540.0   clay and rock  unknown   
3  WCR2023-007668         540.0       570.0            rock  unknown   
4  WCR2023-007668         570.0       600.0         granite  unknown   

  NEW_DESCRIPTION COLORQUALIFIER TEXTUREQUALIFIER  
0         granite            NaN              NaN  
1            rock            NaN              NaN  
2  clay sand rock            NaN              NaN  
3            rock            NaN              NaN  
4         granite            NaN              NaN  


In [13]:
# Dictionary to map specific soil classifications to USCS or rock categories
classification_mapping = {
    'clay' : 'CH',
    'clay sand': 'CL',
    'poorly graded gravel': 'GP',
    'well graded gravel': 'GW',
    'sand': 'SW',
    'gravel': 'GW',
    'soil' : 'OH', 
    'silty gravel': 'GM',
    'well graded sand': 'SW',
    'sandy clay' : 'CL',
    'gravely sand': 'SP',
    'gravely clay': 'CL',
    'Sediment' : 'SP',
    'alluvium': 'GP',
    
    'poorly graded sand': 'SP',
    'silty sand': 'SM',
    'sandy loam': 'SM',
    'clayey gravel': 'GC',
    'cobble clay': 'GC',
    'clayey sand': 'SC',
    'clayey silt': 'ML',
    'loam' : 'ML' ,
    'peat' : 'PT',
    'organics': 'PT',
    'sticky clay': 'CH',
    'fat clay': 'CH',
    'adobe' : 'CH',
    
    'lean clay' : 'CL',
    'silty clay' : 'CL',
    'lean clay': 'CL',
    'sandy clay' : 'CL',
    'hardpan' : 'ML',
    'silt' : 'ML',
    'sandy silt' : 'ML',
    'organic soil': 'OH',
    
    'silty soil': 'OH',
    'cobble' : 'GW',
    'boulder': 'GW',
    'decomposed granite': 'GW',

    'top soil' : 'OH',
    'topsoil' : 'OH',
    'sandy' : 'SW',
    
    'basalt' : 'volcanic',
    'basaltic' : 'volcanic',
    'pumice' : 'volcanic',
    'latite' : 'volcanic',
    'volcanics' : 'volcanic',
    'cinder' : 'volcanic',
    'ash' : 'volcanic',
    'tufa': 'volcanic',
    'tuff': 'volcanic',
    
    'granite' : 'igneous rock',
    'diorite': 'igneous rock',
    'Quartz': 'igneous rock',
    'Gabro': 'igneous rock',
    'gabbro': 'igneous rock',
    'quartzite': 'igneous rock',
    'granodiorite': 'igneous rock',
    'igneous': 'igneous rock',
    'andesite': 'igneous rock',
    'latite' : 'igneous rock',
    
    'greenstone' : 'metamorphic rock',
    'slate' : 'metamorphic rock',
    'schist' : 'metamorphic rock',
    'serpentine':'metamorphic rock',
    'metasediment':'metamorphic rock',
    'phylite':'metamorphic rock',
    'phyllite':'metamorphic rock',
    'argillite':'metamorphic rock',
    
    'sandstone' : 'sedimentary rock',
    'conglomerate' : 'sedimentary rock',
    'shale' : 'sedimentary rock',
    'siltstone' : 'sedimentary rock',
    'rock' : 'sedimentary rock',
    'graywacke': 'sedimentary rock',
    'limestone' : 'sedimentary rock',
    'siltstone': 'sedimentary rock',
    'claystone': 'sedimentary rock',
    'cobblestone': 'sedimentary rock',
    'mudstone': 'sedimentary rock',
    'Cemented gravel':'sedimentary rock',
    'chert' : 'sedimentary rock',
    
    'fractured' :'double-porosity rock',
    'fracture' :'double-porosity rock',
    'fractured rock':'double-porosity rock',
    'rock fractured':'double-porosity rock',

}

In [14]:
# Assuming classification_mapping is defined somewhere in your code
classification_mapping = {
    # Define your classification mapping here
}

def extract_uscs(text, current_uscs):
    # If there is already a value in the USCS column, return it
    if current_uscs and current_uscs.lower() != 'unknown':
        return current_uscs
    
    if isinstance(text, str):
        text = text.lower()  # Convert text to lower case
        text = re.sub(r'[^a-zA-Z0-9,%% ]', '', text)  # Clean text from non-alphabetic characters except numbers, commas, and percentage signs
        
        # Check for exact match in the dictionary
        if text in classification_mapping:
            return classification_mapping[text].upper()
        
        # Use rapidfuzz to find the closest match
        best_match = process.extractOne(text, classification_mapping.keys(), scorer=fuzz.token_sort_ratio)
        if best_match and best_match[1] > 75:  # Threshold can be adjusted
            return classification_mapping[best_match[0]].upper()

        # Check for USCS code within parentheses
        match = re.search(r'\((.*?)\)', text)
        if match:
            return match.group(1).upper()
    
    return 'unknown'

# Assuming df is your DataFrame and it has columns 'DESCRIPTION2' and 'USCS'

# Applying the extract_uscs function to the DataFrame
df['USCS'] = df.apply(lambda row: extract_uscs(row['NEW_DESCRIPTION'], row['USCS']), axis=1)

# Print the updated DataFrame
print(df.head())


        WCRNUMBER INTERVALSTART INTERVALEND OLD_DESCRIPTION     USCS  \
0  WCR2023-007668         450.0       490.0         granite  unknown   
1  WCR2023-007668         490.0       520.0            rock  unknown   
2  WCR2023-007668         520.0       540.0   clay and rock  unknown   
3  WCR2023-007668         540.0       570.0            rock  unknown   
4  WCR2023-007668         570.0       600.0         granite  unknown   

  NEW_DESCRIPTION COLORQUALIFIER TEXTUREQUALIFIER  
0         granite            NaN              NaN  
1            rock            NaN              NaN  
2  clay sand rock            NaN              NaN  
3            rock            NaN              NaN  
4         granite            NaN              NaN  


In [15]:
# Function to classify remaining unknowns based on DESCRIPTION2
def classify_remaining_unknowns(row):
    description = row['NEW_DESCRIPTION'].lower().strip()  # Ensure description is in lowercase and stripped of leading/trailing spaces
    
    # Special handling for descriptions starting with 'weathered' or 'decomposed'
    if description.startswith('weathered') or description.startswith('decomposed'):
        return 'GP-SP-ML'
    if description.startswith('boulder') :
        return 'GP'
    
    # Check for specific keywords at the beginning of the description
    if description.startswith('andesite'):
        return 'VOLCANIC ROCK'
    elif description.startswith('basalt'): return 'VOLCANIC ROCK'
    elif description.startswith('granite'): return 'IGNEOUS ROCK'
    elif description.startswith('quartzite'):  return 'IGNEOUS ROCK' 
    elif description.startswith('granodiorite'): return 'IGNEOUS ROCK'
    elif description.startswith('igneous'):  return 'IGNEOUS ROCK'
    elif description.startswith('slate'): return 'METAMORPHIC ROCK'
    elif description.startswith('sandstone'):  return 'SEDIMENTARY ROCK'
    elif description.startswith('diorite'):  return 'IGNEOUS ROCK'
    elif description.startswith('latite'): return 'VOLCANIC ROCK'
    elif description.startswith('mudstone'): return 'SEDIMENTARY ROCK'
    elif description.startswith('schist'):  return 'METAMORPHIC ROCK'
    elif description.startswith('serpentine'): return 'METAMORPHIC ROCK'
    elif description.startswith('greenstone'):  return 'METAMORPHIC ROCK'
    elif description.startswith('phyllite'): return 'METAMORPHIC ROCK'
    elif description.startswith('volcanic'):  return 'VOLCANIC ROCK'
    elif description.startswith('shale'):  return 'SEDIMENTARY ROCK'
    elif description.startswith('pumice'): return 'VOLCANIC ROCK'
    elif description.startswith('conglomerate'):  return 'SEDIMENTARY ROCK'
    elif description.startswith('graywacke'):  return 'SEDIMENTARY ROCK'
    elif description.startswith('limestone'):  return 'SEDIMENTARY ROCK'
    elif description.startswith('metasediment'):  return 'METASEDIMENTARY ROCK'
    elif description.startswith('chert'): return 'METASEDIMENTARY ROCK' 

    # Check if the description starts with 'rock'
    elif description.startswith('rock'):
        # Split the description into words
        words = description.split()
        if len(words) > 1:
            next_word = words[1]
            if next_word in classification_mapping:
                return classification_mapping[next_word]
           
    # Check for multiple words or lengthy descriptions
    if 'sandy clay gravel' in description:    return 'SC-GW'
    elif 'silty clay loam' in description:     return 'CL-ML'
    elif 'gravelly sand' in description:    return 'SP-GW'
    elif 'organic silt' in description:     return 'OH-SM'
    elif 'hardpan' in description:     return 'ML'
    elif 'loam' in description:     return 'ML'
    elif 'silty clay loam' in description:  return 'CL-OL'
    elif 'gravelly sand' in description:    return 'SP-GW'
    elif 'silt sandy clay sand' in description:    return 'ML-CL'
    elif 'silt clay sand silt' in description:      return 'ML'
    elif 'sandy loam clay sand gravel' in description:      return 'OL-SC-SP'
    elif 'clay gravel sand ' in description:      return 'GC-CL'
    elif 'clay sandy clay sand ' in description:      return 'CL'
    elif 'sand gravel' in description:  return 'SP-GP'
    
        
    elif 'gravelly clay' in description:    return 'GC'
    elif 'silt' in description:  return 'ML'
    elif 'boulder' in description:  return 'GP'
    elif 'clay' in description:  return 'CH'
    elif 'soil' in description:  return 'ML'
    elif 'organic' in description:  return 'PT'
    elif 'sand' in description:  return 'SW'
    elif 'gravel' in description:  return 'GW'
    elif 'peat' in description:  return 'PT'
    elif 'rock' in description:  return 'SEDIMENTARY ROCK'
    # Check for exact match in the classification_mapping dictionary
    for key in classification_mapping:
        if key in description:
            return classification_mapping[key]
    
    # Default to 'unknown' if no classification found
    return 'unknown'

# Apply the function to rows where USCS is still unknown
unknown_mask = df['USCS'] == 'unknown'
df.loc[unknown_mask, 'USCS'] = df.loc[unknown_mask].apply(classify_remaining_unknowns, axis=1)


In [16]:
# Function to classify remaining unknowns based on TEXTUREQUALIFIER
def classify_based_on_texture_qualifier(row):
    texture_qualifier = row['TEXTUREQUALIFIER'].lower().strip() if isinstance(row['TEXTUREQUALIFIER'], str) else ''

    # Check if the TEXTUREQUALIFIER contains 'fracture' or 'fractured'
    if 'fracture' in texture_qualifier or 'fractured' in texture_qualifier:
        return 'double-porosity Rock'
    
    return row['USCS']

# Apply the function to rows where USCS is still unknown
unknown_mask = df['USCS'] == 'unknown'
df.loc[unknown_mask, 'USCS'] = df.loc[unknown_mask].apply(classify_based_on_texture_qualifier, axis=1)

# Print the updated DataFrame
print(df.head())

        WCRNUMBER INTERVALSTART INTERVALEND OLD_DESCRIPTION              USCS  \
0  WCR2023-007668         450.0       490.0         granite      IGNEOUS ROCK   
1  WCR2023-007668         490.0       520.0            rock  SEDIMENTARY ROCK   
2  WCR2023-007668         520.0       540.0   clay and rock                CH   
3  WCR2023-007668         540.0       570.0            rock  SEDIMENTARY ROCK   
4  WCR2023-007668         570.0       600.0         granite      IGNEOUS ROCK   

  NEW_DESCRIPTION COLORQUALIFIER TEXTUREQUALIFIER  
0         granite            NaN              NaN  
1            rock            NaN              NaN  
2  clay sand rock            NaN              NaN  
3            rock            NaN              NaN  
4         granite            NaN              NaN  


In [17]:
# Save the updated DataFrame to a new CSV file
output_file = 'updated_geologiclog_freeform.csv'
df.to_csv(output_file, index=False)

print(f"Final updated CSV file saved as '{output_file}'")

Final updated CSV file saved as 'updated_geologiclog_freeform.csv'
