In [1]:
import os
import glob
import numpy as np
import pandas as pd

# import geopandas as gpd
# from shapely.geometry import Point

In [2]:
# File paths for your CSV files
file_paths = [
    '1-updated_geologiclog_freeform.csv',
    '2-updated_geologiclog_quick-pick.csv',
    '3-updated_geologiclog_USCS.csv',
    '4-updated_geologiclog_generalizedlithology.csv',
]

# Dictionary to store DataFrames loaded from each CSV file
dataframes = {}

# Load each CSV file into a DataFrame and store in the dictionary
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path)  # Load the CSV file into a DataFrame
        dataframes[file_path] = df  # Add the DataFrame to the dictionary
    except Exception as e:
        print(f"Error loading {file_path}: {e}")

# Print the number of unique WCRNUMBER values for each DataFrame (each CSV file)
for name, df in dataframes.items():
    if 'WCRNUMBER' in df.columns:
        unique_count = df['WCRNUMBER'].nunique()  # Find count of unique values in WCRNUMBER column
        print(f"Number of unique WCRNUMBER values in {name}: {unique_count}")
    else:
        print(f"WCRNUMBER column not found in {name}")

# Concatenate all dataframes
if dataframes:
    merged_df = pd.concat(dataframes.values(), ignore_index=True)
    merged_df.head()
else:
    print("No dataframes to merge.")

  df = pd.read_csv(file_path)  # Load the CSV file into a DataFrame


Number of unique WCRNUMBER values in 1-updated_geologiclog_freeform.csv: 21580
Number of unique WCRNUMBER values in 2-updated_geologiclog_quick-pick.csv: 1778
Number of unique WCRNUMBER values in 3-updated_geologiclog_USCS.csv: 643
Number of unique WCRNUMBER values in 4-updated_geologiclog_generalizedlithology.csv: 180


In [3]:
print(merged_df.columns.tolist())

['WCRNUMBER', 'DECIMALLATITUDE', 'DECIMALLONGITUDE', 'Zland', 'X', 'Y', 'INTERVALSTART', 'INTERVALEND', 'DESCRIPTION', 'NEW_DESCRIPTION', 'KEYWORDS', 'COLOR QUALIFIER', 'TEXTURE QUALIFIER', 'USCS', 'HydraulicConductivity', 'AverageCoarseFraction', 'Avg Specific Yield (%)', 'Avg Ss (1/L)', 'Avg Kv (ft/day)', 'Unnamed: 0', 'DECIMALLAT', 'DECIMALLON', 'UTMX_y', 'UTMY_y', 'MATERIALTYPE', 'MATERIALCOLOR', 'MATERIALTEXTURE', 'MATERIALDESCRIPTION', 'COMBINED_MATERIALS', 'SOILCLASSIFICATION', 'SOILCOLOR', 'SOILDESCRIPTION', 'TEXTUREQUALIFIER', 'TEXTUREMODIFIER2', 'COLOR1', 'COLOR2', 'COLORQUALIFIER', 'CLASSIFICATION', 'TEXTURE_MODIFIED']


In [4]:
# Drop columns that are no longer needed
columns_to_drop = [
    'COLOR1', 'COLOR2', 'COLORQUALIFIER', 'CLASSIFICATION', 'TEXTURE_MODIFIED',
    'SOILDESCRIPTION', 'SOILCOLOR', 'SOILCLASSIFICATION', 'MATERIALDESCRIPTION',
    'COMBINED_MATERIALS', 'TEXTUREQUALIFIER', 'TEXTUREMODIFIER2', 'MATERIALTYPE',
    'Unnamed: 0', 'MATERIALTEXTURE', 'COLOR QUALIFIER', 'TEXTURE QUALIFIER',
    'NEW_DESCRIPTION', 'DESCRIPTION', 'MATERIALCOLOR',
    'DECIMALLAT', 'DECIMALLON', 
    'UTMX_y', 'UTMY_y', 'DECIMALLATITUDE', 'DECIMALLONGITUDE',
]

# Drop columns safely
merged_df.drop(columns=[col for col in columns_to_drop if col in merged_df.columns], inplace=True)

rename_mapping = {
    'UTMX_y': 'UTMX',
    'UTMY_y': 'UTMY',
    'HydraulicConductivity':'Average Hydraulic Conductivity (ft/day)',
    # Add as many as needed
}

# Rename the columns in the DataFrame
merged_df.rename(columns=rename_mapping, inplace=True)

merged_df.head()

Unnamed: 0,WCRNUMBER,Zland,X,Y,INTERVALSTART,INTERVALEND,KEYWORDS,USCS,Average Hydraulic Conductivity (ft/day),AverageCoarseFraction,Avg Specific Yield (%),Avg Ss (1/L),Avg Kv (ft/day)
0,WCR2018-004181,1069.953118,838644.742317,3881271.0,0.0,20.0,"sand,mudstone","SP,CLSN",4.14375,35.0,16.75,0.000543,0.2525
1,WCR2018-004181,1069.953118,838644.742317,3881271.0,20.0,40.0,"sand,gravel,mudstone","SP,GP,CLSN",75.269067,53.333333,17.5,0.000379,0.335
2,WCR2018-004181,1069.953118,838644.742317,3881271.0,40.0,60.0,"sand,clay,gravel","SP,CL,GP",75.272701,51.666667,17.333333,0.001017,0.3335
3,WCR2018-004181,1069.953118,838644.742317,3881271.0,60.0,80.0,"sand,clay,gravel","SP,CL,GP",75.272701,51.666667,17.333333,0.001017,0.3335
4,WCR2018-004181,1069.953118,838644.742317,3881271.0,80.0,100.0,"gravel,sand","GP,SP",112.90085,75.0,24.5,0.000275,0.5


In [5]:
# Calculate the total number of records in the merged_df DataFrame
total_records = len(merged_df)

# Print the total number of records
print(f"Total number of records in merged_df: {total_records}")

# Save the updated DataFrame to a new CSV file
output_file = '5-updated_OSWRCs.csv'
merged_df.to_csv(output_file, index=False)

# Confirm that the file was saved
print(f"Updated CSV file saved as '{output_file}'")

# Check if the 'WCRNUMBER' column exists in the merged_df
if 'WCRNUMBER' in merged_df.columns:
    unique_count = merged_df['WCRNUMBER'].nunique()  # Find count of unique values
    print(f"Number of unique WCRNUMBER values in merged_df: {unique_count}")
else:
    print("WCRNUMBER column not found in merged_df")

Total number of records in merged_df: 428282
Updated CSV file saved as '5-updated_OSWRCs.csv'
Number of unique WCRNUMBER values in merged_df: 23808


In [6]:
# File paths for your CSV files
file_paths1 = [
    '5-updated_OSWRCs.csv',
    '6-updated_Ramboll_WCRs.csv',
    '7-updated_AEM_CF.csv',
    '11-updated_SVSim.csv',
]

# Dictionary for renaming fields
rename_columns = {
    'WCRNUMBER': 'WellName',
    'GSE_ft': 'Zland',
    'AverageCoarseFraction': 'Coarse',
    'UTMX': 'X',
    'UTMY': 'Y',

}

# List to hold dataframes
dataframes = []

# Read each CSV file and append to the list
for file_path in file_paths1:
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Rename columns based on the dictionary
        df.rename(columns={col: new_col for col, new_col in rename_columns.items() if col in df.columns}, inplace=True)
        
        # Add a new column with the file name
        df['SourceFile'] = file_path
        
        # Append the DataFrame to the list
        dataframes.append(df)
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

# Concatenate all dataframes
if dataframes:
    merged_df1 = pd.concat(dataframes, ignore_index=True)
    
    # Drop rows where all columns are NaN
    merged_df1 = merged_df1.dropna(how='all')
    
if 'WellName' in merged_df1.columns:
    # Drop rows with missing WellName
    merged_df1 = merged_df1.dropna(subset=['WellName'])

    # Sort the data to ensure correct Point numbering
    merged_df1 = merged_df1.sort_values(by=[ 'X', 'Y', 'INTERVALEND'], ascending=[ True, True, True])
    
    # Create Well column (unique numeric codes for WellName)
    merged_df1['Well'] = merged_df1['WellName'].astype('category').cat.codes + 1
    
    # Create Point column (count of occurrences within each WellName in sorted order)
    merged_df1['Point'] = merged_df1.groupby('WellName').cumcount() + 1
    
    # Display the resulting DataFrame
    print(merged_df1.head())
else:
    print("No dataframes to merge.")

  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


              WellName        Zland              X             Y  \
387541  WCR2023-007196  927.0387436  525069.817561  4.461330e+06   
387542  WCR2023-007196  927.0387436  525069.817561  4.461330e+06   
389429  WCR2017-004848  1354.260214  530512.503921  4.481274e+06   
389430  WCR2017-004848  1354.260214  530512.503921  4.481274e+06   
389425  WCR2022-011845  1443.940007  531436.615099  4.483232e+06   

        INTERVALSTART  INTERVALEND            KEYWORDS        USCS  \
387541            0.0         15.0      weathered,soil  SW-GW,TPSL   
387542           15.0        220.0     fractured,shale   FRAC,SHLE   
389429            0.0         65.0  decomposed granite       DGRNT   
389430           65.0        200.0   fractured,granite   FRAC,GRNT   
389425            0.0         38.0  decomposed granite       DGRNT   

        Average Hydraulic Conductivity (ft/day)  Coarse  ...  SSURGO  SWN  \
387541                                 0.050500    17.5  ...     NaN  NaN   
387542          

In [7]:
# Drop columns that are no longer needed, ignoring missing columns
merged_df1.drop(columns=[
        'SVSIM_PC', 'SSURGO', 'Local_ID', 'CASGEM_MSC', 'WCR_NO', 'NEW_DESCRIPTION', 'DESCRIPTION',
        'COLORQUALIFIER', 'geometry', 'SVSIM_NO', 'SVSIM_NAME', 'SVSIM_ID', 'SVSIM_PT', 'GSE_ft',
        'GeophCFstd', 'MeanRes', 'Upper', 'Lower', 'LINE_NO', 'LINE_NO_pa', 'SWN','UTMX','UTMY',
        'Interval', 'ModIndex', 'GeophCT', 'GeophCTstd', 'TEXTUREQUALIFIER','HydraulicConductivity',
        'KEYWORDS',
    ], inplace=True, errors='ignore')

# rename the intermediate columns if needed
merged_df1.rename(columns={'GeophCF': 'ClayFraction', 'Average Hydraulic Conductivity (ft/day)':'Kxy'
                         , 'Avg Ss (1/L)':'Ss','Avg Kv (ft/day)': 'Kv','Avg Specific Yield (%)': 'SY',
                          }, inplace=True)

# Display the resulting DataFrame
merged_df1.head()

Unnamed: 0,WellName,Zland,X,Y,INTERVALSTART,INTERVALEND,USCS,Kxy,Coarse,SY,Ss,Kv,SourceFile,ClayFraction,Well,Point
387541,WCR2023-007196,927.0387436,525069.817561,4461330.0,0.0,15.0,"SW-GW,TPSL",0.0505,17.5,6.5,0.0505,0.0005,5-updated_OSWRCs.csv,,307088,1
387542,WCR2023-007196,927.0387436,525069.817561,4461330.0,15.0,220.0,"FRAC,SHLE",21.26041,44.0,6.5,0.002693,0.0025,5-updated_OSWRCs.csv,,307088,2
389429,WCR2017-004848,1354.260214,530512.503921,4481274.0,0.0,65.0,DGRNT,217.5197,90.0,19.0,5.1e-05,0.5,5-updated_OSWRCs.csv,,290543,1
389430,WCR2017-004848,1354.260214,530512.503921,4481274.0,65.0,200.0,"FRAC,GRNT",21.260424,45.0,5.045,0.002525,0.0025,5-updated_OSWRCs.csv,,290543,2
389425,WCR2022-011845,1443.940007,531436.615099,4483232.0,0.0,38.0,DGRNT,217.5197,90.0,19.0,5.1e-05,0.5,5-updated_OSWRCs.csv,,304223,1


In [8]:
# Define the desired column order
desired_order = ['WellName', 'Well', 'Point', 'X', 'Y', 'Zland', 'INTERVALSTART','INTERVALEND', 'Coarse',
                'Kxy', 'SY', 'Ss', 'Kv',	
                ]

# Reorder the DataFrame
combined_df = merged_df1[desired_order]


# Display the reordered DataFrame
combined_df.head()

Unnamed: 0,WellName,Well,Point,X,Y,Zland,INTERVALSTART,INTERVALEND,Coarse,Kxy,SY,Ss,Kv
387541,WCR2023-007196,307088,1,525069.817561,4461330.0,927.0387436,0.0,15.0,17.5,0.0505,6.5,0.0505,0.0005
387542,WCR2023-007196,307088,2,525069.817561,4461330.0,927.0387436,15.0,220.0,44.0,21.26041,6.5,0.002693,0.0025
389429,WCR2017-004848,290543,1,530512.503921,4481274.0,1354.260214,0.0,65.0,90.0,217.5197,19.0,5.1e-05,0.5
389430,WCR2017-004848,290543,2,530512.503921,4481274.0,1354.260214,65.0,200.0,45.0,21.260424,5.045,0.002525,0.0025
389425,WCR2022-011845,304223,1,531436.615099,4483232.0,1443.940007,0.0,38.0,90.0,217.5197,19.0,5.1e-05,0.5


In [9]:
# Ensure INTERVALSTART and INTERVALEND are numeric
combined_df['INTERVALSTART'] = pd.to_numeric(combined_df['INTERVALSTART'], errors='coerce')
combined_df['INTERVALEND'] = pd.to_numeric(combined_df['INTERVALEND'], errors='coerce')

# Ensure INTERVALSTART is numeric
combined_df['INTERVAL_MIDPOINT'] = (combined_df['INTERVALSTART'] + combined_df['INTERVALEND']) / 2

# Verify the result by displaying the first few rows
combined_df[['INTERVALSTART', 'INTERVALEND','INTERVAL_MIDPOINT']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['INTERVALSTART'] = pd.to_numeric(combined_df['INTERVALSTART'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['INTERVALEND'] = pd.to_numeric(combined_df['INTERVALEND'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['INTERVAL_MID

Unnamed: 0,INTERVALSTART,INTERVALEND,INTERVAL_MIDPOINT
387541,0.0,15.0,7.5
387542,15.0,220.0,117.5
389429,0.0,65.0,32.5
389430,65.0,200.0,132.5
389425,0.0,38.0,19.0


In [None]:
# Save the updated DataFrame to a new CSV file
output_file = '8-updated_all.csv'
combined_df.to_csv(output_file, index=False)

print(f"Updated CSV file saved as '{output_file}'")