In [1]:
import os
import glob
import numpy as np
import pandas as pd

In [2]:
# File paths for your CSV files
file_paths = [
    '1-updated_geologiclog_freeform.csv',
    '2-updated_geologiclog_quick-pick.csv',
    '3-updated_geologiclog_USCS.csv',
    '4-updated_geologiclog_generalizedlithology.csv',
]

# Dictionary to store DataFrames loaded from each CSV file
dataframes = {}

# Load each CSV file into a DataFrame and store in the dictionary
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path)  # Load the CSV file into a DataFrame
        dataframes[file_path] = df  # Add the DataFrame to the dictionary
    except Exception as e:
        print(f"Error loading {file_path}: {e}")

# Print the number of unique WCRNUMBER values for each DataFrame (each CSV file)
for name, df in dataframes.items():
    if 'WCRNUMBER' in df.columns:
        unique_count = df['WCRNUMBER'].nunique()  # Find count of unique values in WCRNUMBER column
        print(f"Number of unique WCRNUMBER values in {name}: {unique_count}")
    else:
        print(f"WCRNUMBER column not found in {name}")

# Concatenate all dataframes
if dataframes:
    merged_df1 = pd.concat(dataframes.values(), ignore_index=True)
    merged_df1.head()
else:
    print("No dataframes to merge.")

  df = pd.read_csv(file_path)  # Load the CSV file into a DataFrame


Number of unique WCRNUMBER values in 1-updated_geologiclog_freeform.csv: 21580
Number of unique WCRNUMBER values in 2-updated_geologiclog_quick-pick.csv: 1778
Number of unique WCRNUMBER values in 3-updated_geologiclog_USCS.csv: 643
Number of unique WCRNUMBER values in 4-updated_geologiclog_generalizedlithology.csv: 180


In [3]:
print(merged_df1.columns.tolist())

['WCRNUMBER', 'DECIMALLATITUDE', 'DECIMALLONGITUDE', 'Zland', 'X', 'Y', 'INTERVALSTART', 'INTERVALEND', 'DESCRIPTION', 'NEW_DESCRIPTION', 'KEYWORDS', 'COLOR QUALIFIER', 'TEXTURE QUALIFIER', 'USCS', 'HydraulicConductivity', 'AverageCoarseFraction', 'Avg Specific Yield (%)', 'Avg Ss (1/L)', 'Avg Kv (ft/day)', 'Unnamed: 0', 'DECIMALLAT', 'DECIMALLON', 'UTMX_y', 'UTMY_y', 'MATERIALTYPE', 'MATERIALCOLOR', 'MATERIALTEXTURE', 'MATERIALDESCRIPTION', 'COMBINED_MATERIALS', 'SOILCLASSIFICATION', 'SOILCOLOR', 'SOILDESCRIPTION', 'TEXTUREQUALIFIER', 'TEXTUREMODIFIER2', 'COLOR1', 'COLOR2', 'COLORQUALIFIER', 'CLASSIFICATION', 'TEXTURE_MODIFIED']


In [4]:
# Drop columns that are no longer needed
columns_to_drop = [
    'COLOR1', 'COLOR2', 'COLORQUALIFIER', 'CLASSIFICATION', 'TEXTURE_MODIFIED',
    'SOILDESCRIPTION', 'SOILCOLOR', 'SOILCLASSIFICATION', 'MATERIALDESCRIPTION',
    'COMBINED_MATERIALS', 'TEXTUREQUALIFIER', 'TEXTUREMODIFIER2', 'MATERIALTYPE',
    'Unnamed: 0', 'MATERIALTEXTURE', 'COLOR QUALIFIER', 'TEXTURE QUALIFIER',
    'Avg Specific Yield (%)', 'NEW_DESCRIPTION', 'DESCRIPTION', 'MATERIALCOLOR',
    'DECIMALLAT', 'DECIMALLON', 'HydraulicConductivity', 'Avg Ss (1/L)',
    'UTMX_y', 'UTMY_y', 'Avg Kv (ft/day)', 'DECIMALLATITUDE', 'DECIMALLONGITUDE',
    'INTERVALSTART'
]

# Drop columns safely
merged_df1.drop(columns=[col for col in columns_to_drop if col in merged_df1.columns], inplace=True)


rename_mapping = {
    'AverageCoarseFraction':'Coarse',
    'INTERVALEND' :'Depth',
    'WCRNUMBER' : 'WellName' ,
    # Add as many as needed
}

# Rename the columns in the DataFrame
merged_df1.rename(columns=rename_mapping, inplace=True)

merged_df1.head()

Unnamed: 0,WellName,Zland,X,Y,Depth,KEYWORDS,USCS,Coarse
0,WCR2018-004181,1069.953118,838644.742317,3881271.0,20.0,"sand,mudstone","SP,CLSN",35.0
1,WCR2018-004181,1069.953118,838644.742317,3881271.0,40.0,"sand,gravel,mudstone","SP,GP,CLSN",53.333333
2,WCR2018-004181,1069.953118,838644.742317,3881271.0,60.0,"sand,clay,gravel","SP,CL,GP",51.666667
3,WCR2018-004181,1069.953118,838644.742317,3881271.0,80.0,"sand,clay,gravel","SP,CL,GP",51.666667
4,WCR2018-004181,1069.953118,838644.742317,3881271.0,100.0,"gravel,sand","GP,SP",75.0


In [5]:
# Calculate the total number of records in the merged_df1 DataFrame
total_records = len(merged_df1)

# Print the total number of records
print(f"Total number of records in merged_df1: {total_records}")

# Save the updated DataFrame to a new CSV file
output_file = 'OSWCRsT2PV2.csv'
merged_df1.to_csv(output_file, index=False)

# Confirm that the file was saved
print(f"Updated CSV file saved as '{output_file}'")

# Check if the 'WCRNUMBER' column exists in the merged_df1
if 'WellName' in merged_df1.columns:
    unique_count = merged_df1['WellName'].nunique()  # Find count of unique values
    print(f"Number of unique WellName values in merged_df1: {unique_count}")
else:
    print("WellName column not found in merged_df1")

Total number of records in merged_df1: 428282
Updated CSV file saved as 'OSWCRsT2PV2.csv'
Number of unique WellName values in merged_df1: 23808


In [6]:
# File paths for your CSV files
file_paths1 = [
    '11-updated_SVSim.csv',
    'OSWCRsT2PV2.csv',
    '6-updated_Ramboll_WCRs.csv',
    '7-updated_AEM_CF.csv',
]

# Dictionary for renaming fields
rename_columns = {
    'WCRNUMBER': 'WellName',
    'GSE_ft': 'Zland',
    'AverageCoarseFraction': 'Coarse',
    'UTMX': 'X',
    'UTMY': 'Y',
    'INTERVALEND': 'Depth',
}

# List to hold dataframes
dataframes = []

# Read each CSV file and append to the list
for file_path in file_paths1:
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Rename columns based on the dictionary
        df.rename(columns={col: new_col for col, new_col in rename_columns.items() if col in df.columns}, inplace=True)
        
        # Add a new column with the file name
        df['SourceFile'] = file_path
        
        # Append the DataFrame to the list
        dataframes.append(df)
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

# Concatenate all dataframes
if dataframes:
    merged_df1 = pd.concat(dataframes, ignore_index=True)
    
    # Drop rows where all columns are NaN
    merged_df1 = merged_df1.dropna(how='all')
    
if 'WellName' in merged_df1.columns:
    # Drop rows with missing WellName
    merged_df1 = merged_df1.dropna(subset=['WellName'])

    # Sort the data to ensure correct Point numbering
    merged_df1 = merged_df1.sort_values(by=[ 'X', 'Y', 'Depth'], ascending=[ True, True, True])
    
    # Create Well column (unique numeric codes for WellName)
    merged_df1['Well'] = merged_df1['WellName'].astype('category').cat.codes + 1
    
    # Create Point column (count of occurrences within each WellName in sorted order)
    merged_df1['Point'] = merged_df1.groupby('WellName').cumcount() + 1
    
    # Display the resulting DataFrame
    print(merged_df1.head())
else:
    print("No dataframes to merge.")

  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


        SVSIM_NO        WellName SVSIM_ID  SVSIM_PT  SVSIM_PC  SSURGO  SWN  \
465177       NaN  WCR2023-007196      NaN       NaN       NaN     NaN  NaN   
465178       NaN  WCR2023-007196      NaN       NaN       NaN     NaN  NaN   
467065       NaN  WCR2017-004848      NaN       NaN       NaN     NaN  NaN   
467066       NaN  WCR2017-004848      NaN       NaN       NaN     NaN  NaN   
467061       NaN  WCR2022-011845      NaN       NaN       NaN     NaN  NaN   

       Local_ID CASGEM_MSC WCR_NO  ...  Avg Ss (1/L)  Avg Kv (ft/day)  \
465177      NaN        NaN    NaN  ...           NaN              NaN   
465178      NaN        NaN    NaN  ...           NaN              NaN   
467065      NaN        NaN    NaN  ...           NaN              NaN   
467066      NaN        NaN    NaN  ...           NaN              NaN   
467061      NaN        NaN    NaN  ...           NaN              NaN   

             SourceFile  ModIndex  GeophCT GeophCF LINE_NO  \
465177  OSWCRsT2PV2.csv       

In [7]:
# Drop columns that are no longer needed, ignoring missing columns
merged_df1.drop(columns=[
        'SVSIM_PC', 'SSURGO', 'Local_ID', 'CASGEM_MSC', 'WCR_NO', 'NEW_DESCRIPTION', 'DESCRIPTION',
        'COLORQUALIFIER', 'geometry', 'SVSIM_NO', 'SVSIM_NAME', 'SVSIM_ID', 'SVSIM_PT','Average Hydraulic Conductivity (ft/day)',
        'GeophCFstd', 'MeanRes', 'Upper', 'Lower', 'LINE_NO', 'LINE_NO_pa', 'SWN','Avg Ss (1/L)','INTERVALSTART',
        'Interval', 'ModIndex', 'GeophCT', 'GeophCTstd', 'TEXTUREQUALIFIER', 'Avg Specific Yield (%)','Avg Kv (ft/day)',
        'KEYWORDS', 'HydraulicConductivity','USCS','DECIMALLATITUDE','Z','ClayFraction',
    ], inplace=True, errors='ignore')

# Display the resulting DataFrame
merged_df1.head()

Unnamed: 0,WellName,X,Y,Zland,Depth,Coarse,SourceFile,GeophCF,Well,Point
465177,WCR2023-007196,525069.817561,4461330.0,927.0387436,15.0,17.5,OSWCRsT2PV2.csv,,307088,1
465178,WCR2023-007196,525069.817561,4461330.0,927.0387436,220.0,44.0,OSWCRsT2PV2.csv,,307088,2
467065,WCR2017-004848,530512.503921,4481274.0,1354.260214,65.0,90.0,OSWCRsT2PV2.csv,,290543,1
467066,WCR2017-004848,530512.503921,4481274.0,1354.260214,200.0,45.0,OSWCRsT2PV2.csv,,290543,2
467061,WCR2022-011845,531436.615099,4483232.0,1443.940007,38.0,90.0,OSWCRsT2PV2.csv,,304223,1


In [8]:
# Define the desired column order
desired_order = ['WellName', 'Well', 'Point', 'X', 'Y', 'Zland', 'Depth', 'Coarse']

# Reorder the DataFrame
combined_df = merged_df1[desired_order]

# Divide 'Coarse' by 100 and replace it with new values
combined_df['Coarse'] = combined_df['Coarse'] / 100

# Display the reordered DataFrame
combined_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['Coarse'] = combined_df['Coarse'] / 100


Unnamed: 0,WellName,Well,Point,X,Y,Zland,Depth,Coarse
465177,WCR2023-007196,307088,1,525069.817561,4461330.0,927.0387436,15.0,0.175
465178,WCR2023-007196,307088,2,525069.817561,4461330.0,927.0387436,220.0,0.44
467065,WCR2017-004848,290543,1,530512.503921,4481274.0,1354.260214,65.0,0.9
467066,WCR2017-004848,290543,2,530512.503921,4481274.0,1354.260214,200.0,0.45
467061,WCR2022-011845,304223,1,531436.615099,4483232.0,1443.940007,38.0,0.9


In [9]:
# Drop rows where the 'Zland' column contains '#VALUE!'
combined_df = combined_df[~combined_df['Zland'].astype(str).str.contains('#VALUE!', na=False)]

In [10]:
# Save the updated DataFrame to a new TSV file
output_file = '8-1-updated_all_T2PV2.tsv'
combined_df.to_csv(output_file, sep='\t', index=False)

print(f"Updated TSV file saved as '{output_file}'")

Updated TSV file saved as '8-1-updated_all_T2PV2.tsv'
