In [1]:
import os
import glob
import numpy as np
import pandas as pd

In [2]:
# File paths for your CSV files
file_paths = [
    '1-updated_geologiclog_freeform.csv',
    '2-updated_geologiclog_quick-pick.csv',
    '3-updated_geologiclog_USCS.csv',
    '4-updated_geologiclog_generalizedlithology.csv',
]

# Dictionary to store DataFrames loaded from each CSV file
dataframes = {}

# Load each CSV file into a DataFrame and store in the dictionary
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path)  # Load the CSV file into a DataFrame
        dataframes[file_path] = df  # Add the DataFrame to the dictionary
    except Exception as e:
        print(f"Error loading {file_path}: {e}")

# Print the number of unique WCRNUMBER values for each DataFrame (each CSV file)
for name, df in dataframes.items():
    if 'WCRNUMBER' in df.columns:
        unique_count = df['WCRNUMBER'].nunique()  # Find count of unique values in WCRNUMBER column
        print(f"Number of unique WCRNUMBER values in {name}: {unique_count}")
    else:
        print(f"WCRNUMBER column not found in {name}")

# Concatenate all dataframes
if dataframes:
    merged_df1 = pd.concat(dataframes.values(), ignore_index=True)
    merged_df1.head()
else:
    print("No dataframes to merge.")

Number of unique WCRNUMBER values in 1-updated_geologiclog_freeform.csv: 21531
Number of unique WCRNUMBER values in 2-updated_geologiclog_quick-pick.csv: 1778
Number of unique WCRNUMBER values in 3-updated_geologiclog_USCS.csv: 643
Number of unique WCRNUMBER values in 4-updated_geologiclog_generalizedlithology.csv: 180


In [3]:
#Merge well spatial data on 'WCRNUMBER'
df1 = pd.read_csv( "wellcompletionreports_Zland.csv")
merged_df = pd.merge(df1,merged_df1, on='WCRNUMBER', how='inner')
merged_df.head()

Unnamed: 0,OBJECTID *,WCRNUMBER,DECIMALLATITUDE,DECIMALLONGITUDE,Zland,DECIMALLAT,DECIMALLON,UTMX_y,UTMY_y,INTERVALSTART,...,SOILCLASSIFICATION,SOILCOLOR,SOILDESCRIPTION,TEXTUREQUALIFIER,TEXTUREMODIFIER2,COLOR1,COLOR2,COLORQUALIFIER,CLASSIFICATION,TEXTURE_MODIFIED
0,26,WCR2018-004181,35.0174,-119.289,1069.953118,35.0174,-119.289,838644.742317,3881271.0,0.0,...,,,,,,,,,,
1,26,WCR2018-004181,35.0174,-119.289,1069.953118,35.0174,-119.289,838644.742317,3881271.0,20.0,...,,,,,,,,,,
2,26,WCR2018-004181,35.0174,-119.289,1069.953118,35.0174,-119.289,838644.742317,3881271.0,40.0,...,,,,,,,,,,
3,26,WCR2018-004181,35.0174,-119.289,1069.953118,35.0174,-119.289,838644.742317,3881271.0,60.0,...,,,,,,,,,,
4,26,WCR2018-004181,35.0174,-119.289,1069.953118,35.0174,-119.289,838644.742317,3881271.0,80.0,...,,,,,,,,,,


In [4]:
# Drop columns that are no longer needed
merged_df.drop(columns=[
        'COLOR1' , 'COLOR2', 'COLORQUALIFIER', 'CLASSIFICATION','TEXTURE_MODIFIED' ,'SOILDESCRIPTION',
        'SOILCOLOR', 'SOILCLASSIFICATION' , 'MATERIALDESCRIPTION','COMBINED_MATERIALS','TEXTUREQUALIFIER','TEXTUREMODIFIER2',
        'MATERIALTYPE','Unnamed: 0','MATERIALTEXTURE','COLOR QUALIFIER','TEXTURE QUALIFIER',
        'NEW_DESCRIPTION', 'DESCRIPTION','MATERIALCOLOR','DECIMALLAT','DECIMALLON', 'Avg Kv (ft/day)','HydraulicConductivity',
        'Avg Specific Yield (%)','Avg Ss (1/L)','INTERVALSTART','DECIMALLATITUDE','DECIMALLONGITUDE',
            ], inplace=True)

rename_mapping = {
    'AverageCoarseFraction': 'Coarse',
    'INTERVALEND' : 'Depth',
    'UTMX_y' : 'X',
    'UTMY_y' : 'Y',
    'WCRNUMBER' : 'WellName',
    # Add as many as needed
}

# Rename the columns in the DataFrame
merged_df.rename(columns=rename_mapping, inplace=True)

merged_df.head()

Unnamed: 0,OBJECTID *,WellName,Zland,X,Y,Depth,KEYWORDS,USCS,Coarse
0,26,WCR2018-004181,1069.953118,838644.742317,3881271.0,20.0,"sand,mudstone","SP,CLSN",35.0
1,26,WCR2018-004181,1069.953118,838644.742317,3881271.0,40.0,"sand,gravel,mudstone","SP,GP,CLSN",53.333333
2,26,WCR2018-004181,1069.953118,838644.742317,3881271.0,60.0,"sand,clay,gravel","SP,CL,GP",51.666667
3,26,WCR2018-004181,1069.953118,838644.742317,3881271.0,80.0,"sand,clay,gravel","SP,CL,GP",51.666667
4,26,WCR2018-004181,1069.953118,838644.742317,3881271.0,100.0,"gravel,sand","GP,SP",75.0


In [5]:
# Calculate the total number of records in the merged_df DataFrame
total_records = len(merged_df)

# Print the total number of records
print(f"Total number of records in merged_df: {total_records}")

# Save the updated DataFrame to a new CSV file
output_file = 'OSWCRsT2PV2.csv'
merged_df.to_csv(output_file, index=False)

# Confirm that the file was saved
print(f"Updated CSV file saved as '{output_file}'")

# Check if the 'WCRNUMBER' column exists in the merged_df
if 'WellName' in merged_df.columns:
    unique_count = merged_df['WellName'].nunique()  # Find count of unique values
    print(f"Number of unique WCRNUMBER values in merged_df: {unique_count}")
else:
    print("WellName column not found in merged_df")

Total number of records in merged_df: 427675
Updated CSV file saved as 'OSWCRsT2PV2.csv'
Number of unique WCRNUMBER values in merged_df: 23759


In [6]:
# File paths for your CSV files
file_paths1 = [
    'OSWCRsT2PV2.csv',
    '6-updated_Ramboll_WCRs.csv',
    '7-updated_AEM_CF.csv',
    '11-updated_SVSim.csv',
]

# Dictionary for renaming fields
rename_columns = {
    'WCRNUMBER': 'WellName',
    'GSE_ft': 'Zland',
    'AverageCoarseFraction': 'Coarse',
    'UTMX': 'X',
    'UTMY': 'Y',
    'INTERVALEND': 'Depth',
    'Z': 'Zland',
}

# List to hold dataframes
dataframes = []

# Read each CSV file and append to the list
for file_path in file_paths1:
    try:
        df1 = pd.read_csv(file_path)
        df1.rename(columns={col: new_col for col, new_col in rename_columns.items() if col in df1.columns}, inplace=True)
        dataframes.append((file_path, df1))
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

# Concatenate all dataframes
if dataframes:
    combined_df = pd.concat([df1 for _, df1 in dataframes], ignore_index=True)

    if 'WellName' in combined_df.columns:
        combined_df = combined_df.dropna(subset=['WellName'])  # Drop rows with missing WellName
        combined_df['Well'] = combined_df['WellName'].astype('category').cat.codes + 1
        combined_df['Point'] = combined_df.groupby('WellName').cumcount() + 1

    print(combined_df.head())
else:
    print("No dataframes to merge.")

  df1 = pd.read_csv(file_path)


   OBJECTID *        WellName        Zland              X             Y  \
0        26.0  WCR2018-004181  1069.953118  838644.742317  3.881271e+06   
1        26.0  WCR2018-004181  1069.953118  838644.742317  3.881271e+06   
2        26.0  WCR2018-004181  1069.953118  838644.742317  3.881271e+06   
3        26.0  WCR2018-004181  1069.953118  838644.742317  3.881271e+06   
4        26.0  WCR2018-004181  1069.953118  838644.742317  3.881271e+06   

   Depth              KEYWORDS        USCS     Coarse DESCRIPTION  ...  \
0   20.0         sand,mudstone     SP,CLSN  35.000000         NaN  ...   
1   40.0  sand,gravel,mudstone  SP,GP,CLSN  53.333333         NaN  ...   
2   60.0      sand,clay,gravel    SP,CL,GP  51.666667         NaN  ...   
3   80.0      sand,clay,gravel    SP,CL,GP  51.666667         NaN  ...   
4  100.0           gravel,sand       GP,SP  75.000000         NaN  ...   

   SSURGO  SWN  Local_ID  CASGEM_MSC  WCR_NO  NEW_DESCRIPTION  COLORQUALIFIER  \
0     NaN  NaN       Na

In [7]:
# Drop columns that are no longer needed, ignoring missing columns
combined_df.drop(columns=[
        'SVSIM_PC', 'SSURGO', 'Local_ID', 'CASGEM_MSC', 'WCR_NO', 'NEW_DESCRIPTION', 'DESCRIPTION',
        'COLORQUALIFIER', 'geometry', 'SVSIM_NO', 'SVSIM_NAME', 'SVSIM_ID', 'SVSIM_PT','Average Hydraulic Conductivity (ft/day)',
        'GeophCFstd', 'MeanRes', 'Upper', 'Lower', 'LINE_NO', 'LINE_NO_pa', 'SWN','Avg Ss (1/L)','INTERVALSTART',
        'Interval', 'ModIndex', 'GeophCT', 'GeophCTstd', 'TEXTUREQUALIFIER', 'Avg Specific Yield (%)','Avg Kv (ft/day)',
        'KEYWORDS', 'HydraulicConductivity','USCS','DECIMALLATITUDE','Z',
    ], inplace=True, errors='ignore')

# Display the resulting DataFrame
combined_df.head()

Unnamed: 0,OBJECTID *,WellName,Zland,X,Y,Depth,Coarse,Well,Point
0,26.0,WCR2018-004181,1069.953118,838644.742317,3881271.0,20.0,35.0,202663,1
1,26.0,WCR2018-004181,1069.953118,838644.742317,3881271.0,40.0,53.333333,202663,2
2,26.0,WCR2018-004181,1069.953118,838644.742317,3881271.0,60.0,51.666667,202663,3
3,26.0,WCR2018-004181,1069.953118,838644.742317,3881271.0,80.0,51.666667,202663,4
4,26.0,WCR2018-004181,1069.953118,838644.742317,3881271.0,100.0,75.0,202663,5


In [8]:
# Define the desired column order
desired_order = ['WellName', 'Well', 'Point', 'X', 'Y', 'Zland', 'Depth', 'Coarse']

# Reorder the DataFrame
combined_df = combined_df[desired_order]

# Display the reordered DataFrame
print(combined_df.head())

         WellName    Well  Point              X             Y        Zland  \
0  WCR2018-004181  202663      1  838644.742317  3.881271e+06  1069.953118   
1  WCR2018-004181  202663      2  838644.742317  3.881271e+06  1069.953118   
2  WCR2018-004181  202663      3  838644.742317  3.881271e+06  1069.953118   
3  WCR2018-004181  202663      4  838644.742317  3.881271e+06  1069.953118   
4  WCR2018-004181  202663      5  838644.742317  3.881271e+06  1069.953118   

   Depth     Coarse  
0   20.0  35.000000  
1   40.0  53.333333  
2   60.0  51.666667  
3   80.0  51.666667  
4  100.0  75.000000  


In [9]:
# Save the updated DataFrame to a new TSV file
output_file = '8-1-updated_all_T2PV2.tsv'
combined_df.to_csv(output_file, sep='\t', index=False)

print(f"Updated TSV file saved as '{output_file}'")

Updated TSV file saved as '8-1-updated_all_T2PV2.tsv'
