CSV Merge

In [1]:
import pandas as pd

def merge_and_save_csv_files(file1, file2, output_path):
    """
    Merges two CSV files into one and saves the merged data as a CSV file.

    Parameters:
    file1 (str): Path to the first CSV file.
    file2 (str): Path to the second CSV file.
    output_path (str): Path to save the merged CSV file.

    Returns:
    DataFrame: The merged DataFrame.
    """
    
    # Read the two CSV files into DataFrames
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # Merge the DataFrames (concatenate them along the rows)
    merged_df = pd.concat([df1, df2], ignore_index=True)

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv(output_path, index=False)

    # Return the merged DataFrame
    return merged_df

# Example usage
file1 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data1.csv'
file2 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data2.csv'
output_path = '/notebooks/Mine-folder/output/FWU3/currentCSVs/HHEDcsv.csv'

# Merge the CSV files and save the result
merged_dataframe = merge_and_save_csv_files(file1, file2, output_path)

# Print some information about the merged DataFrame
print(merged_dataframe.info())
print(merged_dataframe.head())
print(merged_dataframe.tail())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     58 non-null     int64  
 1   network        58 non-null     object 
 2   station        58 non-null     object 
 3   location       58 non-null     int64  
 4   channel        58 non-null     object 
 5   starttime      58 non-null     object 
 6   endtime        58 non-null     object 
 7   sampling_rate  58 non-null     float64
 8   data           58 non-null     object 
dtypes: float64(1), int64(2), object(6)
memory usage: 4.2+ KB
None
   Unnamed: 0 network station  location channel            starttime  \
0           0      ZZ    FWU3        10     HHE  2019-12-26T00:00:00   
1           1      ZZ    FWU3        10     HHE  2019-11-02T00:00:00   
2           2      ZZ    FWU3        10     HHE  2019-12-22T00:00:00   
3           3      ZZ    FWU3        10     HHE  2019-12-15

In [3]:
import pandas as pd

def findUnnamedColumn(file_path):
    df = pd.read_csv(file_path)
    return 'Unnamed' in df.columns.tolist()

def main():
    file1 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data1.csv'
    file2 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data2.csv'

    print(f"Checking file: {file1}")
    hasUnnamed1 = findUnnamedColumn(file1)
    
    print(f"\nChecking file: {file2}")
    hasUnnamed2 = findUnnamedColumn(file2)

    if hasUnnamed1:
        print(f"\nFile '{file1}' contains the Unnamed column.")
    else:
        print(f"\nFile '{file1}' does not contain the Unnamed column.")

    if hasUnnamed2:
        print(f"\nFile '{file2}' contains the Unnamed column.")
    else:
        print(f"\nFile '{file2}' does not contain the Unnamed column.")

if __name__ == "__main__":
    main()

Checking file: /notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data1.csv

Checking file: /notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data2.csv

File '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data1.csv' does not contain the Unnamed column.

File '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data2.csv' does not contain the Unnamed column.


In [4]:
import pandas as pd

def merge_csv_files(file1, file2, output_path):
    # Read both CSV files
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    print(f"Shape of df1: {df1.shape}")
    print(f"Columns in df1: {list(df1.columns)}")
    print(f"\nShape of df2: {df2.shape}")
    print(f"Columns in df2: {list(df2.columns)}")

    # Check for common columns
    common_columns = set(df1.columns).intersection(set(df2.columns))
    print(f"\nCommon columns: {common_columns}")

    # Concatenate DataFrames
    merged_df = pd.concat([df1, df2], axis=0, ignore_index=True)

    print(f"\nShape of merged_df: {merged_df.shape}")
    print(f"Columns in merged_df: {list(merged_df.columns)}")

    return merged_df

# Usage
file1 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data1.csv'
file2 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data2.csv'
output_path = '/notebooks/Mine-folder/output/FWU3/currentCSVs/HHEDcsv.csv'

# Merge the CSV files
merged_dataframe = merge_csv_files(file1, file2, output_path)

print("\nMerged DataFrame information:")
print(merged_dataframe.info())
print(merged_dataframe.head())

Shape of df1: (33, 9)
Columns in df1: ['Unnamed: 0', 'network', 'station', 'location', 'channel', 'starttime', 'endtime', 'sampling_rate', 'data']

Shape of df2: (25, 9)
Columns in df2: ['Unnamed: 0', 'network', 'station', 'location', 'channel', 'starttime', 'endtime', 'sampling_rate', 'data']

Common columns: {'network', 'channel', 'station', 'data', 'endtime', 'starttime', 'Unnamed: 0', 'sampling_rate', 'location'}

Shape of merged_df: (58, 9)
Columns in merged_df: ['Unnamed: 0', 'network', 'station', 'location', 'channel', 'starttime', 'endtime', 'sampling_rate', 'data']

Merged DataFrame information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     58 non-null     int64  
 1   network        58 non-null     object 
 2   station        58 non-null     object 
 3   location       58 non-null     int64  
 4   channel        58

In [5]:
import pandas as pd

def inspect_csv(file_path):
    # Read the CSV file with minimal processing
    df = pd.read_csv(file_path, nrows=5, header=None)
    
    print(f"\nInspecting {file_path}:")
    print(df.head())
    print("\nColumn names:")
    print(df.columns.tolist())

def main():
    file1 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data1.csv'
    file2 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data2.csv'

    inspect_csv(file1)
    inspect_csv(file2)

if __name__ == "__main__":
    main()


Inspecting /notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data1.csv:
     0        1        2         3        4                    5  \
0  NaN  network  station  location  channel            starttime   
1  0.0       ZZ     FWU3        10      HHE  2019-12-26T00:00:00   
2  1.0       ZZ     FWU3        10      HHE  2019-11-02T00:00:00   
3  2.0       ZZ     FWU3        10      HHE  2019-12-22T00:00:00   
4  3.0       ZZ     FWU3        10      HHE  2019-12-15T00:00:00   

                            6              7  \
0                     endtime  sampling_rate   
1  2019-12-26T23:59:59.996000          250.0   
2  2019-11-02T23:59:59.996000          250.0   
3  2019-12-22T23:59:59.996000          250.0   
4  2019-12-15T23:59:59.996000          250.0   

                                                 8  
0                                             data  
1  [ -2718   -384  -1595 ...  11287 -15821  -3071]  
2        [ 1313  1352  1389 ... -1601   130  1031]  
3    

In [10]:
import pandas as pd

def merge_csv_files(file1, file2, output_path):
    # Read the CSV files, assuming the first row is the header
    df1 = pd.read_csv(file1, header=0)
    df2 = pd.read_csv(file2, header=0)

    # Ensure consistent column names
    common_columns = set(df1.columns).intersection(set(df2.columns))
    df1 = df1[list(common_columns)]
    df2 = df2[list(common_columns)]

    # Concatenate the DataFrames
    merged_df = pd.concat([df1, df2], axis=0, ignore_index=True)

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv(output_path, index=False)

    return merged_df

# Usage
file1 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data5.csv'
file2 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/merged_seismic_data6.csv'
output_path = '/notebooks/Mine-folder/output/FWU3/currentCSVs/HHZDcsv1.csv'

# Merge the CSV files
merged_dataframe = merge_csv_files(file1, file2, output_path)

print("\nMerged DataFrame information:")
print(merged_dataframe.info())
print(merged_dataframe.head())


Merged DataFrame information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   network        57 non-null     object 
 1   channel        57 non-null     object 
 2   station        57 non-null     object 
 3   data           57 non-null     object 
 4   endtime        57 non-null     object 
 5   starttime      57 non-null     object 
 6   Unnamed: 0     57 non-null     int64  
 7   sampling_rate  57 non-null     float64
 8   location       57 non-null     int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 4.1+ KB
None
  network channel station                                       data  \
0      ZZ     HHZ    FWU3  [ 2569  -655   825 ... -1025  -807  -208]   
1      ZZ     HHZ    FWU3  [ 1627   296 -2155 ... -2112  2282   628]   
2      ZZ     HHZ    FWU3        [-404 -159 -254 ... 1175 1265 1308]   
3      ZZ     HHZ    FWU3  [

In [14]:
import pandas as pd

def merge_csv_files(file1, file2, file3, output_path):
    # Read the CSV files, assuming the first row is the header
    df1 = pd.read_csv(file1, header=0)
    df2 = pd.read_csv(file2, header=0)
    df3 = pd.read_csv(file3, header=0)

    # Ensure consistent column names
    common_columns = set(df1.columns).intersection(set(df2.columns)).intersection(set(df3.columns))
    df1 = df1[list(common_columns)]
    df2 = df2[list(common_columns)]
    df3 = df3[list(common_columns)]

    # Concatenate the DataFrames
    merged_df = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv(output_path, index=False)

    return merged_df

# Usage
file1 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/HHEDcsv1.csv'
file2 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/HHNDcsv1.csv'
file3 = '/notebooks/Mine-folder/output/FWU3/currentCSVs/HHZDcsv1.csv'
output_path = '/notebooks/Mine-folder/output/FWU3/currentCSVs/FWU3_csv.csv'

# Merge the CSV files
merged_dataframe = merge_csv_files(file1, file2, file3, output_path)

print("\nMerged DataFrame information:")
print(merged_dataframe.info())
print(merged_dataframe.head())
print(merged_dataframe.tail())


Merged DataFrame information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173 entries, 0 to 172
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   network        173 non-null    object 
 1   channel        173 non-null    object 
 2   station        173 non-null    object 
 3   data           173 non-null    object 
 4   endtime        173 non-null    object 
 5   starttime      173 non-null    object 
 6   Unnamed: 0     173 non-null    int64  
 7   sampling_rate  173 non-null    float64
 8   location       173 non-null    int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 12.3+ KB
None
  network channel station                                             data  \
0      ZZ     HHE    FWU3  [ -2718   -384  -1595 ...  11287 -15821  -3071]   
1      ZZ     HHE    FWU3        [ 1313  1352  1389 ... -1601   130  1031]   
2      ZZ     HHE    FWU3        [-7410 -5221  3344 ... -3956  4921 11622]   
3