In [3]:
import pandas as pd
import os

# File paths
file1_path = '/home/aricept094/mydata/csv-DigitalHea-set.csv'
file2_path = '/home/aricept094/mydata/scopus.csv'

# --- Approach 1: Concatenation (Stacking) ---
try:
    df1_concat = pd.read_csv(file1_path)
    df2_concat = pd.read_csv(file2_path)

    combined_df_concat = pd.concat([df1_concat, df2_concat], ignore_index=True) # ignore_index resets index

    # Save the concatenated file
    output_concat_path = '/home/aricept094/mydata/combined_concatenated.csv'
    combined_df_concat.to_csv(output_concat_path, index=False)
    print(f"Concatenated file saved to: {output_concat_path}")

except FileNotFoundError:
    print("Error: One or both of the input files were not found.")
except Exception as e:
    print(f"An error occurred during concatenation: {e}")


# --- Approach 2: Full Outer Join (Merging) ---
try:
    df1_merge = pd.read_csv(file1_path)
    df2_merge = pd.read_csv(file2_path)

    # Identify potential common columns for merging.
    # Let's try merging on 'Title' and 'Authors' (you might need to adjust based on your data)
    # and also consider 'DOI' if present and reliable in both.

    # Normalize column names for potential merging (optional but good practice)
    df1_merge.rename(columns={'Title': 'Title', 'Authors': 'Authors', 'DOI': 'DOI', 'PubMed ID': 'PMID'}, inplace=True) # Rename relevant columns in df1
    df2_merge.rename(columns={'Title': 'Title', 'Authors': 'Authors', 'DOI': 'DOI', 'PMID': 'PMID'}, inplace=True) # Rename relevant columns in df2

    # Full Outer Join on 'Title' and 'Authors' AND 'DOI' (you can adjust the keys)
    # We'll try merging on DOI first if available, then maybe Title and Authors.
    # For simplicity, let's just use DOI and Title for now. You might need to refine this.

    # Merge on DOI if DOI is available in both (adjust column names if needed)
    if 'DOI' in df1_merge.columns and 'DOI' in df2_merge.columns:
        combined_df_doi_merged = pd.merge(df1_merge, df2_merge, on='DOI', how='outer', suffixes=('_file1', '_file2'))
        output_merged_doi_path = '/home/aricept094/mydata/combined_merged_doi.csv'
        combined_df_doi_merged.to_csv(output_merged_doi_path, index=False)
        print(f"Merged file (on DOI) saved to: {output_merged_doi_path}")
    else:
        print("Warning: DOI column not found in both files, skipping DOI merge.")
        combined_df_doi_merged = None # or handle differently

    # If DOI merge was done, we have a result. If not, or if you want to also merge on Title (less reliable)
    if combined_df_doi_merged is None: # Or always do this in addition to DOI
        combined_df_title_merged = pd.merge(df1_merge, df2_merge, on='Title', how='outer', suffixes=('_file1', '_file2'))
        output_merged_title_path = '/home/aricept094/mydata/combined_merged_title.csv'
        combined_df_title_merged.to_csv(output_merged_title_path, index=False)
        print(f"Merged file (on Title) saved to: {output_merged_title_path}")


except FileNotFoundError:
    print("Error: One or both of the input files were not found.")
except Exception as e:
    print(f"An error occurred during merging: {e}")

print("Script execution completed.")

Concatenated file saved to: /home/aricept094/mydata/combined_concatenated.csv
Merged file (on DOI) saved to: /home/aricept094/mydata/combined_merged_doi.csv
Script execution completed.


In [5]:
import pandas as pd

file_path = '/home/aricept094/mydata/combined_merged_doi.csv'

try:
    merged_df = pd.read_csv(file_path)
    print("Successfully loaded combined_merged_doi.csv")

    # 1. Shape of the DataFrame
    print("\nDataFrame Shape (rows, columns):", merged_df.shape)

    # 2. Percentage of Missing Values (NaN) per column
    print("\nPercentage of Missing Values per Column:")
    null_percentages = merged_df.isnull().sum() / len(merged_df) * 100
    print(null_percentages.sort_values(ascending=False)) # Sort by % NaN descending

    # 3. Display the first few rows
    print("\nFirst 5 rows of the DataFrame:")
    print(merged_df.head())

    # 4. Column names to observe suffixes
    print("\nColumn Names (showing first few):")
    print(merged_df.columns[:20]) # Print first 20 column names

except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
except Exception as e:
    print(f"An error occurred while reading or exploring the file: {e}")

  merged_df = pd.read_csv(file_path)


Successfully loaded combined_merged_doi.csv

DataFrame Shape (rows, columns): (1576586, 44)

Percentage of Missing Values per Column:
Open Access                      99.664465
Art. No.                         98.905737
DOI                              98.734798
NIHMS ID                         97.739673
Editors                          97.047798
ISBN                             95.353758
Author Keywords                  85.921732
Publisher                        85.116511
Correspondence Address           84.687229
CODEN                            83.962752
PMCID                            81.065162
Authors with affiliations        58.983525
Affiliations                     58.983525
Page count                       18.785147
Page end                         16.495072
Issue                            13.255287
Volume                           12.967260
PubMed ID                        12.858417
Authors_file2                     9.896067
Author(s) ID                      9.896067
Author