In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re

In [2]:
# Journal display name -> folder (relative to the notebook) holding its WOS exports.
# Insertion order is the order in which the journals are processed.
paths = dict((
    ("Social Networks", "Social Networks/WOS data/"),
    ("Network Science", "Network Science/WOS data/"),
    ("Journal of Complex Networks", "Journal of Complex Networks/WOS data/"),
))

# --- Web of Science (WOS) plain-text export parsing -------------------------

# A tagged line: a two-character field tag at column 0, optionally followed by
# a space and the field value (e.g. "AU Smith, J", or a bare "ER").
_WOS_TAG_LINE = re.compile(r"^([A-Z][A-Z0-9])(?: (.*))?$")


def _iter_wos_records(text):
    """Yield one ``{tag: [value lines]}`` dict per record found in ``text``.

    Lines are attributed to fields by their position, not by searching the
    whole record: a tag only counts at the start of a line, and an indented
    line is a continuation of the most recent tag.
    """
    fields = {}
    current_tag = None
    for raw_line in text.split("\n"):
        line = raw_line.rstrip()
        if not line:
            continue
        if line[0] in " \t":
            # Continuation line: extra value for the field opened above it.
            if current_tag is not None:
                fields[current_tag].append(line.strip())
            continue
        match = _WOS_TAG_LINE.match(line)
        if match is None:
            # Neither a tag nor a continuation; nothing sensible to attach it to.
            continue
        tag = match.group(1)
        value = (match.group(2) or "").strip()
        if tag == "ER":
            # End of record.
            if fields:
                yield fields
            fields, current_tag = {}, None
        elif tag == "EF":
            # End of file marker: must not be turned into a (blank) record.
            break
        elif tag in ("FN", "VR"):
            # File header lines, not part of any record.
            current_tag = None
        else:
            current_tag = tag
            bucket = fields.setdefault(tag, [])
            if value:
                bucket.append(value)
    if fields:
        # Trailing record without a closing "ER" (e.g. truncated export).
        yield fields


def _joined(fields, tag):
    """Return the lines of ``tag`` joined with spaces, or None if absent/empty."""
    lines = fields.get(tag)
    return " ".join(lines) if lines else None


def _paper_from_fields(fields):
    """Build the per-paper dict (same keys/order as before) from parsed fields."""
    tc_match = re.match(r"\d+", _joined(fields, "TC") or "")
    # One entry per keyword field present ('DE' first, then 'ID'); each entry is
    # the raw field text, wrapped lines joined with a space.
    keywords = [text for text in (_joined(fields, "DE"), _joined(fields, "ID")) if text]
    return {
        'Document Type': _joined(fields, "PT"),   # read from the 'PT' tag
        'Authors': list(fields.get("AU", [])),      # one author per line
        'Title': _joined(fields, "TI"),
        'Journal': _joined(fields, "SO"),
        'Year': _joined(fields, "PY"),              # kept as a string
        'Citations': int(tc_match.group(0)) if tc_match else 0,
        'Keywords': keywords,
        'DOI': _joined(fields, "DI"),
        'Affiliations': list(fields.get("RP", [])),
        'References': list(fields.get("CR", [])),   # one cited reference per line
    }


def process_wos_files(input_directory):
    """Parse every ``.txt`` WOS export in ``input_directory`` into paper dicts.

    Parameters
    ----------
    input_directory : str
        Folder containing WOS tagged plain-text exports. Files whose name does
        not end in ``.txt`` are ignored.

    Returns
    -------
    list of dict
        One dict per record with the keys 'Document Type', 'Authors', 'Title',
        'Journal', 'Year', 'Citations', 'Keywords', 'DOI', 'Affiliations' and
        'References'. Scalar fields are ``None`` when missing ('Citations' is
        0), list fields are empty lists.

    Raises
    ------
    OSError
        If ``input_directory`` cannot be listed. Errors while reading or
        parsing an individual file are reported with ``print`` and that file
        is skipped, so one bad export does not abort the whole run.
    """
    papers = []
    # Sorted so the row order of the result does not depend on the filesystem.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(input_directory, filename)
        try:
            # 'utf-8-sig' also accepts files that start with a byte-order mark.
            with open(filepath, 'r', encoding='utf-8-sig') as file:
                data = file.read()
            papers.extend(_paper_from_fields(fields) for fields in _iter_wos_records(data))
        except Exception as e:
            print(f"Error processing file {filepath}: {e}")
    return papers

# Build one CSV per journal from the records parsed out of its WOS folder.
for journal in paths:
    path = paths[journal]
    print(f"Processing files for {journal}...")
    try:
        # Parse every export in the folder, then tabulate one row per paper
        journal_data = process_wos_files(path)
        df = pd.DataFrame(journal_data)

        # File name: journal title with spaces replaced by underscores
        output_file = f"{journal.replace(' ', '_')}_data.csv"
        df.to_csv(path_or_buf=output_file, index=False)
        print(f"Data for {journal} saved to {output_file}")
    except Exception as e:
        # Report the failure and carry on with the remaining journals
        print(f"Error processing {journal}: {e}")


Processing files for Social Networks...
Data for Social Networks saved to Social_Networks_data.csv
Processing files for Network Science...
Data for Network Science saved to Network_Science_data.csv
Processing files for Journal of Complex Networks...
Data for Journal of Complex Networks saved to Journal_of_Complex_Networks_data.csv
