In [None]:
# Data preprocessing: parse the raw article export into a CSV file with one row per article
# and one column per field tag, then keep only the useful columns and drop duplicated PMIDs.

"""
df shape before cleaning:(74243, 77)
df shape after cleaning:(57560, 15)

Index(['PMID', 'STAT', 'DRDT', 'CTDT', 'PB', 'DP', 'TI', 'BTI', 'AB', 'CI',
       'FED', 'ED', 'FAU', 'AU', 'AD', 'LA', 'PT', 'PL', 'OTO', 'OT', 'EDAT',
       'CRDT', 'AID', 'OWN', 'DCOM', 'LR', 'IS', 'VI', 'IP', 'PG', 'LID',
       'DEP', 'TA', 'JT', 'JID', 'SB', 'MH', 'MHDA', 'PHST', 'PST', 'SO', 'GR',
       'PMC', 'MID', 'COIS', 'TT', 'RN', 'OID', 'SI', 'ISBN', 'CTI', 'CN',
       'FIR', 'IR', 'AUID', 'EIN', 'CIN', 'PS', 'FPS', 'CON', 'UOF', 'UIN',
       'RIN', 'IRAD', 'EFR', 'OAB', 'OABL', 'PMCR', 'CP', 'ECI', 'DRIN', 'RF',
       'EN', 'ROF', 'RPI', 'RPF', 'DDIN'],
      dtype='object')


Guide to the abbreviations:

    PMID: PubMed IDentifier - Unique identifier for a PubMed record.

    TI: Title - The title of the article.

    AB: Abstract - A brief summary of the article's content.

    PB: Publisher - The organization responsible for publishing the article.

    FAU: Full Author(s) - Full names of the authors.

    FED: Full Editor(s) - Full names of the editors.

    DP: Date of Publication - Date when the article was published.

    OTO: Other Term Owner - Owner of other terms.

    OT: Other Term - Additional terms or keywords associated with the article.

    OWN: Owner - Acronym of the organization that supplied the citation data.

    DCOM: Date Completed - Date when processing of the record was completed.

    LR: Last Revision - Last revision date.

    JT: Journal Title - Full title of the journal.

    MH: MeSH Terms - Medical Subject Headings.

    ISBN: International Standard Book Number - ISBN of the article.

[Removed]    STAT: Status - Indicates the status of the publication.

[Removed]    DRDT: Date Revised - Revision date of a book record.

[Removed]    CTDT: Contribution Date - Contribution date of a book record.

[Removed]    BTI: Book Title - Title of the book the record belongs to.

[Removed]    CI: Copyright Information - Information about the copyright holder.

[Removed]    ED: Editor - Abbreviation for the editor.

[Removed]    AU: Author - Abbreviation for the author.

[Removed]    AD: Author's Affiliation - Affiliation or institution of the author.

[Removed]    LA: Language - Language of the article.

[Removed]    PT: Publication Type - Type of publication (e.g., Review, Book Chapter).

[Removed]    PL: Place of Publication - Location where the article was published.

[Removed]    EDAT: Entrez Date - Date the record was added to the Entrez database.

[Removed]    CRDT: Create Date - Date the record was created.

[Removed]    AID: Article Identifier - Identifier associated with the article.

[Removed]    IS: ISSN - International Standard Serial Number of the journal.

[Removed]    VI: Volume - Volume number of the journal.

[Removed]    IP: Issue - Issue, part or supplement number of the journal.

[Removed]    PG: Pagination - Full pagination of the article.

[Removed]    LID: Location IDentifier - Identifier for the location of the article.

[Removed]    DEP: Date of Electronic Publication - Date of electronic publication.

[Removed]    TA: Journal Title (ISO abbreviation) - Title abbreviation of the journal.

[Removed]    JID: Journal ID - Identifier for the journal.

[Removed]    SB: Subset - Subset designation.

[Removed]    MHDA: MeSH Date - MeSH date.

[Removed]    PHST: Publication History Status - Publication history status.

[Removed]    PST: Publication Status - Publication status.

[Removed]    SO: Source - Source of the article.

[Removed]    GR: Grant - Grant information.

[Removed]    PMC: PubMed Central ID - Identifier for PubMed Central.

[Removed]    MID: Manuscript ID - Identifier for the manuscript.

[Removed]    COIS: Conflict of Interest Statement - Statement about potential conflicts of interest.

[Removed]    TT: Transliterated Title - Title of an article published in a non-English language, in that language.

[Removed]    RN: Registry Number - Registry number.

[Removed]    OID: Other ID - Identification numbers provided by organizations supplying citation data.

[Removed]    SI: Secondary Source ID - Secondary source identifier.

[Removed]    CTI: Collection Title - Title of the collection a book record belongs to.

[Removed]    CN: Corporate Author - Corporate author or group name with authorship responsibility.

[Removed]    FIR: Full Investigator(s) - Full names of the investigators.

[Removed]    IR: Investigator - Abbreviation for the investigator.

[Removed]    AUID: Author ID - Identifier for the author.

[Removed]    EIN: Erratum In - Reference to a published erratum for the article.

[Removed]    CIN: Comment In - Reference to a comment on the article published elsewhere.

[Removed]    PS: Personal Name as Subject - Personal name as subject.

[Removed]    FPS: Full Personal Name as Subject - Full personal name as subject.

[Removed]    CON: Comment On - Reference to the article this record comments on.

[Removed]    UOF: Update Of - Reference to the article this record updates.

[Removed]    UIN: Update In - Reference to an article that updates this record.

[Removed]    RIN: Retraction In - Reference to the retraction notice for the article.

[Removed]    IRAD: Investigator Affiliation - Affiliation of the investigator.

[Removed]    EFR: Erratum For - Reference to the article this erratum corrects.

[Removed]    OAB: Other Abstract - Abstract supplied by an organization other than the publisher.

[Removed]    OABL: Other Abstract Language - Language of the other abstract.

[Removed]    PMCR: PubMed Central Release - PubMed Central release information.

[Removed]    CP: Chapter - Chapter of a book record.

[Removed]    ECI: Expression of Concern In - Reference to an expression of concern about the article.

[Removed]    DRIN: Dataset Use Reported In - Reference to an article reporting use of the dataset.

[Removed]    RF: Number of References - Number of bibliographic references (review articles).

[Removed]    EN: Edition - Edition of a book record.

[Removed]    ROF: Retraction Of - Reference to the article being retracted.

[Removed]    RPI: Republished In - Reference to the republished version of the article.

[Removed]    RPF: Republished From - Reference to the original of a republished article.

[Removed]    DDIN: Dataset Described In - Reference to an article describing the dataset.
"""



import re
import pandas as pd

# Load the entire raw export into memory as one string.
with open('../data_pool/articles.txt', mode='r', encoding='utf-8') as f:
    input_text = f.read()

# Consecutive article records are separated by a line that holds only a
# double quote character.
record_boundary = r'\n"\n'
articles = re.split(record_boundary, input_text.strip())

# Parse one tagged article record into a {tag: value} dictionary.
# A field line looks like "TAG - value", where TAG is 2-4 uppercase letters.
_FIELD_LINE = re.compile(r'^([A-Z]{2,4})\s*- (.+)$')


def _store_field(data, key, parts):
    """Join the collected pieces of one field and add them to ``data``."""
    value = ' '.join(part for part in parts if part)
    if key in data:
        # Repeated tags (several occurrences of the same key) are joined
        # with '|'.
        data[key] += '|' + value
    else:
        data[key] = value


def extract_data(article):
    """Parse a single article record into a dict keyed by field tag.

    Every line matching ``TAG - value`` starts a new field. Any other
    non-blank line is treated as a continuation of the current field and is
    appended to it, separated by a single space so that wrapped text does
    not glue words together. When a tag occurs more than once, its values
    are joined with ``'|'`` in order of appearance (including when the
    repeated tag is the last field of the record).

    Args:
        article: Text of one record, with lines separated by newlines.

    Returns:
        A dict mapping each tag found in the record to its string value.
        Lines appearing before the first tagged line are ignored.
    """
    data = {}
    current_key = None
    current_parts = []

    for line in article.split('\n'):
        match = _FIELD_LINE.match(line)

        if match:
            # A new field begins: flush the one collected so far.
            if current_key is not None:
                _store_field(data, current_key, current_parts)
            current_key, first_part = match.groups()
            current_parts = [first_part.strip()]
        elif current_key is not None:
            continuation = line.strip()
            if continuation:
                current_parts.append(continuation)

    # Flush the final field; it must be appended, not overwritten, so that a
    # repeated trailing tag keeps all of its values.
    if current_key is not None:
        _store_field(data, current_key, current_parts)

    return data

# Turn every raw record into its tag -> value mapping
article_data_list = list(map(extract_data, articles))

# Records that carry no abstract ('AB') are discarded
filtered_data_list = list(filter(lambda record: 'AB' in record, article_data_list))

# One row per remaining record, one column per tag
df = pd.DataFrame(filtered_data_list)

# Restrict to the useful columns:  df shape before cleaning:(74243, 77)
useful_columns = [
    'PMID', 'TI', 'AB', 'PB', 'FAU', 'FED', 'DP', 'OTO',
    'OT', 'OWN', 'DCOM', 'LR', 'JT', 'MH', 'ISBN',
]
df = df[useful_columns]

# Keep the first row for every PMID : df shape after cleaning:(57560, 15)
df = df.drop_duplicates(subset='PMID', keep='first')

# Persist the cleaned table, without the positional index
df.to_csv('../data_pool/articles.csv', index=False)


In [None]:
# Combine different columns of the dataset into one column
import pandas as pd

# Load only the columns that feed the combined text, indexed by PMID
wanted_columns = ['PMID', 'TI', 'AB', 'FAU', 'DP', 'OT', 'JT', 'MH']
df_part = pd.read_csv('articles.csv', index_col='PMID', usecols=wanted_columns)

# The output frame shares the PMID index of the source frame
new_df = pd.DataFrame(index=df_part.index)

# (label, column) pairs in the order they appear in the combined text.
# Missing values are rendered as the literal string 'None'.
labelled_fields = [
    ('Title: ', 'TI'),
    ('Abstract: ', 'AB'),
    ('Authors: ', 'FAU'),
    ('Data of Publication: ', 'DP'),
    ('Terms or keywords associated with the article: ', 'OT'),
    ('Journal Title: ', 'JT'),
    ('Medical subject headings: ', 'MH'),
]

# Build the text left to right, one newline between consecutive fields
(first_label, first_column), *remaining_fields = labelled_fields
combined_info = first_label + df_part[first_column].fillna('None')
for label, column in remaining_fields:
    combined_info = combined_info + '\n' + label + df_part[column].fillna('None')
new_df['Combined_Info'] = combined_info

# Write the single-column result (PMID index included)
new_df.to_csv('combined_data.csv')

In [None]:
# Splitting the large CSV file into smaller chunks
import pandas as pd

def split_csv(input_csv, output_prefix, chunk_size):
    """Split a CSV file into consecutive files of at most ``chunk_size`` rows.

    Output files are named ``{output_prefix}_{k}.csv`` with ``k`` starting
    at 1. Each file is written without the index column, and one progress
    line is printed per file.

    Args:
        input_csv: Path of the CSV file to split.
        output_prefix: Prefix (may include a directory) of the output files.
        chunk_size: Maximum number of data rows per output file.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Read the large CSV file into a pandas DataFrame
    df = pd.read_csv(input_csv)

    # Ceiling division: an exact multiple of chunk_size must not produce a
    # trailing header-only file. An empty input still yields one file so the
    # header is preserved.
    num_chunks = max(1, -(-len(df) // chunk_size))

    # Slice and save each chunk as a separate CSV file
    for i in range(num_chunks):
        chunk = df.iloc[i * chunk_size:(i + 1) * chunk_size]
        output_csv = f"{output_prefix}_{i + 1}.csv"
        chunk.to_csv(output_csv, index=False)
        print(f"Chunk {i + 1} saved to {output_csv}")

# Example invocation: adjust the three settings below to your own data
chunk_size = 1000              # Number of rows written to each output file
output_prefix = 'sub_data'     # Output files are named <prefix>_<n>.csv
input_csv_path = 'data_1.csv'  # Replace with the path to your large CSV file

split_csv(
    input_csv=input_csv_path,
    output_prefix=output_prefix,
    chunk_size=chunk_size,
)

In [None]:
# Create new CSV file with the filtered 'CD' column that contains useful information
# and is the main dataset to be used for the project
import pandas as pd

# Load only the columns used for the 'CD' text, indexed by PMID
wanted_columns = ['PMID', 'TI', 'AB', 'FAU', 'DP', 'OT', 'JT', 'MH']
df_part = pd.read_csv('articles.csv', index_col='PMID', usecols=wanted_columns)

# The output frame shares the PMID index of the source frame
new_df = pd.DataFrame(index=df_part.index)

# PMIDs as strings, reused for the text and for the source URL
pmid_text = df_part.index.astype(str)

# (label, column, terminator) triples in output order. Missing values are
# rendered as the literal string 'None'. Note that the authors entry ends
# with a comma before its newline, unlike the other entries.
field_layout = [
    ('Abstract: ', 'AB', '\n'),
    ('Title: ', 'TI', '\n'),
    ('Authors: ', 'FAU', ',\n'),
    ('Data of Publication: ', 'DP', '\n'),
    ('Terms or keywords associated with the article: ', 'OT', '\n'),
    ('Journal Title: ', 'JT', '\n'),
    ('Medical subject headings: ', 'MH', '\n'),
]

# Build the text left to right, starting with the PMID line
cd_text = 'PMID: ' + pmid_text + '\n'
for label, column, terminator in field_layout:
    cd_text = cd_text + label + df_part[column].fillna('None') + terminator
new_df['CD'] = cd_text

# Link back to the PubMed page of each article
new_df['source'] = 'https://pubmed.ncbi.nlm.nih.gov/' + pmid_text


import pandas as pd

# # Read your DataFrame from a CSV file
# df = pd.read_csv('your_dataframe.csv')

# Function to drop placeholder ('None') lines from a text and flatten the rest
def filter_lines(text):
    """Remove lines whose content ends with 'None' and join the others.

    Each line is stripped of surrounding whitespace and of any trailing
    comma before it is examined, so a placeholder written as ``'None,'`` is
    dropped too and kept lines do not yield a doubled comma once joined.
    Empty lines (for example the one produced by a trailing newline) are
    skipped so the result never ends with a dangling separator.

    Args:
        text: Newline-separated text, one labelled field per line.

    Returns:
        The remaining lines joined with ``', '``; an empty string when
        nothing is left.
    """
    kept_lines = []
    for line in text.split('\n'):
        cleaned = line.strip().rstrip(',').rstrip()
        if not cleaned or cleaned.endswith('None'):
            continue
        kept_lines.append(cleaned)
    return ', '.join(kept_lines)

# Run filter_lines over every entry of the 'CD' column
cleaned_cd = new_df['CD'].map(filter_lines)
new_df['CD'] = cleaned_cd

# Write the result to disk (PMID index included)
new_df.to_csv('additional_data.csv')
# Preview the first rows with the filtered 'CD' column
new_df.head()
