## Assignment 7: XML Parsing with PySpark and key information extraction from research articles.
In the ever-expanding realm of bioinformatics and biomedical research, extracting information from vast repositories of scientific literature is a crucial task. Assignment 7 is an exercise in data science and natural language processing: the objective is to use PySpark to parse PubMed XML files and extract key information from research articles.

### Introduction
Scientific literature, especially in the domain of molecular biology and biochemistry, is a goldmine of knowledge. PubMed, as one of the largest repositories of biomedical literature, offers a treasure trove of research articles. However, making sense of this wealth of information can be daunting. This assignment addresses this challenge by developing a script capable of processing PubMed XML files and organizing the data into a PySpark dataframe.

The key information that will be extracted includes:

- PubMed ID
- First Author
- Last Author
- Year published
- Title
- Journal Title
- Length of Abstract (if Abstract text is present).
- A column of references in a list variable, if references are present for the article.

Furthermore, this assignment also involves the creation of a second dataframe to answer specific questions such as:

- Number of articles per First Author
- Number of articles per Year
- Minimum, maximum, and average length of an abstract
- Average Number of articles per Journal Title per Year

### Deliverables
To successfully complete this assignment, this script should be able to take one or more XML files as input and perform the following tasks:

- Parse PubMed XML files into a PySpark dataframe.
- Extract and organize the specified information from the articles.
- Create a secondary dataframe to answer the provided questions.


### Run code
To execute the script, navigate to your terminal and use the following command:

```
python3 Assignment7.py
```

### Output
Upon running the code, the script writes its results as CSV into an "output" folder, which is created for you. The parsed data from the PubMed XML files is written to `output/parsed_data.csv/`, where the individual part files are merged into a single `combined_data.csv`. The answers to the questions posed in this assignment are written to `output/author_counts.csv`, `output/year_counts.csv`, `output/abstract_lengths.csv` and `output/journal_year_counts.csv`.


In [22]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

def create_spark_session(app_name):
    """Return a SparkSession whose application name is *app_name*.

    The session is obtained through ``getOrCreate()`` on the session builder.
    """
    builder = SparkSession.builder
    named_builder = builder.appName(app_name)
    return named_builder.getOrCreate()

def _node_text(node):
    """Return the stripped text of *node* including nested markup, or None."""
    if node is None:
        return None
    # itertext() also collects text inside inline children such as <i> or
    # <sub>, which a plain ``.text`` would cut off at the first child tag.
    return "".join(node.itertext()).strip()


def _author_name(author):
    """Return an author's LastName, falling back to CollectiveName, or None."""
    if author is None:
        return None
    for tag in ("LastName", "CollectiveName"):
        name = _node_text(author.find(tag))
        if name:
            return name
    return None


def _extract_article_row(article):
    """Build one row tuple (in schema column order) from a PubmedArticle element."""
    pubmed_id = _node_text(article.find(".//PMID"))

    # Only the first AuthorList is considered; missing lists yield None names.
    author_list = article.find(".//AuthorList")
    authors = author_list.findall("Author") if author_list is not None else []
    first_author = _author_name(authors[0]) if authors else None
    last_author = _author_name(authors[-1]) if authors else None

    pub_year = _node_text(article.find(".//PubDate/Year")) or "Unknown"
    title = _node_text(article.find(".//ArticleTitle"))
    journal_title = _node_text(article.find(".//Journal/Title"))

    # An abstract may be split over several AbstractText elements; join them
    # all. A missing abstract stays the empty string.
    abstract_text = " ".join(
        _node_text(section)
        for section in article.findall(".//Abstract/AbstractText")
    )

    reference_ids = [
        ref.text.strip()
        for ref in article.findall(
            ".//PubmedData/ReferenceList/Reference/ArticleIdList/ArticleId[@IdType='pubmed']"
        )
        if ref.text
    ]
    # The References column is a string (CSV output cannot hold arrays), so
    # the list is serialised explicitly instead of relying on implicit
    # conversion of a Python list.
    references = "[" + ", ".join(reference_ids) + "]"

    return (pubmed_id, first_author, last_author, pub_year, title,
            journal_title, abstract_text, references)


def create_articles_dataframe(spark, input_directory, file_limit=None):
    """Parse the PubMed XML files in a directory into a Spark DataFrame.

    Args:
        spark: SparkSession used to create the DataFrame.
        input_directory: Directory whose ``*.xml`` files are parsed
            (non-recursive). Files are visited in sorted name order so that a
            limited run is reproducible.
        file_limit: Maximum number of rows to collect, or None for no limit.
            NOTE: despite its name, the limit is counted per ``PubmedArticle``
            element, not per XML file.

    Returns:
        A DataFrame with one row per article and the string columns PubMedID,
        FirstAuthor, LastAuthor, Year, Title, JournalTitle, Abstract and
        References. Missing elements give None, except Year ("Unknown"),
        Abstract ("") and References ("[]").

    Raises:
        xml.etree.ElementTree.ParseError: If an XML file is malformed.
    """
    schema = StructType([
        StructField("PubMedID", StringType(), nullable=True),
        StructField("FirstAuthor", StringType(), nullable=True),
        StructField("LastAuthor", StringType(), nullable=True),
        StructField("Year", StringType(), nullable=True),
        StructField("Title", StringType(), nullable=True),
        StructField("JournalTitle", StringType(), nullable=True),
        StructField("Abstract", StringType(), nullable=True),
        StructField("References", StringType(), nullable=True)
    ])

    # Collect plain tuples and build the DataFrame once at the end; a union
    # per article makes the Spark plan grow with every row.
    rows = []
    limit_reached = False

    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith(".xml"):
            continue

        root = ET.parse(os.path.join(input_directory, filename)).getroot()

        for article in root.findall(".//PubmedArticle"):
            rows.append(_extract_article_row(article))
            if file_limit is not None and len(rows) >= file_limit:
                limit_reached = True
                break

        if limit_reached:
            break

    return spark.createDataFrame(rows, schema=schema)

def save_dataframe_as_csv(dataframe, output_path):
    """Write *dataframe* as CSV to *output_path*.

    A header row is included and the writer is run in "overwrite" mode.
    """
    write_options = {"mode": "overwrite", "header": True}
    dataframe.write.csv(output_path, **write_options)

def create_and_save_analysis_dataframes(articles_df):
    """Compute summary statistics over the articles and save each one as CSV.

    Args:
        articles_df: DataFrame with at least the columns FirstAuthor, Year,
            JournalTitle and Abstract.

    Writes four outputs under ``output/``:
        * ``author_counts.csv``: articles per FirstAuthor.
        * ``year_counts.csv``: articles per Year.
        * ``abstract_lengths.csv``: minimum, maximum and average abstract
          length in characters, over articles with a non-empty abstract.
        * ``journal_year_counts.csv``: articles per JournalTitle and Year.
    """
    # Local import: the module-level import only provides ``col`` and ``avg``.
    from pyspark.sql import functions as F

    # DataFrame.alias() names the DataFrame, not its columns, so the count
    # column is renamed explicitly to get the intended header in the CSV.
    author_counts = (
        articles_df.groupBy("FirstAuthor").count()
        .withColumnRenamed("count", "ArticleCountPerAuthor")
    )
    year_counts = (
        articles_df.groupBy("Year").count()
        .withColumnRenamed("count", "ArticleCountPerYear")
    )

    # Measure the text length; casting the abstract text itself to an integer
    # yields null for ordinary prose and therefore a null average.
    abstract_length = F.length(F.col("Abstract"))
    abstract_lengths = (
        articles_df
        # Skip missing/empty abstracts so they do not count as length 0.
        .where(abstract_length > 0)
        .agg(
            F.min(abstract_length).alias("MinAbstractLength"),
            F.max(abstract_length).alias("MaxAbstractLength"),
            F.avg(abstract_length).alias("AvgAbstractLength"),
        )
    )

    journal_year_counts = (
        articles_df.groupBy("JournalTitle", "Year").count()
        .withColumnRenamed("count", "ArticleCountPerJournalTitlePerYear")
    )

    # Save the analysis DataFrames to CSV
    save_dataframe_as_csv(author_counts, "output/author_counts.csv")
    save_dataframe_as_csv(year_counts, "output/year_counts.csv")
    save_dataframe_as_csv(abstract_lengths, "output/abstract_lengths.csv")
    save_dataframe_as_csv(journal_year_counts, "output/journal_year_counts.csv")
    
    
def combine_csv_files(input_folder, output_file):
    """Concatenate every ``*.csv`` file in *input_folder* into *output_file*.

    Args:
        input_folder: Directory containing the CSV files to merge
            (non-recursive). Files are read in sorted name order.
        output_file: Path of the combined CSV to write. If it lies inside
            *input_folder*, an existing copy is not treated as an input, so
            re-running does not duplicate rows.

    Returns:
        None. Prints a message and writes nothing when there is no CSV input.
    """
    output_abspath = os.path.abspath(output_file)
    csv_paths = [
        os.path.join(input_folder, name)
        for name in sorted(os.listdir(input_folder))
        if name.endswith(".csv")
    ]
    csv_paths = [
        path for path in csv_paths if os.path.abspath(path) != output_abspath
    ]

    if not csv_paths:
        print("No CSV files found in the input folder.")
        return

    frames = []
    for csv_path in csv_paths:
        try:
            frames.append(pd.read_csv(csv_path))
        except pd.errors.EmptyDataError:
            # A zero-byte file has neither header nor rows to contribute.
            continue

    if not frames:
        print("No data found in the CSV files of the input folder.")
        return

    # Concatenate once; columns are aligned by name across files.
    combined_df = pd.concat(frames, ignore_index=True)

    # Save the combined DataFrame to the output CSV file
    combined_df.to_csv(output_file, index=False)
    print(f"Combined data saved to {output_file}")
    
def delete_files_except_combined_csv(folder_path, combined_csv_filename):
    """Remove every regular file in *folder_path* except *combined_csv_filename*.

    Sub-directories are left untouched. This is best-effort: any exception is
    reported with a printed message instead of being raised.
    """
    try:
        for entry in os.listdir(folder_path):
            # Keep the combined file itself.
            if entry == combined_csv_filename:
                continue
            candidate = os.path.join(folder_path, entry)
            # Only regular files are removed.
            if not os.path.isfile(candidate):
                continue
            os.remove(candidate)
        print(f"Deleted all files except {combined_csv_filename}")
    except Exception as exc:
        print(f"An error occurred while deleting files: {str(exc)}")


def main(input_directory, file_limit=None):
    """Run the full pipeline: parse, save, analyse and tidy the output.

    Args:
        input_directory: Directory containing the PubMed XML files.
        file_limit: Limit forwarded to ``create_articles_dataframe``;
            None means no limit.

    The parsed articles are written to ``output/parsed_data.csv``, which is
    then handled as a folder: its CSV files are merged into
    ``combined_data.csv`` and every other file in it is deleted.
    """
    # Initialize a Spark session
    spark = create_spark_session("PubMedParser")

    # try/finally guarantees the session is stopped even when a step fails.
    try:
        # Create articles DataFrame
        articles_df = create_articles_dataframe(spark, input_directory, file_limit)

        # Save the parsed articles as CSV
        save_dataframe_as_csv(articles_df, "output/parsed_data.csv")

        # Create and save analysis DataFrames
        create_and_save_analysis_dataframes(articles_df)

        # Merge the CSV files in the output folder into one file
        input_folder = "output/parsed_data.csv"
        output_file = "output/parsed_data.csv/combined_data.csv"
        combine_csv_files(input_folder, output_file)

        # Drop everything in that folder except the merged file
        combined_csv_filename = "combined_data.csv"
        delete_files_except_combined_csv(input_folder, combined_csv_filename)
    finally:
        # Stop the Spark session
        spark.stop()
    

if __name__ == "__main__":
    # Guarded so that importing this module does not start the pipeline;
    # running it as a script or in a notebook cell behaves as before.

    # Directory containing the PubMed XML files to parse.
    input_dir = "/data/datasets/NCBI/PubMed/"
    # Limit passed to main() as file_limit; set to None to parse everything.
    file_limit = 5

    main(input_dir, file_limit)

                                                                                

Combined data saved to output/parsed_data.csv/combined_data.csv
Deleted all files except combined_data.csv
