### Load needed libraries

In [None]:
import os
import numpy as np
import anndata
import re
import pandas as pd
import seaborn as sns
import scanpy as sc
import copy
import glob
import pyreadstat
import matplotlib.pyplot as plt
from datetime import datetime
from igraph import *
import warnings

# Directory the session was started from; the input/ and processed/ paths
# used further down are built relative to it.
pwd = os.getcwd()

# Set scanpy's n_jobs setting to 32.
sc.settings.n_jobs = 32

# Silence every warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

In [None]:
# Literal substring substitutions applied, in insertion order, to each
# lower-cased column header. Order matters: the long phrases (which contain
# lower-case marker names such as "iba1") must be rewritten before the final
# entries re-case those marker names.
_COLUMN_REPLACEMENTS = {
    "(μm²)": "",
    "(μm)": "",
    "per μm²": "per area",
    "pg/ug": "",
    "tissue area": "area",
    "tissue\n": "area",
    "avg": "average",
    "  ": " ",
    "total of": "total",
    "positive bearing": "positive",
    "positive inclusion bearing": "positive",
    "total hematoxylin nuclei": "number of Hematoxylin positive nuclei",
    "hematoxylin stained": "Hematoxylin positive",
    "Hematoxylin positive average": "average Hematoxylin positive",
    "hematoxylin average nucleus roundness": "average hematoxylin positive nucleus roundness",
    "number 6e10 positive objects": "number of 6e10 positive objects",
    "average 6e10 positive objects area": "average 6e10 positive object area",
    "average 6e10 positive objects median diameter": "average 6e10 positive object median diameter",
    "iba1 positive and 6e10 positive": "Iba1 and 6e10 positive",
    "objects colocalized": "co-localized objects",
    "total iba1 positive cells": "number of Iba1 positive cells",
    "total activated iba1 positive cells": "number of activated Iba1 positive cells",
    "total inactivated iba1 positive cells": "number of inactivated Iba1 positive cells",
    "average gfap positive length": "average GFAP positive branch length",
    "at8": "AT8",
    "ptdp43": "pTDP43",
    "a-syn": "aSyn",
    "iba1": "Iba1",
    "gfap": "GFAP",
    "neun": "NeuN",
    "ttau": "tTau",
    "ptau": "pTau",
}

# Regular-expression substitutions applied after the literal ones: a trailing
# " tissue" becomes " area", and a single trailing space is removed.
_COLUMN_REGEXES = {
    "^(.*) tissue$": "\\1 area",
    "^(.*) $": "\\1",
}

# Prefix given to each file's "area analyzed" column so that the per-stain
# areas do not collide when the tables are merged.
_STAIN_NAMES = {
    "AT-MTG-Summary Data-01-31-22.csv": "AT8 and pTDP43",
    "Hematoxylin-MTG-Summary data-01-29-22.csv": "Hematoxylin",
    "a-Syn-MTG-Summary data-01-29-22.csv": "aSyn",
    "I6-MTG-Summary data-08-10-22.csv": "Iba1 and 6e10",
    "NeuN-MTG-Summary data-01-26-22.csv": "NeuN",
    "GFAP-MTG-Summary Data-01-29-22.csv": "GFAP",
}


def _normalize_column_name(name):
    """Return the harmonized spelling of one raw CSV column header."""
    fixed = name.lower()
    for old, new in _COLUMN_REPLACEMENTS.items():
        fixed = fixed.replace(old, new)
    for pattern, replacement in _COLUMN_REGEXES.items():
        fixed = re.sub(pattern, replacement, fixed)
    return fixed


def _append_grey_matter_rows(table):
    """Append one pooled "Grey matter" row per case to a per-region table.

    Columns are pooled across the rows of each case number as follows:

    * "number of ..." (not ending in "per area"), "total ..." and
      "area analyzed" columns are summed;
    * "percent ...", "average ..." and "number of ... per area" columns are
      averaged, weighted by "area analyzed".

    Any other column is left empty (NaN) in the appended rows.
    """
    case_ids = table["case number"]
    pooled = {}
    for column in table.columns:
        is_count = column.startswith("number of")
        is_density = column.endswith("per area")
        if (
            (is_count and not is_density)
            or column.startswith("total")
            or column == "area analyzed"
        ):
            pooled[column] = table[column].groupby(case_ids).sum()
        elif column.startswith(("percent", "average")) or (is_count and is_density):
            area = table["area analyzed"]
            weighted_total = (table[column] * area).groupby(case_ids).sum()
            # NOTE(review): rows whose measurement is missing still add their
            # area to the denominator (same as the previous implementation);
            # confirm that this is the intended weighting.
            pooled[column] = weighted_total / area.groupby(case_ids).sum()

    if not pooled:
        # Nothing to aggregate; return the table untouched instead of failing.
        return table

    grey_matter = pd.DataFrame(pooled).rename_axis("case number").reset_index()
    grey_matter["analysis region"] = "Grey matter"
    return pd.concat([table, grey_matter], axis=0)


def clean_quant_neuropath(region, pivot=True, base_dir=None):
    """Harmonize and merge the raw quantitative neuropathology CSVs of a region.

    Every ``*.csv`` file in ``<base_dir>/input/quant_neuropath/<region>/raw`` is
    read, its column headers are normalized, "number of ..." counts are also
    expressed per analyzed area, and (when the file has an "analysis region"
    column) a pooled "Grey matter" row is added for every case number. The
    per-file tables are then left-merged on their index and written to
    ``<base_dir>/input/quant_neuropath/<region>/processed``. A two-column table
    of original and normalized column names is written to
    ``original_to_new.csv`` in the current working directory.

    Args:
        region: Name of the sub-directory of ``input/quant_neuropath`` to
            process (for example ``"MTG"``).
        pivot: If true, write one row per case number with one column per
            measurement/analysis-region pair (named
            ``"<measurement>_<region>"``) to
            ``all_quant_neuropath_by_donor_pivoted.<date>.csv``. Otherwise
            write one row per (case number, analysis region) pair to
            ``all_quant_neuropath_by_donor.<date>.csv``.
        base_dir: Directory containing the ``input`` folder. Defaults to the
            module-level ``pwd``.

    Raises:
        FileNotFoundError: If the raw directory contains no CSV file.
        KeyError: If a file with an "area analyzed" column has no entry in
            ``_STAIN_NAMES``.
    """
    root = pwd if base_dir is None else base_dir
    region_dir = os.path.join(root, "input", "quant_neuropath", region)
    raw_pattern = os.path.join(region_dir, "raw", "*.csv")

    # glob returns files in a filesystem-dependent order, and the left merge
    # below lets the FIRST table decide which rows survive. Sorting makes the
    # output reproducible from one machine to the next.
    quant_csvs = sorted(glob.glob(raw_pattern))
    if not quant_csvs:
        raise FileNotFoundError("No CSV files found matching " + raw_pattern)

    original_names = []
    updated_names = []
    final_table = None

    for csv_path in quant_csvs:
        filename = os.path.basename(csv_path)
        print("Processing " + filename)
        tmp = pd.read_csv(csv_path)
        tmp = tmp.drop(columns=["Image Tag"], errors="ignore")

        original_names.extend(tmp.columns.to_list())
        tmp.columns = [_normalize_column_name(name) for name in tmp.columns]
        updated_names.extend(tmp.columns.to_list())

        # Express every raw count as a density as well.
        count_columns = [
            name
            for name in tmp.columns
            if name.startswith("number of") and not name.endswith("per area")
        ]
        for name in count_columns:
            tmp[name + " per area"] = tmp[name] / tmp["area analyzed"]

        if "analysis region" in tmp.columns:
            # Select the COLUMN (the previous `.loc[: "analysis region"]` was a
            # row slice, so the rename never reached the column).
            tmp["analysis region"] = tmp["analysis region"].replace("Layer I", "Layer1")
            tmp = _append_grey_matter_rows(tmp)
        else:
            tmp["analysis region"] = "Grey matter"

        if "area analyzed" in tmp.columns:
            if filename not in _STAIN_NAMES:
                raise KeyError(
                    "No stain name registered for " + repr(filename)
                    + "; add it to _STAIN_NAMES"
                )
            # Move the column to the end under a stain-specific name.
            tmp[_STAIN_NAMES[filename] + " area analyzed"] = tmp.pop("area analyzed")

        if pivot:
            tmp = tmp.pivot(index="case number", columns=["analysis region"])
            tmp.columns = ["_".join(levels) for levels in tmp.columns.values]
        else:
            tmp = tmp.set_index(["case number", "analysis region"])
        tmp = tmp.sort_index()

        if final_table is None:
            final_table = tmp.copy()
        else:
            final_table = final_table.merge(
                tmp, left_index=True, right_index=True, how="left"
            )

    final_table = final_table.reset_index()

    processed_dir = os.path.join(region_dir, "processed")
    os.makedirs(processed_dir, exist_ok=True)
    suffix = "_pivoted" if pivot else ""
    savefile = os.path.join(
        processed_dir,
        "all_quant_neuropath_by_donor" + suffix + "." + str(datetime.now().date()) + ".csv",
    )

    pd.DataFrame([original_names, updated_names]).T.to_csv("original_to_new.csv")
    final_table.to_csv(savefile)
    

In [None]:
# Long format for MTG: one row per (case number, analysis region) pair.
clean_quant_neuropath(region="MTG", pivot=False)

In [None]:
# Wide format for MTG: one row per case number, with the analysis regions
# spread across columns.
clean_quant_neuropath(region="MTG", pivot=True)