In this file, I want to bring all my data processing together that has to happen on this code 
This includes:
    		○ Reducing columns to only what is necessary (ie take out all membrane, cytoplasm stuff, and intensity stuff) 
		○ Take out all glass/stroma (if applicable) 
		○ Scale all the x and y values to convert from pixels --> microns 
		○ Specifically divide into IFNy+ and IFNy-, not just regular and IFNy+ for your T cells and NK cells 
		○ Do the "cell typing" and make sure that that is all good 
		○ Take out image names, patient ID's, stage of tumor, stuff specific to each study that can be found in that file name and other stuff using regex strings and stuff like that 
Down the line regroup by tiny region areas and do parallel processing of stuff 

In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
# to update
# Specify the file location
location = #where file will come from 

# List of columns to be used from the CSV
reduced_titles = ['Image Location', 'XMin', 'XMax', 'YMin', 'YMax', 'MHCI (Opal 650) Positive', 'CD56 (Opal 570) Positive',
                 'CD8 (Opal 540) Positive', 'CD3 (Opal 520) Positive', 'CD56+CD3+', 'CD56+CD3-', 'CD8+IFNy+', 'CD3+IFNy+',
                 'CD56+IFNy+', 'CD8-CD3+', 'CD8+CD3+', 'CD8+CD3-', 'panCK+MHCI+', 'panCK+MHCI-', 'Cell Area (µm²)',
                 'IFNy (Opal 620) Positive', 'Classifier Label', 'PanCyto (Opal 690) Positive']

# Read the original data from the CSV
original_data = pd.read_csv(location, usecols=reduced_titles)

# Renaming columns
original_data = original_data.rename(columns={'Classifier Label': 'ClassifierLabel', 'Image Location': 'ImageLocation',
                                              "Cell Area (µm²)": 'CellArea'})

# Remove rows with 'glass' in ClassifierLabel
original_data = original_data[original_data.ClassifierLabel != 'glass']

# Extract 'PatientID' from 'ImageLocation' using regex
original_data["PatientID"] = original_data["ImageLocation"].str.extract(r'([^MAD]*[0-9][0-9]-[0-9][0-9][0-9])')

# Scale coordinates from pixels to microns
scale_factor = 0.49456 # this was a direct result of scaling done in Vectra Analysis 
original_data["XMin"] = original_data["XMin"] * scale_factor
original_data["XMax"] = original_data["XMax"] * scale_factor
original_data["YMin"] = original_data["YMin"] * scale_factor
original_data["YMax"] = original_data["YMax"] * scale_factor
original_data['XCenter'] = (original_data["XMin"] + original_data["XMax"]) / 2
original_data['YCenter'] = (original_data["YMin"] + original_data["YMax"]) / 2

# Define Cell Types of Interest by Multi Marker Assessment
original_data['CD56+CD3-'] = original_data['CD56'] - original_data['CD3']
original_data['CD56+CD3-'] = original_data['CD56+CD3-'].replace(-1, 0)
original_data['PanCyto+MHCI+'] = original_data['PanCyto'] / 2 + original_data['MHCI'] / 2
original_data['PanCyto+MHCI+'] = original_data['PanCyto+MHCI+'].replace(0.5, 0)
original_data['PanCyto+MHCI-'] = original_data['PanCyto'] - original_data['MHCI']
original_data['PanCyto+MHCI-'] = original_data['PanCyto+MHCI-'].replace(-1, 0)

# Create derived columns for specific cell types
cell_types = ['CD8-CD3+', 'CD8+CD3+', 'CD56+CD3-']
for cell_type in cell_types:
    activated_label = cell_type + 'IFNy+'
    inactivated_label = cell_type + 'IFNy-'
    original_data[activated_label] = original_data[cell_type] / 2 + original_data['IFNy'] / 2
    original_data[activated_label] = original_data[activated_label].replace(0.5, 0)
    original_data[inactivated_label] = original_data[cell_type] - original_data['IFNy']
    original_data[inactivated_label] = original_data[inactivated_label].replace(-1, 0)

# Handling missing data
cell_name_list = ['PanCyto+MHCI-', 'PanCyto+MHCI+', 'CD8-CD3+IFNy+', 'CD8-CD3+IFNy-', 'CD8+CD3+IFNy+', 'CD8+CD3+IFNy-',
                  'CD56+CD3-IFNy+', 'CD56+CD3-IFNy-', 'CD8', 'CD3']
for cell_name in cell_name_list:
    original_data.dropna(subset=[cell_name], inplace=True)

# Remove rows with specific 'PatientID' values
excluded_patient_ids = ["15-671", "17-234", "17-328", "17-408", "17-473", "17-504"] # these patient ID's were found in other investigation to have very low values  
original_data = original_data[~original_data['PatientID'].isin(excluded_patient_ids)]

# Select a subset of columns
subset_columns = ['ImageLocation', 'PatientID', 'XCenter', 'YCenter', 'ClassifierLabel', 'CellArea',
                  'CD8-CD3+IFNy+', 'CD8-CD3+IFNy-', 'CD8+CD3+IFNy+', 'CD8+CD3+IFNy-', 'CD56+CD3-IFNy+',
                  'CD56+CD3-IFNy-', 'PanCyto+MHCI+', 'PanCyto+MHCI-']
original_data = original_data.drop_duplicates(subset=subset_columns)

# Create a new column 'CellType' and group data
reduced_data = original_data
reduced_data["CellType"] = 0

def group_assignment_function(row):
    cell_types = ['PanCyto+MHCI-', 'PanCyto+MHCI+', 'CD8-CD3+IFNy+', 'CD8-CD3+IFNy-', 'CD8+CD3+IFNy+', 'CD8+CD3+IFNy-',
                  'CD56+CD3-IFNy+', 'CD56+CD3-IFNy-']
    for cell_type in cell_types:
        if row[cell_type] == 1:
            row.CellType = cell_type
    return row

grouped_df = reduced_data.apply(lambda x: group_assignment_function(x), axis=1)
grouped_df = grouped_df[grouped_df["CellType"] != 0]

# Export the grouped data to a CSV file
with open(# where you will save file to #, 'a') as f:
    grouped_df.to_csv(f)

    

SyntaxError: invalid syntax (3960156819.py, line 7)

In [None]:
# Read the edited data from the CSV
edited_data = pd.read_csv(# where you will save the file to; just checking to make sure everything is there; keep this in a separate kernel

print(len(edited_data))