In [27]:
import os
import pandas as pd
from collections import Counter

def count_cell_types_in_folder(folder_path, output_file):
    cell_type_counter = Counter()
    
    # Iterate through all files in the specified folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith("_cell_type.csv"):
            file_path = os.path.join(folder_path, file_name)
            
            # Read the CSV file
            df = pd.read_csv(file_path)
            
            # Process the 'Cell Types' column if it exists
            if 'Cell Types' in df.columns:
                for cell_types in df['Cell Types'].dropna():
                    for cell_type in map(str.strip, cell_types.split(';')):
                        cell_type_counter[cell_type] += 1
    
    # Convert to DataFrame and sort by count
    sorted_counts = sorted(cell_type_counter.items(), key=lambda x: x[1], reverse=True)
    result_df = pd.DataFrame(sorted_counts, columns=['Cell Type', 'Count'])
    
    # Export the results to a CSV file
    result_df.to_csv(output_file, index=False)
    print(f"Results exported to {output_file}")

if __name__ == "__main__":
    target_folder = "count_result"
    output_csv = "cell_type_counts.csv"
    count_cell_types_in_folder(target_folder, output_csv)


Results exported to cell_type_counts.csv


In [38]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering

# Load dataset
unique_cell_df = pd.read_csv('cell_type_counts.csv')

# Preprocess: lowercase, remove unnecessary characters
def clean_cell_type(cell_type):
    cell_type = cell_type.lower().strip()
    cell_type = re.sub(r'[_\-+,/]', ' ', cell_type)  # Replace _ - , / + with space
    cell_type = re.sub(r'\s+', ' ', cell_type)  # Remove extra spaces
    cell_type = re.sub(r'(\bcd\d+)\+', r'\1', cell_type)  # Remove '+' from CD markers
    return cell_type

unique_cell_df['cell_type_clean'] = unique_cell_df['Cell Type'].apply(clean_cell_type)

# Define high-priority clusters (explicit mapping)
priority_clusters = {
    # B cells
    r'\bB cell\b': 0, r'\bB\b': 0, r'^B-': 0, r'^B_': 0,
    r'\bCD19\b': 0, r'\bCD20\b': 0,  # B cell markers

    # T cells
    r'\bT cell\b': 1, r'\bT\b': 1, r'^T-': 1, r'^T_': 1,
    r'\bCD3\b': 1, r'\bCD4\b': 1, r'\bCD8\b': 1,  # T cell markers

    # Monocytes & Macrophages
    r'\bmonocyte\b': 2, r'\bmacrophage\b': 2, r'\bmoDC\b': 2,

    # NK cells
    r'\bNK cell\b': 3, r'\bnatural killer\b': 3,

    # Dendritic cells
    r'\bdendritic cell\b': 4, r'\bDC\b': 4, r'plasmacytoid DC': 4,

    # Stem cells
    r'\bstem cell\b': 5, r'\bHSC\b': 5, r'hematopoietic': 5,

    # Granulocytes
    r'\bgranulocyte\b': 6, r'\bneutrophil\b': 6, r'eosinophil': 6, r'basophil': 6,
}

# Function to assign priority cluster based on predefined rules
def assign_priority_cluster(cell_type):
    for pattern, cluster_id in priority_clusters.items():
        if re.search(pattern, cell_type, re.IGNORECASE):
            return cluster_id
    return -1  # Default value for non-priority cells

# Assign clusters based on priority
unique_cell_df['priority_cluster'] = unique_cell_df['cell_type_clean'].apply(assign_priority_cluster)

# Separate priority and non-priority cells
priority_df = unique_cell_df[unique_cell_df['priority_cluster'] != -1]
non_priority_df = unique_cell_df[unique_cell_df['priority_cluster'] == -1]

# Convert non-priority cell types to TF-IDF vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(non_priority_df['cell_type_clean'])

# Improved clustering with Ward’s method
clustering_model = AgglomerativeClustering(
    n_clusters=None,  # Let distance_threshold determine clusters
    linkage='ward',  # Better for hierarchical clustering
    distance_threshold=2.0  # Adjust to control number of clusters
)

labels = clustering_model.fit_predict(tfidf_matrix.toarray())

# Assign clusters to non-priority cells
non_priority_df['cluster'] = labels + max(priority_clusters.values()) + 1  # Offset to avoid conflicts

# Merge priority and non-priority clusters
final_df = pd.concat([
    priority_df[['Cell Type', 'priority_cluster']].rename(columns={'priority_cluster': 'cluster'}),
    non_priority_df[['Cell Type', 'cluster']]
])

# Group cell types by clusters
grouped_clusters = final_df.groupby('cluster')['Cell Type'].apply(list).reset_index()

# Save the grouped clusters to a CSV file
grouped_clusters.to_csv('grouped_clusters.csv', index=False)

# Print the number of clusters
print(f"Total clusters: {len(grouped_clusters)}")


Total clusters: 81


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_priority_df['cluster'] = labels + max(priority_clusters.values()) + 1  # Offset to avoid conflicts
