# Feature Extractor

#### Note: Use the picasso kernel only

What is this used for? 
- This code extracts the different features from the datasets and stores them as a pandas DataFrame.
- This dataframe can then be used for next steps of data analysis. 

Workflow
- Define source folder location and files.
- Load the sector information file and create a table whose number of rows equals the number of sectors.
- Load each dataset.
    - Extract the count of entries for each unique sector ID.
    - Attach this information in a new column. 
- Save the table in a file to be read by another script. 


In [None]:
#  Import Dependencies.

import os.path as _ospath
import os as _os
import numpy as _np
import pandas as _pd
import yaml as _yaml
from picasso import io as _io
from tqdm import tqdm

In [None]:
# Define the folder location and the file extension inside the folder.

cell_folder = ''  # Folder name for specific cell.
folder = _ospath.join(cell_folder, 'Sectored')
file_extn = '.hdf5'
file_names = [entry for entry in _os.listdir(folder) if entry.endswith(file_extn)]
# !!! Use only Odd Numbers !!!
# Pixel window sizes from which the counts need to be extracted:
# 1 = 1px by 1px; 3 = 3px by 3px; 5 = 5px by 5px.
windows = [1]

In [None]:
# Read the yaml information to define the number of sectors.

file_extn_yaml = '.yaml'
file_names_yaml = [f for f in _os.listdir(folder) if f.endswith(file_extn_yaml)]

# Use the first YAML file whose name contains 'sector_info'.
file_yaml = next((name for name in file_names_yaml if 'sector_info' in name), None)
if file_yaml is None:
    # Fail here with a clear message instead of a NameError on `sector_info` further down.
    raise FileNotFoundError(
        f"No '{file_extn_yaml}' file with 'sector_info' in its name was found in {folder!r}."
    )

with open(_ospath.join(folder, file_yaml), 'r') as f_yaml:
    # NOTE(review): FullLoader can build more than plain scalars/lists/dicts.
    # If this file only holds plain data, yaml.safe_load would be the safer choice - confirm.
    sector_info = _yaml.load(f_yaml, Loader=_yaml.FullLoader)

number_of_sectors = sector_info['number_of_sectors']

In [None]:
# Generate the dataframe: no columns yet, rows labelled 1..number_of_sectors.

sector_ids = range(1, number_of_sectors + 1)
data = _pd.DataFrame(index=sector_ids)

In [None]:
# Functions used in the script.

def counter(locs):
    """Return a dict mapping each distinct ``locs['sector_id']`` value to its number of occurrences."""
    unique_ids, occurrences = _np.unique(locs['sector_id'], return_counts=True)
    return {sector: n_entries for sector, n_entries in zip(unique_ids, occurrences)}

def save_data_csv(data, window, folder):
    """Write ``data`` as CSV (index included) to ``data_<window>.csv`` inside ``folder``."""
    destination = _ospath.join(folder, f'data_{window!s}.csv')
    data.to_csv(destination, index=True)

def calculate_neighborhood_counts(locs, window, protein_name):
    """Count, for every occupied pixel, the entries inside a square window centred on it.

    Parameters
    ----------
    locs
        Object whose fields ``'x_pixel_pos'``, ``'y_pixel_pos'`` and ``'sector_id'``
        can be read as ``locs[name][:]``; presumably one entry per localization
        (confirm against the loader).
    window
        Edge length of the square window in pixels. Must be a positive odd
        number (1 = 1x1, 3 = 3x3, ...), so the window is symmetric around the pixel.
    protein_name
        Only used in the progress-bar description.

    Returns
    -------
    list of tuple
        One ``(sector_id, count)`` pair per distinct ``(x_pixel_pos, y_pixel_pos)``.
        ``sector_id`` is taken from the first entry of that pixel; ``count`` is the
        number of entries whose pixel position lies within the window (bounds inclusive).

    Raises
    ------
    ValueError
        If ``window`` is not a positive odd number. Previously such values were
        silently treated as the next smaller odd size.
    """
    if window < 1 or window % 2 != 1:
        raise ValueError(f'window must be a positive odd number, got {window!r}.')

    frame = _pd.DataFrame({
        'x_pixel_pos': locs['x_pixel_pos'][:],
        'y_pixel_pos': locs['y_pixel_pos'][:],
        'sector_id': locs['sector_id'][:],
    })
    half_width = int(window // 2)  # Pixels to include on each side of the centre pixel.

    # Column lookups do not change between iterations, so do them once.
    x_pos = frame['x_pixel_pos']
    y_pos = frame['y_pixel_pos']

    counts = []
    # Iterate over each unique pixel position.
    pixel_groups = frame.groupby(['x_pixel_pos', 'y_pixel_pos'])
    for (x, y), group in tqdm(pixel_groups, desc=f'Calculating neighborhood counts for {protein_name} with {window} x {window} pixel window.'):
        # Mark the entries inside the window x window neighborhood of (x, y).
        in_window = (x_pos.between(x - half_width, x + half_width) &
                     y_pos.between(y - half_width, y + half_width))
        sector = group['sector_id'].iloc[0]  # Sector of the centre pixel.
        # Summing the mask avoids building a filtered DataFrame just to take its length.
        counts.append((sector, int(in_window.sum())))
    return counts

In [None]:
# Extract the data from the files and save the data.
# Every window size produces its own table, written as data_{window}.csv into `folder`.
for window in windows:
    for file in file_names:
        protein_name = file.split('_')[0]  # Text before the first underscore of the file name.
        fpath = _ospath.join(folder, file)
        locs, info = _io.load_locs(fpath)
        # Turn the (sector, count) pairs into a lookup table keyed by sector.
        sector_count_dict = dict(calculate_neighborhood_counts(locs, window, protein_name))
        # Sectors that are absent from the lookup table are filled with NaN.
        data[protein_name] = data.index.map(
            lambda sector_id: sector_count_dict.get(sector_id, _np.nan)
        )

    # Keep exactly these protein columns, in this order.
    order = [
        'S2P', 'S5P', 'SC35',
        'H3K4me3', 'H3K27ac', 'CTCF',
        'H3K27me3', 'H3K9me3', 'Lamin',
    ]
    data = data.loc[:, order]

    save_data_csv(data, window, folder)
    print(f'Saved feature counts for window size {window} to {folder}/data_{window}.csv')