## This notebook is used to encode sample genotype datasets using PAGER (Phenotype-Adjusted Genotype Encoding and Ranking)

Dataset 1 - Continuous Phenotype  

Dataset 2 - Discrete Phenotype

### STEP 1 - Import all the required packages

In [20]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

### STEP 2 - Define the PAGER encoding function

In [65]:
def pager_encode(genotypes, phenotype):
    """Encode the genotypes of every SNP using the PAGER formula.

    For each SNP, the mean phenotype of every observed genotypic class is
    computed, expressed relative to the mean of the anchor class (the lowest
    genotype code present, i.e. 0 when all three classes are observed) and
    min-max normalized to the range [0, 1]. Every genotype is then replaced
    by the normalized value of its class; missing genotypes stay missing.

    SNPs with fewer than two observed genotypic classes cannot be ranked and
    are copied to the output unchanged.

    Parameters
    ----------
    genotypes : pandas dataframe
        A dataframe containing the genotypes of the SNPs, with the SNPs as
        columns and the samples as rows. Values must be convertible to float.
    phenotype : pandas series
        A series containing the phenotypes of the samples, with the samples
        as index and the phenotypes as values.

    Returns
    -------
    pager_encoded_genotypes : pandas dataframe
        The PAGER encoded genotypes, with the SNPs as columns and the samples
        as rows.
    missing_genotypes_df : pandas dataframe
        One row per SNP that lacks at least one of the genotypic classes
        0, 1, 2. The SNP_NAME column holds the SNP name and the
        MISSING_GENOTYPES column the comma-separated absent classes.
    """
    # The phenotype is the same for every SNP, so convert it only once.
    phenotype_values = phenotype.astype(float)

    # Collect the results and build both dataframes once at the end; growing
    # a dataframe inside the loop is quadratic and fragments its memory.
    encoded_columns = {}
    missing_records = []

    for column in genotypes:
        snp = genotypes[column]
        genotype_values = snp.astype(float)

        # Pair the single SNP with the phenotype (aligned on the sample index)
        snp_df = pd.DataFrame({'genotype': genotype_values, 'phenotype': phenotype_values})

        # Identify unique genotypic classes, excluding NAs
        unique_genotypes = snp_df['genotype'].dropna().unique()

        # Report the genotypic classes that never occur for this SNP
        missing_genotypes = {0, 1, 2} - set(unique_genotypes)
        if missing_genotypes:
            print(f"Missing genotypes for {snp.name}: {missing_genotypes}")
            missing_records.append({
                'SNP_NAME': snp.name,
                'MISSING_GENOTYPES': ', '.join(map(str, sorted(missing_genotypes))),
            })

        # With zero or one observed class there is nothing to rank, so keep
        # the original column. Counting the observed classes (rather than the
        # missing ones) also covers SNPs that are entirely NA.
        if len(unique_genotypes) < 2:
            encoded_columns[column] = snp
            continue

        # Phenotypic mean per genotypic class, indexed by genotype
        mean_phenotype = snp_df.groupby('genotype')['phenotype'].mean()

        # PAGER formula: distance of each class mean from the anchor class,
        # where the anchor is the lowest genotype code present
        anchor_mean = mean_phenotype.loc[mean_phenotype.index.min()]
        rel_dist = mean_phenotype - anchor_mean

        # Min-max normalization of the relative distances. When all class
        # means coincide the range is zero; divide by 1 so every class maps
        # to 0 instead of producing a division by zero.
        lowest = rel_dist.min()
        span = rel_dist.max() - lowest
        if span == 0:
            span = 1.0
        normalized_rel_dist = (rel_dist - lowest) / span

        # Replace each genotype by the normalized value of its class. The
        # float-cast genotypes are mapped so the keys match the groupby index.
        encoded_columns[column] = genotype_values.map(normalized_rel_dist)

    pager_encoded_genotypes = pd.DataFrame(encoded_columns)
    missing_genotypes_df = pd.DataFrame(missing_records, columns=['SNP_NAME', 'MISSING_GENOTYPES'])

    # Return the PAGER encoded genotypes and the missing genotypes dataframe
    return pager_encoded_genotypes, missing_genotypes_df

### STEP 3 - PAGER encode Dataset 1 - Continuous Phenotype

In [69]:
# Load the continuous-phenotype sample dataset
data_continuous_phenotype = pd.read_csv('/Users/ghosha/Documents/VSCode Projects/pager/data/sample_data_cont.csv')

# The last column is the phenotype; every column before it is a SNP
phenotype = data_continuous_phenotype.iloc[:, -1]
genotype_columns = data_continuous_phenotype.iloc[:, :-1]

# PAGER-encode the SNPs; the second result lists the SNPs lacking a genotypic class
pager_encoded_genotype, missing_genotypes_df = pager_encode(genotype_columns, phenotype)

# Put the phenotype back as the last column of the encoded data
pager_encoded_genotype = pd.concat([pager_encoded_genotype, phenotype], axis='columns')

# Write the PAGER encoded data (without the row index) to a csv file
pager_encoded_genotype.to_csv('/Users/ghosha/Documents/VSCode Projects/pager/data/python_sample_data_cont_pager_encoded.csv', index=False)

# Write the missing-genotype report to a csv file.
# NOTE(review): the discrete-phenotype step writes to this same path, so whichever step runs last overwrites the report.
missing_genotypes_df.to_csv('/Users/ghosha/Documents/VSCode Projects/pager/data/missing_genotypes.csv', index=False)

Missing genotypes for SNP4: {0}
Missing genotypes for SNP7: {0}
Missing genotypes for SNP21: {0}
Missing genotypes for SNP22: {0}
Missing genotypes for SNP25: {0}
Missing genotypes for SNP29: {0}
Missing genotypes for SNP33: {0}
Missing genotypes for SNP39: {0}
Missing genotypes for SNP61: {0, 1}
Missing genotypes for SNP86: {0}
Missing genotypes for SNP90: {0}
Missing genotypes for SNP91: {0}
Missing genotypes for SNP93: {0, 1}


### STEP 4 - PAGER encode Dataset 2 - Discrete Phenotype

In [68]:
# Load the discrete-phenotype sample dataset.
# NOTE(review): the variable name says "continuous" although this step reads sample_data_disc.csv;
# the name is kept as-is because it is a notebook-level variable.
data_continuous_phenotype = pd.read_csv('/Users/ghosha/Documents/VSCode Projects/pager/data/sample_data_disc.csv')

# The last column is the phenotype; every column before it is a SNP
phenotype = data_continuous_phenotype.iloc[:, -1]
genotype_columns = data_continuous_phenotype.iloc[:, :-1]

# PAGER-encode the SNPs; the second result lists the SNPs lacking a genotypic class
pager_encoded_genotype, missing_genotypes_df = pager_encode(genotype_columns, phenotype)

# Put the phenotype back as the last column of the encoded data
pager_encoded_genotype = pd.concat([pager_encoded_genotype, phenotype], axis='columns')

# Write the PAGER encoded data (without the row index) to a csv file
pager_encoded_genotype.to_csv('/Users/ghosha/Documents/VSCode Projects/pager/data/sample_data_disc_pager_encoded.csv', index=False)

# Write the missing-genotype report to a csv file.
# NOTE(review): the continuous-phenotype step writes to this same path, so whichever step runs last overwrites the report.
missing_genotypes_df.to_csv('/Users/ghosha/Documents/VSCode Projects/pager/data/missing_genotypes.csv', index=False)

Missing genotypes for SNP4: {0}
Missing genotypes for SNP7: {0}
Missing genotypes for SNP21: {0}
Missing genotypes for SNP22: {0}
Missing genotypes for SNP25: {0}
Missing genotypes for SNP29: {0}
Missing genotypes for SNP33: {0}
Missing genotypes for SNP39: {0}
Missing genotypes for SNP61: {0, 1}
Missing genotypes for SNP86: {0}
Missing genotypes for SNP90: {0}
Missing genotypes for SNP91: {0}
Missing genotypes for SNP93: {0, 1}
