# PGS Analysis from GitHub Clone
This notebook runs the full PGS001298 workflow after cloning the GitHub repository: it decompresses the scoring file, cleans it, and plots the effect-weight distributions.

In [ ]:
# Step 0: Clone the repo (run once)
!git clone https://github.com/Arun21P/PGS001298_analysis.git

In [ ]:
from pathlib import Path
import gzip
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import os

In [ ]:
# Directory layout of the cloned repository.
# NOTE(review): the absolute /content prefix assumes the repo was cloned
# there (the usual Colab working directory) — confirm for other environments.
PROJECT_ROOT = Path("/content/PGS001298_analysis")
DATA_DIR = PROJECT_ROOT.joinpath("data")
OUTPUT_DIR = PROJECT_ROOT.joinpath("output")

# Cleaned tables and figures are written here, so create it up front
# (including any missing parents); an existing directory is not an error.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [ ]:
def unzip_gz_file(gz_path, data_dir):
    """Decompress a gzip file into ``data_dir`` and return the output path.

    The output file is named after ``gz_path`` with its last suffix removed
    (``scores.txt.gz`` -> ``scores.txt``). If that file already exists it is
    left untouched and decompression is skipped.

    Args:
        gz_path: Path (``str`` or ``Path``) of the gzip-compressed input.
        data_dir: Directory (``str`` or ``Path``) that receives the
            decompressed file; it is created if it does not exist.

    Returns:
        ``Path`` of the decompressed file inside ``data_dir``.

    Raises:
        FileNotFoundError: If ``gz_path`` does not exist.
        EOFError, OSError: If the archive is truncated or not valid gzip.
            No output file is left behind in that case.
    """
    gz_path = Path(gz_path)
    data_dir = Path(data_dir)
    txt_path = data_dir / gz_path.stem

    if txt_path.exists():
        print(f"Unzipped file already exists: {txt_path}")
        return txt_path

    data_dir.mkdir(parents=True, exist_ok=True)

    # Decompress into a scratch file and rename it only on success.
    # Writing straight to txt_path would leave a partial file behind after a
    # failed or interrupted run, and the exists() check above would then
    # silently reuse that truncated file forever.
    tmp_path = txt_path.with_name(txt_path.name + ".part")
    try:
        with gzip.open(gz_path, 'rb') as f_in, open(tmp_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        tmp_path.replace(txt_path)
    finally:
        # After a successful rename the scratch file is gone and this is a
        # no-op; on any failure it removes the partial output.
        tmp_path.unlink(missing_ok=True)

    print(f"Unzipped file saved to: {txt_path}")
    return txt_path

In [ ]:
def preprocess_pgs_data(file_path, output_dir=None):
    """Clean a tab-separated PGS scoring file and save the result.

    Steps, in order:
      1. Read the file as TSV, skipping ``#`` comment lines.
      2. Drop columns that are entirely empty, then rows with any missing
         value in the remaining columns.
      3. Keep only rows whose ``hm_chr`` is one of ``1``..``22``.
      4. Cast ``hm_chr``/``hm_pos`` to int, ``effect_weight`` to float and
         ``rsID`` to str.
      5. Drop ``chr_name`` and ``chr_position`` when present.
      6. Write the table to ``PGS001298_hmPOS_GRCh38_cleaned.txt`` (TSV, no
         index) in the output directory.

    Args:
        file_path: Path of the uncompressed scoring file.
        output_dir: Directory for the cleaned file. Defaults to the
            module-level ``OUTPUT_DIR``. Created if missing.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        KeyError: If ``hm_chr``, ``hm_pos``, ``effect_weight`` or ``rsID`` is
            missing (or was dropped for being entirely empty).
    """
    # Read hm_chr as text. Left to inference, pandas yields an integer column
    # when the file holds only numeric chromosomes, and the string-based
    # autosome filter below would then match nothing and drop every row.
    df = pd.read_csv(file_path, sep='\t', comment='#', dtype={'hm_chr': str})

    df = df.dropna(axis=1, how='all')
    # NOTE(review): this removes a row if ANY remaining column is missing,
    # including chr_name/chr_position which are discarded below — confirm
    # that this strictness is intended.
    df = df.dropna()

    autosomes = {str(i) for i in range(1, 23)}
    # .copy() so the column assignments below act on an independent frame
    # rather than a slice of the unfiltered one (avoids SettingWithCopyWarning).
    df = df[df['hm_chr'].isin(autosomes)].copy()

    df['hm_chr'] = df['hm_chr'].astype(int)
    df['hm_pos'] = df['hm_pos'].astype(int)
    df['effect_weight'] = df['effect_weight'].astype(float)
    df['rsID'] = df['rsID'].astype(str)

    # These columns may be absent, or already removed above for being
    # entirely empty; that must not abort the preprocessing.
    df = df.drop(columns=['chr_name', 'chr_position'], errors='ignore')

    out_dir = OUTPUT_DIR if output_dir is None else Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    output_file = out_dir / 'PGS001298_hmPOS_GRCh38_cleaned.txt'
    df.to_csv(output_file, sep='\t', index=False)
    print(f"Cleaned DataFrame saved to {output_file}")
    return df

In [ ]:
def exploratory_analysis(clean_df):
    """Print summary statistics and show a histogram of effect weights.

    Reports the row and column counts, the sorted unique values of
    ``hm_chr`` and the number of rows per ``hm_chr`` value, then displays a
    50-bin histogram of ``effect_weight`` across the whole table.

    Args:
        clean_df: DataFrame with ``hm_chr`` and ``effect_weight`` columns.

    Returns:
        None. Output goes to stdout and the active matplotlib backend.
    """
    n_variants, n_columns = clean_df.shape
    chromosomes = clean_df['hm_chr']

    print('Number of variants:', n_variants)
    print('Number of columns:', n_columns)
    print('Unique chromosomes in dataset:', sorted(chromosomes.unique()))
    print('\nVariants per chromosome:')
    # sort_index() orders the counts by chromosome rather than by frequency.
    per_chromosome = chromosomes.value_counts().sort_index()
    print(per_chromosome)

    plt.figure(figsize=(6, 4))
    plt.hist(clean_df['effect_weight'], bins=50)
    plt.title('Distribution of Effect Weight (All Chromosomes)')
    plt.xlabel('Effect Weight')
    plt.ylabel('Frequency')
    plt.show()

In [ ]:
def plot_chr_effect_weight_histogram(clean_df, chr_num=21):
    """Plot, save and summarise effect weights for a single chromosome.

    Selects the rows whose ``hm_chr`` equals ``chr_num``. If none match, a
    message is printed and nothing is plotted. Otherwise a 50-bin histogram
    of ``effect_weight`` is saved as ``chr<chr_num>_effect_weight_hist.png``
    under ``OUTPUT_DIR`` (300 dpi), shown, and followed by the variant count
    and ``describe()`` statistics.

    Args:
        clean_df: DataFrame with ``hm_chr`` and ``effect_weight`` columns.
        chr_num: Chromosome value to select; compared with ``==`` against
            ``hm_chr``.

    Returns:
        None.
    """
    is_selected = clean_df['hm_chr'] == chr_num
    chr_df = clean_df.loc[is_selected]

    # Nothing to draw: report it and leave without touching matplotlib.
    if chr_df.empty:
        print(f'No variants found for chromosome {chr_num}.')
        return

    weights = chr_df['effect_weight']

    plt.figure(figsize=(6, 4))
    plt.hist(weights, bins=50)
    plt.title(f'Effect Weight Distribution on Chromosome {chr_num}')
    plt.xlabel('Effect Weight')
    plt.ylabel('Frequency')

    # Save before show(): the figure is written while it is still current.
    save_path = OUTPUT_DIR / f'chr{chr_num}_effect_weight_hist.png'
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f'Plot saved to {save_path}')
    print(f'Number of variants on chromosome {chr_num}: {chr_df.shape[0]}')
    print(weights.describe())

In [ ]:
# Step 1: Decompress the scoring file from the data directory
# (skipped automatically when the uncompressed copy already exists).
gz_file = DATA_DIR.joinpath('PGS001298_hmPOS_GRCh38.txt.gz')
txt_file = unzip_gz_file(gz_path=gz_file, data_dir=DATA_DIR)

In [ ]:
# Step 2: Clean the decompressed file; the cleaned table is also written
# to OUTPUT_DIR as a side effect.
clean_df = preprocess_pgs_data(file_path=txt_file)

In [ ]:
# Step 3: Print summary counts and show the genome-wide effect-weight histogram
exploratory_analysis(clean_df=clean_df)

In [ ]:
# Step 4: Plot and save the effect-weight histogram for chromosome 21
plot_chr_effect_weight_histogram(clean_df=clean_df, chr_num=21)