In [None]:
import pandas as pd
import glob
import os

# Load the master variant table. The file is read as tab-separated text with
# no header row, so pandas labels the columns with integers starting at 0.
combined_df = pd.read_csv(
    'path to MasterVariantTableXtreme.txt',
    sep='\t',
    header=None,
    low_memory=False,
)

# Names for the first nine columns, listed in file order. Any columns beyond
# these keep their integer labels after the rename.
leading_column_names = [
    'Tumor_Sample_Barcode',
    'Chromosome',
    'Start_Position',
    'ref',
    'alt',
    'DP',
    'Ref_DP',
    'Alt_DP',
    'AF',
]
new_column_names = dict(enumerate(leading_column_names))

# Map the integer labels 0..8 onto the descriptive names above.
combined_df = combined_df.rename(columns=new_column_names)



# Low-AF call set: per sample, keep variants with 2 < AF < 20 and DP > 200.
# AF appears to be expressed in percent (the printed rows show e.g.
# Alt_DP=33, DP=925, AF=3.57).
#
# groupby(sort=False) yields one sub-frame per barcode in order of first
# appearance, with rows in their original order -- the same pieces the
# per-barcode boolean scan produced, without rescanning the whole table once
# per barcode.
filtered_dfs = []
for barcode, barcode_df in combined_df.groupby('Tumor_Sample_Barcode', sort=False):
    # Apply the filtering conditions
    filtered_barcode_df = barcode_df[
        (barcode_df['AF'] < 20)
        & (barcode_df['AF'] > 2)
        & (barcode_df['DP'] > 200)
    ]

    # Samples with no qualifying variant contribute nothing.
    if not filtered_barcode_df.empty:
        filtered_dfs.append(filtered_barcode_df)

# Concatenate all filtered dataframes (original row index is preserved).
if filtered_dfs:
    final_df = pd.concat(filtered_dfs)
    print(final_df)
else:
    # Always bind final_df: later steps merge on it, and leaving it undefined
    # would turn "no matches" into a NameError. An empty slice keeps the
    # column layout so the merge still works and simply yields no rows.
    final_df = combined_df.iloc[0:0]
    print("No data found matching the criteria.")


# High-AF call set: per sample, keep variants with AF > 40 and DP > 200.
# Every sample contributes a frame (possibly empty), and the concatenated
# result is re-indexed from 0.
filtered_dfs2 = []
barcode_column = combined_df['Tumor_Sample_Barcode']
for sample_id in barcode_column.unique():
    per_sample = combined_df.loc[barcode_column == sample_id]
    deep_enough = per_sample['DP'] > 200
    high_af = per_sample['AF'] > 40
    filtered_dfs2.append(per_sample.loc[high_af & deep_enough])

final_df2 = pd.concat(filtered_dfs2, ignore_index=True)
final_df2

# Pair every low-AF row with every high-AF row at the same genomic position.
# Columns present in both frames get the default suffixes: "_x" for the
# low-AF side (final_df) and "_y" for the high-AF side (final_df2).
# NOTE(review): the join key is position only; ref/alt are not compared, so
# different alleles at the same position also count as a match -- confirm
# this is intended.
position_keys = ['Chromosome', 'Start_Position']
merged_df = final_df.merge(final_df2, on=position_keys, how='inner')

# Number of shared positions for each (low-AF sample, high-AF sample) pair,
# largest first.
pair_columns = ['Tumor_Sample_Barcode_x', 'Tumor_Sample_Barcode_y']
pair_sizes = merged_df.groupby(pair_columns).size()
match_counts = pair_sizes.sort_values(ascending=False)
match_counts

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Relies on 'match_counts' from the preceding step: a Series of pair counts
# indexed by (Tumor_Sample_Barcode_x, Tumor_Sample_Barcode_y).

# Write one bar chart per low-AF sample (index level 0) into a single
# multi-page PDF; each page shows that sample's counts per high-AF sample.
with PdfPages('distribution_plots.pdf') as pdf:
    low_af_samples = match_counts.index.get_level_values(0).unique()
    for barcode in low_af_samples:
        # Selecting on the first index level leaves a Series keyed by the
        # second level (the "_y" barcode).
        barcode_counts = match_counts[barcode]

        fig, ax = plt.subplots(figsize=(8, 6))
        barcode_counts.plot.bar(ax=ax)
        ax.set_title(f"Distribution of Counts for {barcode}")
        ax.set_xlabel("Tumor_Sample_Barcode_y")
        ax.set_ylabel("Count")

        # Slant the barcode labels so long names stay readable.
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_ha('right')

        fig.tight_layout()  # keep the rotated labels inside the page
        pdf.savefig(fig)
        plt.close(fig)  # release the figure before drawing the next one

print("Plots saved to distribution_plots.pdf")

         Tumor_Sample_Barcode Chromosome  Start_Position  \
4      CPDC2501252-SEQ-250136       chr1        26696449   
13     CPDC2501252-SEQ-250136       chr1        77966964   
15     CPDC2501252-SEQ-250136       chr1        77970017   
16     CPDC2501252-SEQ-250136       chr1        77970018   
17     CPDC2501252-SEQ-250136       chr1        77970018   
...                       ...        ...             ...   
23880  CPDV2501512-SEQ-250136       chrX        67545317   
23883  CPDV2501512-SEQ-250136       chrX        67545401   
23887  CPDV2501512-SEQ-250136       chrX        67546515   
23888  CPDV2501512-SEQ-250136       chrX        67546515   
23893  CPDV2501512-SEQ-250136       chrX        77599826   

                                  ref alt   DP  Ref_DP  Alt_DP         AF  \
4                                 CCG   -  925     892      33   3.567568   
13                                  A   -  323     260      63  19.504644   
15                                  -  AA  269  