In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Step 1: Count the data rows to process, leaving out the header line and the last row.
# The context manager closes the file handle as soon as the count is done.
with open('encoded_file.csv') as snp_file:
    line_count = sum(1 for _ in snp_file)

# pd.read_csv consumes the first line as the header, so the file holds
# line_count - 1 data rows; one more is subtracted to ignore the last row.
# Clamped at 0 so a header-only (or empty) file does not yield a negative limit.
total_rows = max(line_count - 2, 0)

# Load the phenotype vector (DTF column from the 2nd column in Phenotype_DTF_2019.csv)
phenotype_vector = pd.read_csv("Phenotype_DTF_2019.csv").iloc[:, 1]

# Initialize lists to store results (filled in parallel, one entry per fitted model)
performance_scores = []
chromosomes = []
positions = []
processed_snps = []  # To keep track of successfully processed SNP names

# Define the scaler once; it is re-fit on every SNP column it is applied to
scaler = StandardScaler()

# Step 2: Read the SNP file in chunks (10 rows at a time, excluding the last row).
# nrows caps the reader at total_rows, so the final chunk is shortened by the
# reader itself and no rows past the limit are ever yielded.
chunk_size = 10  # Fixed chunk size of 10 rows
snp_reader = pd.read_csv(
    'encoded_file.csv',
    chunksize=chunk_size,
    iterator=True,
    nrows=total_rows,
)

# Initialize the number of processed rows (offset into the phenotype vector)
processed_rows = 0

# Step 3: Process 10 rows in each chunk, and iterate over SNP columns one by one.
# A separate model is fit for every (chunk, SNP) pair, so each SNP contributes
# one entry to the result lists per chunk.
for chunk in snp_reader:
    # If the last chunk goes beyond the row limit, trim it to avoid processing the last row
    if processed_rows + len(chunk) > total_rows:
        chunk = chunk.iloc[:total_rows - processed_rows, :]

    # Nothing left inside the row limit: stop here, because fitting the scaler
    # on zero samples raises a ValueError.
    if len(chunk) == 0:
        break

    # The phenotype slice depends only on the chunk's row range, so compute it
    # once per chunk instead of once per SNP column.
    phenotype_chunk = phenotype_vector.iloc[processed_rows:processed_rows + len(chunk)].values
    lengths_match = len(phenotype_chunk) == len(chunk)

    # Skip the first column (assumed to be the index) and iterate over SNP columns
    for snp in chunk.columns[1:]:
        # Extract chromosome and position from SNP name (assuming 'chromosome_position' format)
        chrom, pos = snp.split('_')
        chrom = int(chrom[1:])  # Assumes SNP names are like 'C1_1234'
        pos = int(pos)

        # The phenotype vector has fewer rows than this chunk needs; skip this SNP column
        if not lengths_match:
            print(f"Skipping SNP '{snp}' due to mismatched lengths.")
            continue

        # Extract the SNP column for this chunk, reshape to (n_rows, 1), and scale it
        X_snp = chunk[snp].values.reshape(-1, 1)  # SNP data from the chunk (up to 10 rows)
        X_snp_scaled = scaler.fit_transform(X_snp)

        # Train the SVR model
        svr_model = SVR(kernel='linear')  # Use linear kernel for simplicity
        svr_model.fit(X_snp_scaled, phenotype_chunk)

        # Calculate the performance score (MSE on the same rows the model was fit on)
        predictions = svr_model.predict(X_snp_scaled)
        mse = mean_squared_error(phenotype_chunk, predictions)

        # Append data only if successfully processed
        performance_scores.append(mse)
        chromosomes.append(chrom)
        positions.append(pos)
        processed_snps.append(snp)  # Track the SNP name

    # Update processed rows count after each chunk
    processed_rows += len(chunk)

# Step 4: Assemble the results table: SNP name, raw score, -log10 of the score,
# chromosome and position, ordered by chromosome and then position.
neg_log10_scores = -np.log10(performance_scores)
results_df = pd.DataFrame(
    {
        'SNP': processed_snps,
        'performance_score': performance_scores,
        '-log10(Performance)': neg_log10_scores,
        'Chromosome': chromosomes,
        'Position': positions,
    }
).sort_values(by=['Chromosome', 'Position'])

# Keep the 200 rows with the largest -log10 score (i.e. the smallest MSE)
top_200_snps = results_df.nlargest(n=200, columns='-log10(Performance)')

# Write the SNP name, chromosome and position of those rows to CSV
top_200_snps.loc[:, ['SNP', 'Chromosome', 'Position']].to_csv(
    "top_200_snp_positions.csv", index=False
)

# Set up figure size and axis for Manhattan plot
plt.figure(figsize=(12, 6))

# Create 'chromosome position' (1, 2, 3, ... within each chromosome) to spread
# chromosomes across the x-axis. nlargest() leaves the rows ranked by score, so
# the numbering is taken from a copy sorted by chromosome and position; the
# assignment aligns on the index and leaves the row order of top_200_snps as is.
plot_order = top_200_snps.sort_values(['Chromosome', 'Position'])
top_200_snps['Chromosome_Position'] = plot_order.groupby('Chromosome').cumcount() + 1

# Create a color map to assign a different color to each chromosome.
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
colors = plt.get_cmap('tab20', 20)

# Plot data for each chromosome separately to apply different colors
x_labels = []
x_ticks = []
cumulative_position = 0

# Iterate in ascending chromosome order so the x-axis reads Chr 1, Chr 2, ...
for chrom in sorted(top_200_snps['Chromosome'].unique()):
    chrom_data = top_200_snps[top_200_snps['Chromosome'] == chrom]
    # Wrap the color index so chromosome numbers above the palette size still
    # get a palette entry instead of all collapsing onto the last color.
    color_index = (int(chrom) - 1) % colors.N
    plt.scatter(chrom_data['Chromosome_Position'] + cumulative_position,
                chrom_data['-log10(Performance)'],
                color=colors(color_index), label=f'Chr {chrom}', s=10, edgecolor='k')

    # Add mid-point of each chromosome for labeling; points sit at offsets
    # 1..len(chrom_data), so their centre is (len + 1) / 2 past the running offset.
    x_labels.append(f'Chr {chrom}')
    x_ticks.append(cumulative_position + (len(chrom_data) + 1) / 2)

    # Update cumulative position
    cumulative_position += len(chrom_data)

# Customize the plot: title, axis labels and grid
plt.title('Manhattan Plot for Top 200 SNPs Based on Performance')
plt.xlabel('Chromosome')
plt.ylabel('-log10(Performance)')
plt.grid(True)

# Add x-ticks at the midpoints of each chromosome's SNP positions
plt.xticks(ticks=x_ticks, labels=x_labels, rotation=45, ha='left')

# Save the plot as a PNG image
plt.tight_layout()
plt.savefig('top_200_manhattan_plot.png', dpi=300)  # Save with high resolution (300 DPI)

# Show the plot
plt.show()
