In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colors
from tqdm import tqdm
import glob
import sklearn
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

import seaborn as sns
import tensorflow as tf
import keras
from keras.layers import Input, Dense, LeakyReLU
from keras.models import Model, Sequential, load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping

import os
import gzip
import sys

import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault)

# LCC data import


In [None]:
# Import LCC data files for the WT protein, keyed by window size: wt_dict[window]
wt_files = glob.glob('w*.lccdata')
wt_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))  # Sort by the first integer in the file name

window_range = list(range(2, 51))

# zip() stops at the shorter input, so a missing or extra file would silently
# shift every later window onto the wrong file. Fail loudly instead.
if len(wt_files) != len(window_range):
    raise ValueError(
        f"Expected {len(window_range)} WT files matching 'w*.lccdata' "
        f"(one per window size {window_range[0]}-{window_range[-1]}), "
        f"found {len(wt_files)}"
    )

wt_dict = {}
for window, file in zip(window_range, wt_files):
    # Column 0 is discarded; presumably a frame index / time column -- TODO confirm
    wt_dict[window] = pd.DataFrame(np.loadtxt(file)).drop(columns=0)
    # Report the file that was actually loaded for this window so the
    # window <-> file correspondence can be checked by eye.
    print(f"Window {window} corresponds to {file}")

In [None]:
# Import LCC data files for the mutant (D132H) protein, keyed by window size: D132H_dict[window]
m_files = glob.glob('m*.lccdata')
# Natural sort: digit runs compare as integers, everything else as text
m_files.sort(key=lambda x: [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', x)])

window_range = list(range(2, 51))

# zip() stops at the shorter input, so a missing or extra file would silently
# shift every later window onto the wrong file. Fail loudly instead.
if len(m_files) != len(window_range):
    raise ValueError(
        f"Expected {len(window_range)} mutant files matching 'm*.lccdata' "
        f"(one per window size {window_range[0]}-{window_range[-1]}), "
        f"found {len(m_files)}"
    )

D132H_dict = {}
for window, file in zip(window_range, m_files):
    # Column 0 is discarded; presumably a frame index / time column -- TODO confirm
    D132H_dict[window] = pd.DataFrame(np.loadtxt(file)).drop(columns=0)
    # Report the file that was actually loaded for this window so the
    # window <-> file correspondence can be checked by eye.
    print(f"Window {window} corresponds to {file}")


In [None]:
# Quick look at one WT table and one mutant table
print('WT for window size = 5')
display(wt_dict[5])

# Blank lines, a separator rule, then the mutant heading (one line each)
print('\n', '---------------------------------', 'D132H for window size = 12', sep='\n')
display(D132H_dict[12])

# Local compaction plots, window sizes 2-50

In [None]:
# Plot distance measurements at each position
def LCC_plot(window, wt, mutant, ax, protein_length=70, seq_offset=90, frame_step=10, alpha=0.002):
    '''
    Overlay per-frame LCC distance profiles for WT (blue) and mutant (red) on one axes.

    Parameters
    ----------
    window : int
        Window size the profiles were computed with; sets the x range and the title.
    wt, mutant : pandas.DataFrame
        One row per trajectory frame, one column per window position.
        Both must have the same number of columns, because they are drawn
        against the same x grid.
    ax : matplotlib axes
        Axes to draw on.
    protein_length : int, default 70
        Maximum protein length used to derive the last window position.
    seq_offset : int, default 90
        Constant added to every x position; presumably the residue-numbering
        offset of the construct -- TODO confirm.
    frame_step : int, default 10
        Only every ``frame_step``-th frame is drawn.
    alpha : float, default 0.002
        Line opacity; very low so that many overlaid frames form a density.

    Returns
    -------
    list
        The value returned by the last mutant ``ax.plot`` call, or an empty
        list when no frame was drawn.
    '''

    wt = wt.to_numpy()
    mutant = mutant.to_numpy()

    frame_number_wt = wt.shape[0]
    frame_number_mutant = mutant.shape[0]

    if frame_number_wt != frame_number_mutant:
        print('Different number of trajectory frames read in for mutant and WT')
    # Only frames present in both trajectories can be paired, so use the
    # shorter length (previously frame_number was left undefined here).
    frame_number = min(frame_number_wt, frame_number_mutant)

    upper_limit = protein_length + 1 - window  # max protein length + 1

    # The x grid depends only on the window and the column count, so build it once.
    x = np.linspace(1 + window / 2 + seq_offset,
                    upper_limit + window / 2 + seq_offset,
                    wt.shape[1])

    lcc = []  # stays empty when the loop below draws nothing
    # NOTE: sampling starts at index 1, so the first frame (index 0) is never drawn.
    for z in range(1, frame_number, frame_step):
        ax.plot(x, wt[z], color='blue', alpha=alpha)
        lcc = ax.plot(x, mutant[z], color='red', alpha=alpha)

    ax.set_xlabel('Amino Acid Sequence Position')
    # Raw string: '\A' is not a valid escape sequence in a normal string literal.
    ax.set_ylabel(r'Distance ($\AA$)')
    ax.xaxis.set_tick_params(which='both', labelbottom=True)
    plot_name = 'Sequence Distance Distribution: Window Size ' + str(window) + ' aa'
    ax.set_title(plot_name)

    return lcc


plt.rcParams['font.size'] = '18'

# One stacked panel per window size (2..50), each 10 inches tall
plot_windows = range(2, 51)
fig, axes = plt.subplots(nrows=len(plot_windows), ncols=1,
                         figsize=(15, 10 * len(plot_windows)))

# Pair every window size with its own axes, top to bottom
for window, panel in zip(tqdm(plot_windows), axes.flat):
    LCC_plot(window, wt_dict[window], D132H_dict[window], panel)

plt.show()