In [8]:
#Making label 
import csv

# Define input and output file paths.
# input_csv is the database table that generate_labels_template reads.
input_csv = 'iToL_DB.csv'
# The 'iTOL_annotationfiles' directory must already exist: opening a file in
# 'w' mode does not create missing parent directories.
output_txt = 'iTOL_annotationfiles/iToL_labels.txt'

# Function to read CSV and generate the required format
def generate_labels_template(input_csv, output_txt):
    """Write an iTOL LABELS annotation file from the protein database CSV.

    Parameters
    ----------
    input_csv : str
        Path to a UTF-8, comma-separated file that has the columns 'Tree',
        'Protein', 'Gene' and 'Protein_name'.
    output_txt : str
        Path of the text file to create (overwritten if it exists).

    The output starts with the three lines "LABELS", "SEPARATOR COMMA" and
    "DATA". Every row whose 'Tree' cell is exactly 'Y' then contributes one
    line of the form "<Protein>,<Gene> <Protein_name>," (note the trailing
    comma). Rows with any other 'Tree' value are skipped.
    """
    header = ("LABELS\n", "SEPARATOR COMMA\n", "DATA\n")

    with open(input_csv, 'r', newline='', encoding='utf-8') as source, \
            open(output_txt, 'w', encoding='utf-8') as sink:
        records = csv.DictReader(source)
        sink.writelines(header)

        for record in records:
            # Only rows flagged for the tree (Tree == 'Y') get a label.
            if record['Tree'] != 'Y':
                continue

            node_id = record['Protein']
            gene_symbol = record['Gene']
            full_name = record['Protein_name']

            # Node id first, then the displayed label "<gene> <protein name>".
            sink.write(f"{node_id},{gene_symbol} {full_name},\n")

# Generate the labels file: reads input_csv ('iToL_DB.csv') and writes
# output_txt ('iTOL_annotationfiles/iToL_labels.txt').
generate_labels_template(input_csv, output_txt)


In [10]:
import pandas as pd

# Source table (the original, unmodified database)
csv_file = 'iToL_DB.csv'

# Load it; read_csv splits on ',' by default
df = pd.read_csv(csv_file)

# Distinct labels found in the 'Class' column
unique_classes = df['Class'].unique()

# Show one label per line
for cls in unique_classes:
    print(cls)


WAKL
RKF3
Extensin
CrRLK1L-1
LRK10L-2
SD-2b
DUF26
L-LEC
CR4L
SD1
WAK
C-LEC
LRR-I
LRR-RLP
LRR-XV
LRR-II
LRR-VIII-1
LRR-III
LRR-Xb
LRR-VII
LRR-XII
LRR-IV
LRR-XI
LRR-XIV
LRR-XIIIb
LRR-VI-1
LRR-IX
LRR_XVI
LRR-VI-2
LRR-Xa
LRR-XIIIa
LRR-V
LRR-VIII-2
LysM-II
URK-1
PERK
LysM-I


In [5]:
import pandas as pd

# File path to the original (unmodified) database CSV
csv_file = 'iToL_DB.csv'

# Read the comma-separated file into a pandas DataFrame; the 'Class' column of
# this frame is what gets simplified further down in this cell.
df = pd.read_csv(csv_file, sep=',')

# Mapping dictionary for simplifying Class values: the small families listed
# here are all collapsed into 'Etc'. It is defined before simplify_class so the
# function never relies on a name that is only created later in the cell.
class_mapping = {
    'RKF3': 'Etc',
    'URK-1': 'Etc',
    'Extensin': 'Etc',
    'CR4L' : 'Etc',
    'LRK10L-2' : 'Etc'
}


# Function to simplify Class values based on mapping
def simplify_class(class_name):
    """Collapse a detailed receptor class label into a coarse family name.

    Rules, applied in this order:

    1. Labels starting with 'LRR-I', 'LRR-V', 'LRR-X' or 'LRR_X' -> 'LRR-RK'.
    2. Labels starting with 'LysM' -> 'LysM-RK', with 'SD' -> 'SD-RK',
       with 'WAK' -> 'WAK' (so 'WAKL' also becomes 'WAK').
    3. Exact matches: 'DUF26' -> 'CRK', 'L-LEC' -> 'L-LEC-RK',
       'C-LEC' -> 'C-LEC-RK'; 'LRR-RLP', 'CrRLK1L' and 'PERK' map to themselves.
    4. Labels listed in the module-level ``class_mapping`` -> its value ('Etc').
    5. Anything else is returned unchanged.

    Parameters
    ----------
    class_name : str or missing value
        Value taken from the 'Class' column.

    Returns
    -------
    The simplified label. A non-string input (e.g. the float NaN pandas uses
    for an empty cell, or None) is returned unchanged instead of raising
    AttributeError.

    Notes
    -----
    The label 'CrRLK1L-1' does not equal 'CrRLK1L', so it falls through to
    rule 5 and stays 'CrRLK1L-1'. This is kept on purpose: the colour table
    used for the range annotation is keyed on 'CrRLK1L-1'.
    """
    # Missing cells are not strings; .startswith would fail on them.
    if not isinstance(class_name, str):
        return class_name

    # str.startswith accepts a tuple and matches if any prefix fits.
    if class_name.startswith(('LRR-I', 'LRR-V', 'LRR-X', 'LRR_X')):
        return 'LRR-RK'
    if class_name.startswith('LysM'):
        return 'LysM-RK'
    if class_name.startswith('SD'):
        return 'SD-RK'
    if class_name.startswith('WAK'):
        return 'WAK'

    # Exact-match renames. The identity entries are kept so these labels can
    # never be overridden by an entry added to class_mapping later.
    exact_matches = {
        'LRR-RLP': 'LRR-RLP',
        'CrRLK1L': 'CrRLK1L',
        'PERK': 'PERK',
        'DUF26': 'CRK',
        'L-LEC': 'L-LEC-RK',
        'C-LEC': 'C-LEC-RK',
    }
    if class_name in exact_matches:
        return exact_matches[class_name]

    # Finally the 'Etc' bucket; unknown labels pass through untouched.
    return class_mapping.get(class_name, class_name)


# Derive the simplified family for every row and store it beside 'Class'
df['Simple_Class'] = df['Class'].map(simplify_class)

# Save the extended table (now including 'Simple_Class') without the index;
# ',' is to_csv's default separator
df.to_csv('modified_iToL_DB.csv', index=False)

# Show each distinct simplified class once
unique_simple_classes = df['Simple_Class'].unique()
for cls in unique_simple_classes:
    print(cls)


WAK
Etc
CrRLK1L-1
SD-RK
CRK
L-LEC-RK
C-LEC-RK
LRR-RK
LRR-RLP
LysM-RK
PERK


In [8]:
import pandas as pd

# File path to the modified CSV. This file is written by the cell that adds
# the 'Simple_Class' column (df.to_csv('modified_iToL_DB.csv', ...)), so that
# cell has to be run before this one.
csv_file = 'modified_iToL_DB.csv'

# Read the modified CSV into a pandas DataFrame; empty cells are loaded as NaN,
# which is what determine_binary tests for with pd.isna.
df = pd.read_csv(csv_file, sep=',')

# Function to determine Binary value based on Cloning and Batch
def determine_binary(cloning, batch):
    """Encode the cloning/expression status of one row as -1, 0 or 1.

    Parameters
    ----------
    cloning, batch
        Values of the 'Cloning' and 'Batch' cells; a missing cell is anything
        pd.isna reports as missing (NaN, None, ...).

    Returns
    -------
    int
        -1 when 'Cloning' is missing (whatever 'Batch' holds),
         0 when 'Cloning' is present but 'Batch' is missing,
         1 when both are present.
    """
    # Without a cloning entry the batch value is irrelevant.
    if pd.isna(cloning):
        return -1
    return 0 if pd.isna(batch) else 1

# Header of the iTOL binary dataset for the cloning/expression status
output_lines = [
    "DATASET_BINARY",
    "SEPARATOR COMMA",
    "DATASET_LABEL,Cloning/Expression",
    "COLOR,#00441b",
    "FIELD_SHAPES,2",
    "FIELD_LABELS,f1",
    "DATA"
]

# One "<Protein>, <flag>" record per database row, in row order
output_lines.extend(
    f"{record['Protein']}, {determine_binary(record['Cloning'], record['Batch'])}"
    for _, record in df.iterrows()
)

# Write header and records, each terminated by a newline
with open('iTOL_annotationfiles/iTOL_cloning.txt', 'w') as out_file:
    out_file.write('\n'.join(output_lines) + '\n')

print("File 'iTOL_cloning.txt' has been created successfully.")


File 'iTOL_cloning.txt' has been created successfully.


In [7]:
import csv

# Path to input CSV file. It is produced by the cell that adds 'Simple_Class'
# and saves 'modified_iToL_DB.csv', so that cell must be run first.
csv_file = 'modified_iToL_DB.csv'
# Path to output TXT file. The 'iTOL_annotationfiles' directory must already
# exist: open(..., 'w') does not create missing directories.
output_file = 'iTOL_annotationfiles/iTOL_Expression.txt'

# Function to read CSV and extract data
def extract_data(csv_file):
    """Collect "<Protein>, <Expression>" records from the database CSV.

    Parameters
    ----------
    csv_file : str
        Path to a comma-separated, UTF-8 encoded file containing at least the
        'Protein' and 'Expression' columns.

    Returns
    -------
    list of str
        One "<Protein>, <Expression>" string, in file order, for every row
        whose 'Expression' cell is non-empty and is not the literal '#N/A'.

    Raises
    ------
    KeyError
        If a row lacks the 'Protein' or 'Expression' column.
    """
    data = []
    # Decode explicitly as UTF-8 instead of the platform default: pandas writes
    # CSV files as UTF-8 by default, and the labels cell opens its CSV the same
    # way, so a locale-dependent default could mis-read non-ASCII text.
    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter=',')  # the file is comma-separated
        for row in reader:
            protein_id = row['Protein']
            expression = row['Expression']
            # Skip rows with no measurement: empty cell or the '#N/A' marker.
            if expression and expression != '#N/A':
                data.append(f"{protein_id}, {expression}")
    return data

# Function to write data to output file
def write_output(data, output_file):
    """Write an iTOL simple-bar dataset file.

    Parameters
    ----------
    data : iterable
        Records to place after the "DATA" line; each one is written on its
        own line exactly as given.
    output_file : str
        Path of the file to create (overwritten if it exists).

    The file starts with five fixed lines (DATASET_SIMPLEBAR, the comma
    separator declaration, the dataset label 'Expression_Level', the colour
    '#40004b' and DATA), followed by the records.
    """
    header = (
        "DATASET_SIMPLEBAR\n",
        "SEPARATOR COMMA\n",
        "DATASET_LABEL,Expression_Level\n",
        "COLOR,#40004b\n",
        "DATA\n",
    )
    with open(output_file, 'w') as sink:
        sink.writelines(header)
        sink.writelines(f"{entry}\n" for entry in data)

# Extract the "<Protein>, <Expression>" records from the CSV; rows whose
# Expression cell is empty or '#N/A' are left out by extract_data.
data = extract_data(csv_file)

# Write the dataset header followed by the records to the output file
write_output(data, output_file)

# Report how many records were written (skipped rows are not counted)
print(f"iTOL_Expression.txt successfully created with {len(data)} entries.")


iTOL_Expression.txt successfully created with 411 entries.


In [11]:
import pandas as pd

# File path to the modified CSV (saved by the cell that adds 'Simple_Class';
# run that cell first so the file exists).
csv_file = 'modified_iToL_DB.csv'

# Read the modified CSV into a pandas DataFrame; this cell only uses the
# 'Protein' and 'Size' columns.
df = pd.read_csv(csv_file, sep=',')

# Function to determine Binary value based on Size column
def determine_binary(size):
    """Encode one 'Size' cell for the iTOL binary dataset.

    Parameters
    ----------
    size
        Value of the 'Size' cell; may be missing (NaN, None, ...).

    Returns
    -------
    int
        1 when the cell is exactly 'O'; -1 for a missing cell and for every
        other value.
    """
    # Test for a missing value first so it is never compared with 'O'.
    is_marked = (not pd.isna(size)) and size == 'O'
    return 1 if is_marked else -1

# Header of the iTOL binary dataset for the 'Size' column
output_lines = [
    "DATASET_BINARY",
    "SEPARATOR COMMA",
    "DATASET_LABEL, Size",
    "COLOR,#a50026",
    "FIELD_SHAPES,6",
    "FIELD_LABELS,f1",
    "DATA"
]

# One "<Protein>, <flag>" record per database row, in row order
output_lines.extend(
    f"{record['Protein']}, {determine_binary(record['Size'])}"
    for _, record in df.iterrows()
)

# Write header and records, one per line
with open('iTOL_annotationfiles/iTOL_size.txt', 'w') as out_file:
    out_file.writelines(text + '\n' for text in output_lines)

print("File 'iTOL_size.txt' has been created successfully.")


File 'iTOL_size.txt' has been created successfully.


In [15]:
import pandas as pd

# Hex colour for each simplified class; classes missing from this table get no
# range line in the output.
color_mapping = {
    'CrRLK1L-1': '#40004b',
    'WAK': '#762a83',
    'SD-RK': '#9970ab',
    'PERK': '#c2a5cf',
    'L-LEC-RK': '#e7d4e8',
    'Etc': '#d9d9d9',
    'C-LEC-RK': '#d9f0d3',
    'LRR-RK': '#a6dba0',
    'LRR-RLP': '#5aae61',
    'CRK': '#1b7837',
    'LysM-RK': '#00441b',
}

# Database that already carries the 'Simple_Class' column
csv_file = 'modified_iToL_DB.csv'
df = pd.read_csv(csv_file)

# Header of the iTOL colour annotation
output_lines = ["TREE_COLORS", "SEPARATOR COMMA", "DATA"]

# "<Protein>,range,<colour>,<Simple_Class>" for every row with a known class
output_lines.extend(
    f"{record['Protein']},range,{color_mapping[record['Simple_Class']]},{record['Simple_Class']}"
    for _, record in df.iterrows()
    if record['Simple_Class'] in color_mapping
)

# Write header and records, one per line
with open('iTOL_annotationfiles/iTOL-range.txt', 'w') as out_file:
    out_file.write('\n'.join(output_lines) + '\n')

print("File 'iTOL-range.txt' has been created successfully.")


File 'iTOL-range.txt' has been created successfully.


In [15]:
import pandas as pd

# File path to the modified CSV (saved by the cell that adds 'Simple_Class';
# run that cell first so the file exists).
csv_file = 'modified_iToL_DB.csv'

# Read the modified CSV into a pandas DataFrame; this cell only uses the
# 'Protein' and 'Glycan_array' columns.
df = pd.read_csv(csv_file, sep=',')

# Function to determine Binary value based on Glycan_array column
def determine_binary(Glycan_array):
    """Encode one 'Glycan_array' cell for the iTOL binary dataset.

    Parameters
    ----------
    Glycan_array
        Value of the 'Glycan_array' cell.

    Returns
    -------
    int
        1 for 'O', 0 for 'X', and -1 for anything else (including a missing
        cell).
    """
    # Checked in the same order as the codes are listed: 'O' first, then 'X'.
    for symbol, code in (('O', 1), ('X', 0)):
        if Glycan_array == symbol:
            return code
    return -1

# Header of the iTOL binary dataset for the 'Glycan_array' column
output_lines = [
    "DATASET_BINARY",
    "SEPARATOR COMMA",
    "DATASET_LABEL, Glycan_array",
    "COLOR,#40004b",
    "FIELD_SHAPES,2",
    "FIELD_LABELS,f1",
    "DATA"
]

# One "<Protein>, <flag>" record per database row, in row order
output_lines.extend(
    f"{record['Protein']}, {determine_binary(record['Glycan_array'])}"
    for _, record in df.iterrows()
)

# Write header and records, each terminated by a newline
with open('iTOL_annotationfiles/iTOL_Glycan_array.txt', 'w') as out_file:
    out_file.write('\n'.join(output_lines) + '\n')

print("File 'iTOL_Glycan_array.txt' has been created successfully.")


File 'iTOL_Glycan_array.txt' has been created successfully.


In [3]:
import pandas as pd

# Load the database that the annotation cells read
file_path = 'modified_iToL_DB.csv'
df = pd.read_csv(file_path)

# Non-null entries in 'Cloning' and 'Batch', counted in one pass
non_null_counts = df[['Cloning', 'Batch']].notna().sum()
cloning_count = non_null_counts['Cloning']
batch_count = non_null_counts['Batch']

# Rows marked 'O' in 'Size' and in 'Glycan_array'
size_o_count = int((df['Size'] == 'O').sum())
glycan_array_o_count = int((df['Glycan_array'] == 'O').sum())

print(f"The number of rows where 'Cloning' is not null is: {cloning_count}")
print(f"The number of rows where 'Batch' is not null is: {batch_count}")
print(f"The number of rows where 'Size' is 'O' is: {size_o_count}")
print(f"The number of rows where 'Glycan_array' is 'O' is: {glycan_array_o_count}")

The number of rows where 'Cloning' is not null is: 442
The number of rows where 'Batch' is not null is: 409
The number of rows where 'Size' is 'O' is: 340
The number of rows where 'Glycan_array' is 'O' is: 357


In [11]:
import pandas as pd

# File path to the modified CSV (saved by the cell that adds 'Simple_Class';
# run that cell first so the file exists).
csv_file = 'modified_iToL_DB.csv'

# Read the modified CSV into a pandas DataFrame; this cell only uses the
# 'Protein' and 'ARM' columns.
df = pd.read_csv(csv_file, sep=',')

# Function to determine Binary value based on ARM column
def determine_binary(ARM):
    """Encode one 'ARM' cell for the iTOL binary dataset.

    Parameters
    ----------
    ARM
        Value of the 'ARM' cell.

    Returns
    -------
    int
        1 when the cell is exactly 'O'; -1 for 'X', for a missing cell and for
        every other value.

    NOTE(review): 'X' is encoded as -1 here, whereas the Glycan_array cell
    encodes 'X' as 0 - confirm that the difference is intended.
    """
    return 1 if ARM == 'O' else -1

# Header of the iTOL binary dataset for the 'ARM' column
output_lines = [
    "DATASET_BINARY",
    "SEPARATOR COMMA",
    "DATASET_LABEL, ARM",
    "COLOR,#a50026",
    "FIELD_SHAPES,5",
    "FIELD_LABELS,f1",
    "DATA"
]

# One "<Protein>, <flag>" record per database row, in row order
output_lines.extend(
    f"{record['Protein']}, {determine_binary(record['ARM'])}"
    for _, record in df.iterrows()
)

# print() appends the newline that terminates every line of the file
with open('iTOL_annotationfiles/iTOL_ARM.txt', 'w') as out_file:
    for text in output_lines:
        print(text, file=out_file)

print("File 'iTOL_ARM.txt' has been created successfully.")


File 'iTOL_ARM.txt' has been created successfully.
