In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the competition training table; the bare `df.columns` expression makes
# the notebook display the column labels.
df = pd.read_csv('/kaggle/input/nexus-by-djs-nsdc-ultraceuticals/train.csv')
df.columns

Index(['ID', 'TargetID', 'DRUGID', 'DRUGTYPE', 'Drug_high_status', 'DRUGNAME',
       'PUBCHCID', 'Disease_of_highest_status', 'Drug_Status', 'UNIPROID',
       'TARGNAME', 'GENENAME', 'SYNONYMS', 'FUNCTION', 'BIOCLASS', 'SEQUENCE',
       'Disease', 'Accession Number', 'Target_Status'],
      dtype='object')

In [3]:
# Text columns that get label-encoded into integer codes before modelling.
columns = [
    'DRUGTYPE',
    'Drug_high_status',
    'DRUGNAME',
    'Disease_of_highest_status',
    'Drug_Status',
    'TARGNAME',
    'GENENAME',
    'SYNONYMS',
    'FUNCTION',
    'BIOCLASS',
    'Disease',
]

# Name of the label column the classifier is trained to predict.
target = 'Target_Status'

In [4]:
from sklearn.preprocessing import LabelEncoder
# `le` is shared by every feature column; `le_y` is kept separate for the
# target so predictions can be decoded later with `le_y.inverse_transform`.
le = LabelEncoder()
le_y = LabelEncoder()
for col in columns:
    # NOTE(review): every fit_transform call refits `le`, so once the loop ends
    # it only holds the mapping of the last entry in `columns`; the mappings of
    # the earlier columns are not retained anywhere.
    df[col] = le.fit_transform(df[col])
# Replaces the target strings in `df` with integer class ids.
df[target] = le_y.fit_transform(df[target])

In [5]:
# Remove the ID/key columns; `columns=` already selects the axis, so no
# separate `axis` argument is needed.
df = df.drop(columns=['ID', 'TargetID', 'DRUGID', 'PUBCHCID', 'UNIPROID'])

In [6]:
from sklearn.feature_selection import SelectFromModel
!pip install biopython propy3 numpy pandas



In [7]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

def calculate_amino_acid_composition(sequence):
    """Return the dict produced by ``ProteinAnalysis(sequence).get_amino_acids_percent()``."""
    return ProteinAnalysis(sequence).get_amino_acids_percent()

# Molecular weight
def calculate_molecular_weight(sequence):
    """Return ``ProteinAnalysis(sequence).molecular_weight()`` for the given sequence."""
    return ProteinAnalysis(sequence).molecular_weight()

def calculate_hydrophobicity(sequence):
    """Return the GRAVY value (``ProteinAnalysis.gravy()``) of the given sequence."""
    return ProteinAnalysis(sequence).gravy()

def calculate_secondary_structure(sequence):
    """Return ``ProteinAnalysis(sequence).secondary_structure_fraction()``.

    The caller in this notebook unpacks the result as ``helix, turn, sheet``.
    """
    return ProteinAnalysis(sequence).secondary_structure_fraction()

def extract_protein_features(sequence):
    """Build a flat dict of sequence-derived features for one protein.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence passed straight to ``ProteinAnalysis``.

    Returns
    -------
    dict
        ``aa_<residue>_composition`` for every entry returned by
        ``get_amino_acids_percent()``, followed by ``molecular_weight``,
        ``hydrophobicity`` (GRAVY), ``helix_fraction``, ``turn_fraction`` and
        ``sheet_fraction``.

    Notes
    -----
    The ProteinAnalysis methods are called directly on a single object instead
    of going through the module-level helper functions. The name
    ``calculate_amino_acid_composition`` is defined twice in this notebook, and
    a lookup by name at call time would pick whichever definition ran last, so
    the train and test feature tables could be produced by different
    implementations. Building one analysis object also avoids re-parsing the
    sequence for every feature.
    """
    analysis = ProteinAnalysis(sequence)

    features = {
        f"aa_{aa}_composition": freq
        for aa, freq in analysis.get_amino_acids_percent().items()
    }

    features['molecular_weight'] = analysis.molecular_weight()
    features['hydrophobicity'] = analysis.gravy()

    helix, turn, sheet = analysis.secondary_structure_fraction()
    features['helix_fraction'] = helix
    features['turn_fraction'] = turn
    features['sheet_fraction'] = sheet

    return features

# One feature dict per row, expanded to one column per feature and placed next
# to the already-encoded training columns.
df_features = pd.json_normalize(df['SEQUENCE'].apply(extract_protein_features))
df_combined = pd.concat([df, df_features], axis=1)


In [8]:
from sklearn.preprocessing import OneHotEncoder
from collections import Counter

def calculate_amino_acid_composition(sequence):
    """Return ``{symbol: count / len(sequence)}`` for each symbol present in ``sequence``.

    Symbols that do not occur get no key, and an empty sequence yields ``{}``.

    NOTE(review): this rebinds a name that an earlier cell of this notebook
    defines with a ProteinAnalysis-based implementation; anything that looks
    the name up at call time gets whichever definition was executed last.
    """
    total = len(sequence)
    return {symbol: hits / total for symbol, hits in Counter(sequence).items()}

def create_peptide_dataframe(peptide_sequences, n_positions=5):
    """Encode peptide sequences as one row of numeric features each.

    Parameters
    ----------
    peptide_sequences : iterable of str
        Sequences to encode.
    n_positions : int, default 5
        Number of leading positions that are one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``Sequence``; ``<aa><position>`` one-hot flags for
        the first ``n_positions`` residues over the 20 standard amino acids;
        one fraction column per standard amino acid (plus one per any other
        symbol that occurs); ``Length``.

    Notes
    -----
    The column set is the same for every input made of standard residues:
    positions past the end of a short sequence are encoded as all zeros and
    residues that do not occur get a fraction of 0.0, rather than leaving the
    keys out and letting NaN / missing columns appear depending on the data.
    The composition is computed here directly so the result does not depend on
    which of the notebook's two ``calculate_amino_acid_composition``
    definitions is currently bound.
    """
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"

    rows = []
    for sequence in peptide_sequences:
        length = len(sequence)
        row = {"Sequence": sequence}

        # Positional one-hot block; `residue` is None beyond the sequence end,
        # which makes every flag for that position 0.
        for pos in range(n_positions):
            residue = sequence[pos] if pos < length else None
            for aa in amino_acids:
                row[f"{aa}{pos + 1}"] = int(residue == aa)

        # Fractions: start every standard residue at 0.0, then overwrite with
        # observed values. Iterating the counter (not the alphabet) means an
        # empty sequence never divides by zero.
        row.update(dict.fromkeys(amino_acids, 0.0))
        row.update({aa: hits / length for aa, hits in Counter(sequence).items()})

        row["Length"] = length
        rows.append(row)

    return pd.DataFrame(rows)

# Pass the sequences through as they are. Multiplying a Series of strings by
# 10 is element-wise string repetition: every sequence would be concatenated
# with itself ten times, inflating "Length" tenfold and the encoding time with
# it, without adding any rows.
peptide_list = df["SEQUENCE"]
df_encoded = create_peptide_dataframe(peptide_list).drop(columns=['Sequence'])

In [9]:
# Keep only model inputs: SEQUENCE has been turned into derived feature
# columns, and Target_Status is the label, so it must not stay among the features.
df_combined = df_combined.drop(
    columns=['SEQUENCE', 'Accession Number', 'Target_Status']
)

In [10]:
# Feature matrix: the combined table (label column already dropped from it).
X = df_combined
# Label vector: taken from `df`, which still holds the encoded Target_Status.
y = df['Target_Status']

In [11]:
from sklearn.model_selection import train_test_split,RandomizedSearchCV
# Hold out 20% of the rows for evaluation; the split is stratified on `y` and
# uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify =  y)

In [12]:
# Display the first rows of the training features as a sanity check.
X_train.head()

Unnamed: 0,DRUGTYPE,Drug_high_status,DRUGNAME,Disease_of_highest_status,Drug_Status,TARGNAME,GENENAME,SYNONYMS,FUNCTION,BIOCLASS,...,aa_S_composition,aa_T_composition,aa_V_composition,aa_W_composition,aa_Y_composition,molecular_weight,hydrophobicity,helix_fraction,turn_fraction,sheet_fraction
1403,4,0,2361,164,1,389,386,165,114,19,...,0.038462,0.067308,0.076923,0.021154,0.044231,58762.2063,-0.235769,0.336538,0.246154,0.390385
132333,4,14,763,441,15,440,475,381,473,45,...,0.070175,0.081871,0.076023,0.011696,0.035088,39438.3097,0.35117,0.260234,0.239766,0.502924
71179,4,0,1355,270,1,280,271,104,482,45,...,0.079235,0.060109,0.101093,0.027322,0.027322,41328.012,0.469126,0.303279,0.243169,0.478142
46377,4,17,87,592,17,320,351,126,57,58,...,0.075977,0.049928,0.062952,0.014472,0.036903,156331.1466,-0.359334,0.293054,0.304631,0.353111
11059,4,14,154,182,15,684,363,638,127,101,...,0.08887,0.050906,0.057808,0.010354,0.024159,126653.4853,-0.172821,0.291631,0.329594,0.339948


In [13]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier as rfc

In [14]:
# Fit an XGBoost classifier on the training split; no hyperparameters are
# passed, so the library defaults are used.
XGB_model = XGBClassifier()
XGB_model.fit(X_train, y_train)

In [15]:
# Predict encoded class ids for the held-out split and print them.
y_predictions = XGB_model.predict(X_test)
print(y_predictions)

[ 1 26 26 ...  1  1  1]


In [16]:
from sklearn.metrics import f1_score
# Weighted-average F1 of the hold-out predictions against the true labels;
# as the last expression of the cell, the score is displayed.
f1_score(y_test, y_predictions, average='weighted')

0.9670023853834782

In [19]:
df_test = pd.read_csv('/kaggle/input/nexus-by-djs-nsdc-ultraceuticals/test.csv')
# Keep the row ids aside for the submission file before they are dropped.
df_ID = df_test[['ID']]
df_test = df_test.drop(columns = ['ID', 'TargetID', 'DRUGID','PUBCHCID','UNIPROID'], axis = 1)

# The model was trained on integer codes fitted on the *training* columns, so
# the test columns have to be encoded with those same label -> code mappings.
# Calling fit_transform on the test data would renumber the categories from
# the test vocabulary, and a given code would no longer stand for the category
# the model saw during training.
# The mappings are rebuilt from the raw training file because `df` only holds
# the already-encoded integers at this point.
train_categories = pd.read_csv(
    '/kaggle/input/nexus-by-djs-nsdc-ultraceuticals/train.csv', usecols=columns
)
UNSEEN_CODE = -1  # assigned to labels that never occurred in the training data
for col in columns:
    train_encoder = LabelEncoder().fit(train_categories[col])
    code_by_label = {label: code for code, label in enumerate(train_encoder.classes_)}
    # Labels missing from the mapping come back as NaN from .map().
    df_test[col] = df_test[col].map(code_by_label).fillna(UNSEEN_CODE).astype(int)

In [20]:
# Same sequence-derived features as for the training table: one dict per row,
# expanded to columns and placed next to the encoded test columns.
df_test_features = pd.json_normalize(df_test['SEQUENCE'].apply(extract_protein_features))
df_test_combined = pd.concat([df_test, df_test_features], axis=1)

In [23]:
# Pass the sequences through as they are. Multiplying a Series of strings by
# 10 is element-wise string repetition: every sequence would be concatenated
# with itself ten times, inflating "Length" tenfold and the encoding time with
# it, without adding any rows.
peptide_list = df_test["SEQUENCE"]
df_test_encoded = create_peptide_dataframe(peptide_list).drop(columns=['Sequence'])

In [24]:
# Drop the raw text columns; SEQUENCE has already been turned into derived
# feature columns.
df_test_combined = df_test_combined.drop(
    columns=['SEQUENCE', 'Accession Number']
)

In [25]:
# Append the peptide-encoding columns side by side (axis=1, aligned on index).
df_test_combined = pd.concat([df_test_combined, df_test_encoded], axis = 1)

In [27]:
# Restrict and reorder the test columns to exactly the feature names stored in
# the trained booster; any other column in df_test_combined is discarded here.
training_features = XGB_model.get_booster().feature_names 
df_test_combined = df_test_combined[training_features]

In [30]:
# Predict encoded class ids for the competition test rows and print them.
y_test_predictions = XGB_model.predict(df_test_combined)
print(y_test_predictions)

[ 1 26 26 ...  1 17  1]


In [31]:
# Map the encoded class ids back to the original Target_Status strings with
# the encoder that was fitted on the training target.
# NOTE(review): the result is bound to the same name, so re-running only this
# cell would feed already-decoded strings to inverse_transform.
y_test_predictions = le_y.inverse_transform(y_test_predictions)
print(y_test_predictions)

['Approved' 'Phase 3' 'Phase 3' ... 'Approved' 'Phase 1' 'Approved']


In [32]:
# Submission table: one row per test ID with its decoded prediction.
output = pd.DataFrame({
    'ID': df_ID['ID'],
    'Prediction': y_test_predictions,
})

In [None]:
# Write the submission file to the working directory without the index column.
output.to_csv('Nexus_submission_9.csv', index=False)