In [1]:
import pandas as pd
import numpy as np

# Main dataset
train_x = pd.read_csv('train_x.csv')

# Additional scale datasets. Every file is read with its first two lines skipped;
# the files are listed in the same order as the names they are bound to.
st_scale, t_scale, z_scale, dpps, whim, phys, vhse = (
    pd.read_csv(scale_path, skiprows=2)
    for scale_path in (
        'ST-scale.csv',
        'T-scale.csv',
        'Z-scale.csv',
        "DPPS.csv",
        "MS-WHIM.csv",
        "Physical.csv",
        "VHSE-scale.csv",
    )
)

In [2]:
# Convert every scale table into a nested lookup keyed by its 'AA_1' column:
# {AA_1 value: {column name: cell value}}.
st_dict, t_dict, z_dict, dpps_dict, whim_dict, phys_dict, vhse_dict = (
    scale_table.set_index('AA_1').to_dict('index')
    for scale_table in (st_scale, t_scale, z_scale, dpps, whim, phys, vhse)
)

In [3]:
def enrich_sequence(sequence):
    """Pair every element of ``sequence`` with its descriptor values.

    For each element the module-level lookup tables are consulted in a fixed
    order (ST, T, Z, DPPS, VHSE, MS-WHIM, Physical). A table that has no entry
    for the element is skipped, so the resulting dict may be partial or empty.

    Args:
        sequence: Iterable of lookup keys (for a string, its characters).

    Returns:
        A list of ``(element, properties)`` tuples in input order, where
        ``properties`` maps the output names ``ST1``..``ST8``, ``T1``..``T5``,
        ``Z1``..``Z3``, ``D1``..``D10``, ``VHSE1``..``VHSE8``,
        ``WHIM1``..``WHIM3``, ``Vol`` and ``Hydro`` to the table values.

    Raises:
        KeyError: If a table has an entry for an element but that entry lacks
            one of the expected columns.
    """
    # Each spec: (lookup table, [(output name, column name in that table), ...]).
    # The Z table stores its columns as 'Z(1)'.. and the MS-WHIM table as '1'..,
    # so output and column names differ for those two.
    table_specs = (
        (st_dict, [(f'ST{n}', f'ST{n}') for n in range(1, 9)]),
        (t_dict, [(f'T{n}', f'T{n}') for n in range(1, 6)]),
        (z_dict, [(f'Z{n}', f'Z({n})') for n in range(1, 4)]),
        (dpps_dict, [(f'D{n}', f'D{n}') for n in range(1, 11)]),
        (vhse_dict, [(f'VHSE{n}', f'VHSE{n}') for n in range(1, 9)]),
        (whim_dict, [(f'WHIM{n}', f'{n}') for n in range(1, 4)]),
        (phys_dict, [(f'{name}', f'{name}') for name in ["Vol", "Hydro"]]),
    )

    result = []
    for residue in sequence:
        collected = {}
        for table, name_pairs in table_specs:
            if residue not in table:
                continue
            row = table[residue]
            collected.update({out_name: row[column] for out_name, column in name_pairs})
        result.append((residue, collected))
    return result

In [4]:
# Run enrich_sequence on every value of the 'ConstructedAASeq_cln' column and
# store the resulting list of (element, properties) tuples next to it.
raw_sequences = train_x['ConstructedAASeq_cln']
train_x['EnrichedSequence'] = raw_sequences.apply(enrich_sequence)

In [None]:
# Property names to aggregate, in the order their columns are appended.
all_properties = (
    [f'ST{i}' for i in range(1, 9)]
    + [f'T{i}' for i in range(1, 6)]
    + [f'Z{i}' for i in range(1, 4)]
    + [f'D{i}' for i in range(1, 11)]
    + [f'VHSE{i}' for i in range(1, 9)]
    + [f'WHIM{i}' for i in range(1, 4)]
    + ['Vol', 'Hydro']
)


def _sequence_stats(seq, prop):
    """Return ``(mean, std)`` of ``prop`` over one enriched sequence.

    ``seq`` is a list of ``(element, properties)`` tuples; an element whose
    properties lack ``prop`` contributes 0. The value list is built once and
    shared by both statistics.
    """
    values = [float(props.get(prop, 0)) for _, props in seq]
    return np.mean(values), np.std(values)


for prop in all_properties:
    # One pass over the enriched sequences per property instead of two
    # (the value lists used to be rebuilt for the mean and again for the std).
    stats = [_sequence_stats(seq, prop) for seq in train_x['EnrichedSequence']]
    train_x[f'Mean_{prop}'] = [mean for mean, _ in stats]
    train_x[f'Std_{prop}'] = [std for _, std in stats]
    print(prop)  # progress indicator: this loop is slow on large inputs

# Remove the temporary 'EnrichedSequence' column
train_x = train_x.drop('EnrichedSequence', axis=1)

ST1
ST2
ST3
ST4
ST5
ST6
ST7
ST8
T1
T2
T3
T4
T5
Z1
Z2
Z3
D1
D2
D3
D4
D5
D6
D7
D8
D9
D10
VHSE1


In [None]:
# Read train_y.csv (its Brightness_Class column is used as the target further down)
# and preview the first five rows.
train_y = pd.read_csv("train_y.csv")
train_y.head(n=5)

In [None]:
# Keep only the numeric feature columns: discard 'Unnamed: 0', the raw sequence
# column and the Id column in one call.
train_x = train_x.drop(columns=['Unnamed: 0', 'ConstructedAASeq_cln', 'Id'])

# Target labels come from the separately loaded train_y table.
Y = train_y['Brightness_Class']

train_x.info()

# Save the enriched dataset
train_x.to_csv('enriched_train_x.csv', index=False)
print("Dataset enrichment complete. New file saved as 'enriched_train_x.csv'")

In [None]:
# Preview the first five rows of the enriched feature table.
train_x.head(n=5)

In [None]:
# Display the MS-WHIM lookup dict (built from MS-WHIM.csv) to inspect its keys.
whim_dict

In [None]:
# For every column whose number of distinct values differs from the number of
# rows, record that distinct-value count (in column order).
only1 = [
    n_distinct
    for n_distinct in (train_x[column].nunique() for column in train_x.columns)
    if n_distinct != train_x.shape[0]
]

In [None]:
# Report every column that contains missing values, together with how many.
for column in train_x.columns:
    n_missing = train_x[column].isna().sum()
    if n_missing:
        print(column, n_missing)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# Instantiate the scaler in this cell so it does not depend on an `s` left over
# from a cell further down (which raises NameError on a fresh kernel).
s = StandardScaler()
# Standardise every feature column of the enriched training table.
# NOTE(review): the scaler is fitted on all rows before the split, so the held-out
# rows influence the scaling statistics — confirm whether that is acceptable.
trainn_x = s.fit_transform(train_x)
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(trainn_x, Y, test_size=0.3, random_state=42)

In [None]:
# Fit a logistic regression with default settings on the training split
# (fit returns the estimator itself, so construction and fitting are chained).
logr = LogisticRegression().fit(X_train, y_train)
# Predicted classes for the held-out split.
predict = logr.predict(X_test)
# Score on the held-out split, shown as the cell output.
logr.score(X_test, y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid explored exhaustively by the search below.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# Seed the tree so repeated runs produce the same fitted trees, scores and
# selected parameters (same seed as the train/test split).
decision_tree = DecisionTreeClassifier(random_state=42)
# Instantiate GridSearchCV: 5-fold cross-validation, using all available cores
grid_search = GridSearchCV(estimator=decision_tree, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Get the best score
best_score = grid_search.best_score_
print("Best cross-validation score:", best_score)

# Use the best model to make predictions on the test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test set score:", test_score)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Compute the correlation matrix of the columns at positions 1..15
corr = train_x.iloc[:, 1:16].corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(130, 10, as_cmap=True)

# Draw the annotated heatmap on the axes created above, reusing the matrix and
# the colormap computed in this cell (they were built but never passed before).
sns.heatmap(corr, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True, ax=ax)

In [None]:
import pickle

# Serialise the best estimator found by the grid search to model.pkl
with open('model.pkl','wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# NOTE(review): `test_x` is not created in any visible cell — presumably it was loaded
# and enriched the same way as `train_x` in a cell that is no longer part of the
# notebook; confirm, otherwise this fails on Restart & Run All.
# Write the test table to disk and print its column summary.
test_x.to_csv("Enriched_test_x.csv", index = False)
test_x.info()
# Keep the Id column; it is used later to build the submission file.
# NOTE(review): the name `id` shadows the Python builtin of the same name.
id = test_x.Id

In [None]:
# Remove the raw sequence and the Id column from the test table in one call.
test_x = test_x.drop(columns=['ConstructedAASeq_cln', 'Id'])

In [None]:
# Scale the test features with the statistics of the training features.
# Fitting the scaler on the test set itself (fit_transform) would centre and scale
# it with different means/variances than the data the models were trained on.
# NOTE(review): assumes `test_x` has the same columns, in the same order, as `train_x`.
s = StandardScaler().fit(train_x)
test_x = s.transform(test_x)
# Logistic-regression predictions for the test set, shown as the cell output.
logr.predict(test_x)

In [None]:
# Predict test-set classes with the best estimator from the grid search;
# these predictions go into the submission file.
y_pred = best_model.predict(test_x)

In [None]:
# Columns of the submission table: the saved test Ids and their predicted class.
a = dict(Id=id, Brightness_Class=y_pred)

In [None]:
# Assemble the submission table from the column mapping.
df = pd.DataFrame(data=a)

In [None]:
# Write the submission file without the DataFrame index.
df.to_csv(path_or_buf="Submission.csv", index=False)