In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
# Folder that holds every input CSV. Change this one constant to run the
# notebook on another machine instead of editing each read below.
DATA_DIR = Path("C:/Users/Anorm/Downloads")

# Competition data. Only train_X is read with its first CSV column as the index.
train_X = pd.read_csv(DATA_DIR / "train_X.csv", index_col=0)
train_y = pd.read_csv(DATA_DIR / "train_y (1).csv")
test_X = pd.read_csv(DATA_DIR / "test_X.csv")

# Amino-acid descriptor tables, one CSV per descriptor set.
ms_whim = pd.read_csv(DATA_DIR / "MS-WHIM.csv")
dpps = pd.read_csv(DATA_DIR / "DPPS.csv")
phys = pd.read_csv(DATA_DIR / "Physical.csv")
st_scale = pd.read_csv(DATA_DIR / "ST-scale.csv")
t_scale = pd.read_csv(DATA_DIR / "T-scale.csv")
vhse_scale = pd.read_csv(DATA_DIR / "VHSE-scale.csv")
z_scale = pd.read_csv(DATA_DIR / "Z-scale.csv")

In [3]:
# test_X is split by position: X_test keeps everything except the first
# column, test_Xs keeps everything except the second. Later cells read
# X_test['Id'] and test_Xs['ConstructedAASeq_cln'].
X_test = test_X.drop(columns=[test_X.columns[0]])
test_Xs = test_X.drop(columns=[test_X.columns[1]])

# Model inputs are the first column of train_X; labels are the second column
# of train_y.
X_train = train_X.iloc[:, 0]
y = train_y.iloc[:, 1]

# Note: this reports the shape of the full train_X frame, not of X_train.
print(train_X.shape, y.shape, X_test.shape)

(31029, 2) (31029,) (20686, 1)


In [4]:
def create_aa_encoder():
    """Return a dense OneHotEncoder fitted on the letters 'ACDEFGHIKLMNPQRSTVWY'.

    Each letter is presented to the encoder as its own single-feature sample,
    so the fitted encoder has one category per letter.
    """
    alphabet = np.array(list('ACDEFGHIKLMNPQRSTVWY')).reshape(-1, 1)
    return OneHotEncoder(sparse_output=False).fit(alphabet)


# Shared encoder instance used by encode_sequence.
aa_encoder = create_aa_encoder()

def encode_sequence(sequence, max_length=None):
    """One-hot encode a sequence into a flat vector using the shared encoder.

    Parameters
    ----------
    sequence : str
        Sequence whose characters are passed one by one to ``aa_encoder``.
    max_length : int, optional
        When given (and non-zero), the sequence is truncated to this many
        characters and the result is zero-padded on the right, so every
        sequence yields ``max_length * n_categories`` values. Without the
        padding, shorter sequences give shorter vectors and cannot be stacked
        into one 2-D matrix.

    Returns
    -------
    numpy.ndarray
        1-D array of the per-character one-hot rows laid end to end.
    """
    if max_length:
        sequence = sequence[:max_length]

    if len(sequence) > 0:
        residues = np.array(list(sequence)).reshape(-1, 1)
        encoded = aa_encoder.transform(residues).flatten()
    else:
        # The encoder rejects an input with zero rows; an empty sequence is
        # simply an empty vector (padded below when max_length is set).
        encoded = np.zeros(0)

    if max_length:
        target_size = max_length * len(aa_encoder.categories_[0])
        # target_size >= encoded.size because of the truncation above.
        encoded = np.pad(encoded, (0, target_size - encoded.size))
    return encoded

In [5]:
# Length of the longest training sequence; it is passed to encode_sequence as
# the truncation length for every sequence.
max_length = X_train.str.len().max()
print(f"Maximum sequence length: {max_length}")

# One-hot encode each training sequence into one row of the feature matrix.
X_train_encoded = np.array(
    [encode_sequence(sequence, max_length) for sequence in X_train]
)
print("Shape of X_train_encoded:", X_train_encoded.shape)

Maximum sequence length: 237
Shape of X_train_encoded: (31029, 4740)


In [6]:
# Encode the test sequences with the same max_length that was derived from the
# training sequences, so both matrices are built with the same settings.
X_test_encoded = np.array([encode_sequence(seq, max_length) for seq in test_Xs['ConstructedAASeq_cln']])

# The label now names the variable that is actually printed.
print("Shape of X_test_encoded:", X_test_encoded.shape)

Shape of X_test_combined: (20686, 4740)


In [7]:
# Hold out 20% of the one-hot encoded training rows for validation; the fixed
# seed keeps the split identical between runs.
X_train_final, X_val, y_train, y_val = train_test_split(X_train_encoded, y, test_size=0.2, random_state=42)

# Train the model. n_jobs=-1 builds the trees on all available cores (the
# single-threaded fit on this wide matrix was slow enough to be interrupted);
# random_state still fixes the seed used to grow the forest.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train_final, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

# Make predictions on the test set
test_predictions = rf_classifier.predict(X_test_encoded)

# Create a DataFrame with the predictions
results = pd.DataFrame({
    'Id': test_X['Id'],
    'Brightness': test_predictions
})

# Save the results to a CSV file (overwrites any earlier predictions.csv)
results.to_csv('predictions.csv', index=False)

KeyboardInterrupt: 

In [None]:
# Grid search over an L1-penalised logistic regression on the one-hot features.
# GridSearchCV is imported with the other imports at the top of the notebook.
param_grid = {
    'solver': ['liblinear'],
    'penalty': ['l1'],
    'C': [.001, 0.01, 0.1, 1, 10, 100],
    'max_iter': [100, 500, 1000, 10000],
}

# Seed the estimator like every other model in this notebook so repeated
# searches give the same ranking.
L1_Logistic = LogisticRegression(random_state=42)

# 5-fold cross-validation, candidates ranked by macro-averaged F1.
grid_search_l1 = GridSearchCV(L1_Logistic, param_grid, cv=5, scoring='f1_macro')
grid_search_l1.fit(X_train_encoded, y)


print('Best Optimal hyperparameter combination:', grid_search_l1.best_params_, 'F1 score:', round(grid_search_l1.best_score_, 4))
best_l1_model = grid_search_l1.best_estimator_

In [47]:
# Hold out 20% of the one-hot encoded training rows for validation (fixed seed).
X_train_final, X_val, y_train, y_val = train_test_split(
    X_train_encoded, y, test_size=0.2, random_state=42
)

# Fit a logistic-regression classifier on the training portion.
print("Training the Logistic Regression model...")
log_reg = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_final, y_train)

# Score the held-out rows: overall accuracy, then the per-class report.
y_val_pred = log_reg.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Training the Logistic Regression model...
Validation Accuracy: 0.8919

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.89      0.91      3777
           1       0.84      0.89      0.87      2429

    accuracy                           0.89      6206
   macro avg       0.88      0.89      0.89      6206
weighted avg       0.89      0.89      0.89      6206



In [49]:
# Predict labels for the one-hot encoded test matrix.
test_predictions = log_reg.predict(X_test_encoded)

# Pair each test Id with its predicted label.
results = test_X[['Id']].assign(Brightness=test_predictions)

# Write the two-column table without the index (overwrites predictions.csv).
results.to_csv('predictions.csv', index=False)

In [8]:
def clean_descriptor_df(df):
    """Turn a raw descriptor table into a numeric frame indexed by its second column.

    The first two rows are dropped, the second column becomes the index, the
    first column is discarded, and every remaining column is converted to
    numbers; values that cannot be parsed become NaN. The input frame is not
    modified.
    """
    body = df.iloc[2:].reset_index(drop=True)
    discard_col, key_col = body.columns[0], body.columns[1]
    indexed = body.set_index(key_col).drop(columns=discard_col)
    return indexed.apply(pd.to_numeric, errors='coerce')


In [21]:
# Select the cleaned Z-scale table as the global `descriptor` read by the
# encoding cells that follow.
descriptor = clean_descriptor_df(z_scale)

def sequence_to_features(sequence, descriptor):
    """Concatenate per-residue descriptor values into one flat feature vector.

    Parameters
    ----------
    sequence : str
        Sequence whose characters are looked up in ``descriptor.index``.
    descriptor : pandas.DataFrame
        One row per residue label, one column per descriptor value.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(sequence) * len(descriptor.columns)``;
        characters missing from the index contribute zeros. An empty sequence
        gives an empty array.
    """
    values = descriptor.to_numpy()
    # Build the residue -> row table once per call instead of paying for a
    # pandas ``.loc`` lookup on every character of the sequence.
    lookup = dict(zip(descriptor.index, values))
    unknown = np.zeros(values.shape[1], dtype=values.dtype)
    rows = [lookup.get(aa, unknown) for aa in sequence]
    if not rows:
        return np.array([])
    return np.concatenate(rows)

In [22]:
# Turn every training sequence into its descriptor vector; tqdm shows progress.
print("Encoding training data...")
X_train_features = np.array(
    [sequence_to_features(sequence, descriptor) for sequence in tqdm(X_train)]
)

Encoding training data...


100%|██████████| 31029/31029 [02:04<00:00, 249.14it/s]


In [23]:
# Same encoding for the test sequences in the 'ConstructedAASeq_cln' column.
print("Encoding test data...")
X_test_features = np.array(
    [sequence_to_features(sequence, descriptor)
     for sequence in tqdm(test_Xs['ConstructedAASeq_cln'])]
)

Encoding test data...


100%|██████████| 20686/20686 [01:29<00:00, 231.15it/s]


In [24]:
# Standardise the features: statistics are learned from the training matrix
# and the same transformation is then applied to the test matrix.
# NOTE(review): the scaler is fitted on all training rows, including the ones
# the next cell holds out for validation.
scaler = StandardScaler().fit(X_train_features)
X_train_scaled = scaler.transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

In [25]:
# Hold out 20% of the scaled training rows for validation; the fixed seed
# keeps the split identical between runs.
X_train_final, X_val, y_train, y_val = train_test_split(X_train_scaled, y, test_size=0.2, random_state=42)

# Train the model. n_jobs=-1 builds the 100 trees on all available cores;
# random_state still fixes the seed used to grow the forest.
print("Training the model...")
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train_final, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Training the model...
Validation Accuracy: 0.8493393490170802

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3777
           1       0.77      0.87      0.82      2429

    accuracy                           0.85      6206
   macro avg       0.84      0.85      0.85      6206
weighted avg       0.86      0.85      0.85      6206



In [26]:
# Make predictions on the test set
print("Making predictions on test data...")
test_predictions = rf_classifier.predict(X_test_scaled)

# Build the submission directly from the test ids and the predictions, the
# same way the other prediction cells do. X_test is no longer given a
# 'Brightness' column, so re-running this cell never alters a frame that other
# cells read.
results = pd.DataFrame({
    'Id': test_X['Id'],
    'Brightness': test_predictions
})

# Save the results to a CSV file (overwrites any earlier predictions.csv)
results.to_csv('predictions.csv', index=False)
print("Predictions saved to 'predictions.csv'")

Making predictions on test data...
Predictions saved to 'predictions.csv'


In [None]:
# DPPS     - good - 08
# MS-WHIM  - good - 98
# Physical - 2018
# ST-scale - 2009 - competent
# T-scale  - 2007 - ok
# VHSE     - 2005 - ok
# Z-scale  - 1987



# TODO: combine the strongest descriptors from each set and use those.

In [9]:
# Select the cleaned MS-WHIM table as the global `descriptor` read by the
# encoding cells that follow.
descriptor = clean_descriptor_df(ms_whim)

def sequence_to_features(sequence, descriptor):
    """Concatenate per-residue descriptor values into one flat feature vector.

    Parameters
    ----------
    sequence : str
        Sequence whose characters are looked up in ``descriptor.index``.
    descriptor : pandas.DataFrame
        One row per residue label, one column per descriptor value.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(sequence) * len(descriptor.columns)``;
        characters missing from the index contribute zeros. An empty sequence
        gives an empty array.
    """
    values = descriptor.to_numpy()
    # Build the residue -> row table once per call instead of paying for a
    # pandas ``.loc`` lookup on every character of the sequence.
    lookup = dict(zip(descriptor.index, values))
    unknown = np.zeros(values.shape[1], dtype=values.dtype)
    rows = [lookup.get(aa, unknown) for aa in sequence]
    if not rows:
        return np.array([])
    return np.concatenate(rows)

In [10]:
# Turn every training sequence into its descriptor vector; tqdm shows progress.
print("Encoding training data...")
X_train_features = np.array(
    [sequence_to_features(sequence, descriptor) for sequence in tqdm(X_train)]
)

Encoding training data...


100%|██████████| 31029/31029 [03:04<00:00, 168.20it/s]


In [11]:
# Same encoding for the test sequences in the 'ConstructedAASeq_cln' column.
print("Encoding test data...")
X_test_features = np.array(
    [sequence_to_features(sequence, descriptor)
     for sequence in tqdm(test_Xs['ConstructedAASeq_cln'])]
)

Encoding test data...


100%|██████████| 20686/20686 [02:00<00:00, 172.18it/s]


In [12]:
# Standardise the features: statistics are learned from the training matrix
# and the same transformation is then applied to the test matrix.
# NOTE(review): the scaler is fitted on all training rows, including the ones
# the next cell holds out for validation.
scaler = StandardScaler().fit(X_train_features)
X_train_scaled = scaler.transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

In [13]:
# Hold out 20% of the scaled training rows for validation; the fixed seed
# keeps the split identical between runs.
X_train_final, X_val, y_train, y_val = train_test_split(X_train_scaled, y, test_size=0.2, random_state=42)

# Train the model. n_jobs=-1 builds the 100 trees on all available cores;
# random_state still fixes the seed used to grow the forest.
print("Training the model...")
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train_final, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Training the model...
Validation Accuracy: 0.8470834676119884

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3777
           1       0.77      0.87      0.82      2429

    accuracy                           0.85      6206
   macro avg       0.84      0.85      0.84      6206
weighted avg       0.85      0.85      0.85      6206



In [14]:
# Select the cleaned DPPS table as the global `descriptor` read by the
# encoding cells that follow.
descriptor = clean_descriptor_df(dpps)

def sequence_to_features(sequence, descriptor):
    """Concatenate per-residue descriptor values into one flat feature vector.

    Parameters
    ----------
    sequence : str
        Sequence whose characters are looked up in ``descriptor.index``.
    descriptor : pandas.DataFrame
        One row per residue label, one column per descriptor value.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(sequence) * len(descriptor.columns)``;
        characters missing from the index contribute zeros. An empty sequence
        gives an empty array.
    """
    values = descriptor.to_numpy()
    # Build the residue -> row table once per call instead of paying for a
    # pandas ``.loc`` lookup on every character of the sequence.
    lookup = dict(zip(descriptor.index, values))
    unknown = np.zeros(values.shape[1], dtype=values.dtype)
    rows = [lookup.get(aa, unknown) for aa in sequence]
    if not rows:
        return np.array([])
    return np.concatenate(rows)

In [15]:
# Turn every training sequence into its descriptor vector; tqdm shows progress.
print("Encoding training data...")
X_train_features = np.array(
    [sequence_to_features(sequence, descriptor) for sequence in tqdm(X_train)]
)

Encoding training data...


100%|██████████| 31029/31029 [04:37<00:00, 111.92it/s]


In [16]:
# Same encoding for the test sequences in the 'ConstructedAASeq_cln' column.
print("Encoding test data...")
X_test_features = np.array(
    [sequence_to_features(sequence, descriptor)
     for sequence in tqdm(test_Xs['ConstructedAASeq_cln'])]
)

Encoding test data...


100%|██████████| 20686/20686 [02:35<00:00, 133.17it/s]


In [17]:
# Standardise the features: statistics are learned from the training matrix
# and the same transformation is then applied to the test matrix.
# NOTE(review): the scaler is fitted on all training rows, including the ones
# the next cell holds out for validation.
scaler = StandardScaler().fit(X_train_features)
X_train_scaled = scaler.transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

In [18]:
# Hold out 20% of the scaled training rows for validation; the fixed seed
# keeps the split identical between runs.
X_train_final, X_val, y_train, y_val = train_test_split(X_train_scaled, y, test_size=0.2, random_state=42)

# Train the model. n_jobs=-1 builds the 100 trees on all available cores;
# random_state still fixes the seed used to grow the forest.
print("Training the model...")
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train_final, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Training the model...
Validation Accuracy: 0.8464389300676765

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3777
           1       0.77      0.87      0.82      2429

    accuracy                           0.85      6206
   macro avg       0.84      0.85      0.84      6206
weighted avg       0.85      0.85      0.85      6206



In [19]:
# Select the cleaned Physical table as the global `descriptor` read by the
# encoding cells that follow.
descriptor = clean_descriptor_df(phys)

def sequence_to_features(sequence, descriptor):
    """Concatenate per-residue descriptor values into one flat feature vector.

    Parameters
    ----------
    sequence : str
        Sequence whose characters are looked up in ``descriptor.index``.
    descriptor : pandas.DataFrame
        One row per residue label, one column per descriptor value.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(sequence) * len(descriptor.columns)``;
        characters missing from the index contribute zeros. An empty sequence
        gives an empty array.
    """
    values = descriptor.to_numpy()
    # Build the residue -> row table once per call instead of paying for a
    # pandas ``.loc`` lookup on every character of the sequence.
    lookup = dict(zip(descriptor.index, values))
    unknown = np.zeros(values.shape[1], dtype=values.dtype)
    rows = [lookup.get(aa, unknown) for aa in sequence]
    if not rows:
        return np.array([])
    return np.concatenate(rows)

In [20]:
# Turn every training sequence into its descriptor vector; tqdm shows progress.
print("Encoding training data...")
X_train_features = np.array(
    [sequence_to_features(sequence, descriptor) for sequence in tqdm(X_train)]
)

Encoding training data...


100%|██████████| 31029/31029 [02:59<00:00, 172.43it/s]


In [21]:
# Same encoding for the test sequences in the 'ConstructedAASeq_cln' column.
print("Encoding test data...")
X_test_features = np.array(
    [sequence_to_features(sequence, descriptor)
     for sequence in tqdm(test_Xs['ConstructedAASeq_cln'])]
)

Encoding test data...


100%|██████████| 20686/20686 [01:59<00:00, 173.32it/s]


In [22]:
# Standardise the features: statistics are learned from the training matrix
# and the same transformation is then applied to the test matrix.
# NOTE(review): the scaler is fitted on all training rows, including the ones
# the next cell holds out for validation.
scaler = StandardScaler().fit(X_train_features)
X_train_scaled = scaler.transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

In [23]:
# Hold out 20% of the scaled training rows for validation; the fixed seed
# keeps the split identical between runs.
X_train_final, X_val, y_train, y_val = train_test_split(X_train_scaled, y, test_size=0.2, random_state=42)

# Train the model. n_jobs=-1 builds the 100 trees on all available cores;
# random_state still fixes the seed used to grow the forest.
print("Training the model...")
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train_final, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Training the model...
Validation Accuracy: 0.8477280051563003

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3777
           1       0.77      0.87      0.82      2429

    accuracy                           0.85      6206
   macro avg       0.84      0.85      0.84      6206
weighted avg       0.86      0.85      0.85      6206



In [24]:
# Select the cleaned ST-scale table as the global `descriptor` read by the
# encoding cells that follow.
descriptor = clean_descriptor_df(st_scale)

def sequence_to_features(sequence, descriptor):
    """Concatenate per-residue descriptor values into one flat feature vector.

    Parameters
    ----------
    sequence : str
        Sequence whose characters are looked up in ``descriptor.index``.
    descriptor : pandas.DataFrame
        One row per residue label, one column per descriptor value.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(sequence) * len(descriptor.columns)``;
        characters missing from the index contribute zeros. An empty sequence
        gives an empty array.
    """
    values = descriptor.to_numpy()
    # Build the residue -> row table once per call instead of paying for a
    # pandas ``.loc`` lookup on every character of the sequence.
    lookup = dict(zip(descriptor.index, values))
    unknown = np.zeros(values.shape[1], dtype=values.dtype)
    rows = [lookup.get(aa, unknown) for aa in sequence]
    if not rows:
        return np.array([])
    return np.concatenate(rows)

In [25]:
# Turn every training sequence into its descriptor vector; tqdm shows progress.
print("Encoding training data...")
X_train_features = np.array(
    [sequence_to_features(sequence, descriptor) for sequence in tqdm(X_train)]
)

Encoding training data...


100%|██████████| 31029/31029 [03:51<00:00, 133.99it/s]


In [26]:
# Same encoding for the test sequences in the 'ConstructedAASeq_cln' column.
print("Encoding test data...")
X_test_features = np.array(
    [sequence_to_features(sequence, descriptor)
     for sequence in tqdm(test_Xs['ConstructedAASeq_cln'])]
)

Encoding test data...


100%|██████████| 20686/20686 [02:30<00:00, 137.62it/s]


In [27]:
# Standardise the features: statistics are learned from the training matrix
# and the same transformation is then applied to the test matrix.
# NOTE(review): the scaler is fitted on all training rows, including the ones
# the next cell holds out for validation.
scaler = StandardScaler().fit(X_train_features)
X_train_scaled = scaler.transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

In [28]:
# Hold out 20% of the scaled training rows for validation; the fixed seed
# keeps the split identical between runs.
X_train_final, X_val, y_train, y_val = train_test_split(X_train_scaled, y, test_size=0.2, random_state=42)

# Train the model. n_jobs=-1 builds the 100 trees on all available cores;
# random_state still fixes the seed used to grow the forest.
print("Training the model...")
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train_final, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Training the model...
Validation Accuracy: 0.8445053174347406

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3777
           1       0.77      0.87      0.81      2429

    accuracy                           0.84      6206
   macro avg       0.84      0.85      0.84      6206
weighted avg       0.85      0.84      0.85      6206



In [29]:
# Select the cleaned T-scale table as the global `descriptor` read by the
# encoding cells that follow.
descriptor = clean_descriptor_df(t_scale)

def sequence_to_features(sequence, descriptor):
    """Concatenate per-residue descriptor values into one flat feature vector.

    Parameters
    ----------
    sequence : str
        Sequence whose characters are looked up in ``descriptor.index``.
    descriptor : pandas.DataFrame
        One row per residue label, one column per descriptor value.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(sequence) * len(descriptor.columns)``;
        characters missing from the index contribute zeros. An empty sequence
        gives an empty array.
    """
    values = descriptor.to_numpy()
    # Build the residue -> row table once per call instead of paying for a
    # pandas ``.loc`` lookup on every character of the sequence.
    lookup = dict(zip(descriptor.index, values))
    unknown = np.zeros(values.shape[1], dtype=values.dtype)
    rows = [lookup.get(aa, unknown) for aa in sequence]
    if not rows:
        return np.array([])
    return np.concatenate(rows)

In [30]:
# Turn every training sequence into its descriptor vector; tqdm shows progress.
print("Encoding training data...")
X_train_features = np.array(
    [sequence_to_features(sequence, descriptor) for sequence in tqdm(X_train)]
)

Encoding training data...


100%|██████████| 31029/31029 [03:19<00:00, 155.41it/s]


In [31]:
# Same encoding for the test sequences in the 'ConstructedAASeq_cln' column.
print("Encoding test data...")
X_test_features = np.array(
    [sequence_to_features(sequence, descriptor)
     for sequence in tqdm(test_Xs['ConstructedAASeq_cln'])]
)

Encoding test data...


100%|██████████| 20686/20686 [03:08<00:00, 109.90it/s]


In [32]:
# Standardise the features: statistics are learned from the training matrix
# and the same transformation is then applied to the test matrix.
# NOTE(review): the scaler is fitted on all training rows, including the ones
# the next cell holds out for validation.
scaler = StandardScaler().fit(X_train_features)
X_train_scaled = scaler.transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

In [33]:
# Hold out 20% of the scaled training rows for validation; the fixed seed
# keeps the split identical between runs.
X_train_final, X_val, y_train, y_val = train_test_split(X_train_scaled, y, test_size=0.2, random_state=42)

# Train the model. n_jobs=-1 builds the 100 trees on all available cores;
# random_state still fixes the seed used to grow the forest.
print("Training the model...")
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train_final, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Training the model...
Validation Accuracy: 0.8466000644537545

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3777
           1       0.77      0.87      0.82      2429

    accuracy                           0.85      6206
   macro avg       0.84      0.85      0.84      6206
weighted avg       0.85      0.85      0.85      6206



In [52]:
# Select the cleaned VHSE-scale table as the global `descriptor` read by the
# encoding cells that follow.
descriptor = clean_descriptor_df(vhse_scale)

def sequence_to_features(sequence, descriptor):
    """Concatenate per-residue descriptor values into one flat feature vector.

    Parameters
    ----------
    sequence : str
        Sequence whose characters are looked up in ``descriptor.index``.
    descriptor : pandas.DataFrame
        One row per residue label, one column per descriptor value.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(sequence) * len(descriptor.columns)``;
        characters missing from the index contribute zeros. An empty sequence
        gives an empty array.
    """
    values = descriptor.to_numpy()
    # Build the residue -> row table once per call instead of paying for a
    # pandas ``.loc`` lookup on every character of the sequence.
    lookup = dict(zip(descriptor.index, values))
    unknown = np.zeros(values.shape[1], dtype=values.dtype)
    rows = [lookup.get(aa, unknown) for aa in sequence]
    if not rows:
        return np.array([])
    return np.concatenate(rows)

In [53]:
# Turn every training sequence into its descriptor vector; tqdm shows progress.
print("Encoding training data...")
X_train_features = np.array(
    [sequence_to_features(sequence, descriptor) for sequence in tqdm(X_train)]
)

Encoding training data...


100%|██████████| 31029/31029 [03:38<00:00, 142.21it/s]


In [54]:
# Same encoding for the test sequences in the 'ConstructedAASeq_cln' column.
print("Encoding test data...")
X_test_features = np.array(
    [sequence_to_features(sequence, descriptor)
     for sequence in tqdm(test_Xs['ConstructedAASeq_cln'])]
)

Encoding test data...


100%|██████████| 20686/20686 [02:26<00:00, 140.75it/s]


In [55]:
# Standardise the features: statistics are learned from the training matrix
# and the same transformation is then applied to the test matrix.
# NOTE(review): the scaler is fitted on all training rows, including the ones
# the next cell holds out for validation.
scaler = StandardScaler().fit(X_train_features)
X_train_scaled = scaler.transform(X_train_features)
X_test_scaled = scaler.transform(X_test_features)

In [38]:
# Hold out 20% of the scaled training rows for validation; the fixed seed
# keeps the split identical between runs.
X_train_final, X_val, y_train, y_val = train_test_split(X_train_scaled, y, test_size=0.2, random_state=42)

# Train the model. n_jobs=-1 builds the 100 trees on all available cores;
# random_state still fixes the seed used to grow the forest.
print("Training the model...")
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train_final, y_train)

# Make predictions on the validation set
y_val_pred = rf_classifier.predict(X_val)

# Evaluate the model
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Training the model...
Validation Accuracy: 0.8486948114727683

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3777
           1       0.77      0.87      0.82      2429

    accuracy                           0.85      6206
   macro avg       0.84      0.85      0.84      6206
weighted avg       0.86      0.85      0.85      6206



In [56]:
# Hold out 20% of the scaled training rows for validation (fixed seed).
X_train_final, X_val, y_train, y_val = train_test_split(
    X_train_scaled, y, test_size=0.2, random_state=42
)

# Fit a logistic-regression classifier on the training portion.
print("Training the Logistic Regression model...")
log_reg = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_final, y_train)

# Score the held-out rows: overall accuracy, then the per-class report.
y_val_pred = log_reg.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Training the Logistic Regression model...
Validation Accuracy: 0.8930

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      3777
           1       0.86      0.87      0.86      2429

    accuracy                           0.89      6206
   macro avg       0.89      0.89      0.89      6206
weighted avg       0.89      0.89      0.89      6206



In [57]:
# Predict labels for the scaled test matrix.
test_predictions = log_reg.predict(X_test_scaled)

# Pair each test Id with its predicted label.
results = test_X[['Id']].assign(Brightness=test_predictions)

# Write the two-column table without the index (overwrites predictions.csv).
results.to_csv('predictions.csv', index=False)