In [2]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.svm import SVC # type: ignore
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # type: ignore
import time


In [3]:
def preprocess_data(files):
    train_dataframes = []
    test_dataframes = []
    for file in files:
        with open(file, 'r') as f:
            lines = f.readlines()
            sequences = lines[1::3]  # Sélectionner les lignes des séquences
            annotations = lines[2::3]  # Sélectionner les lignes des annotations
            df = pd.DataFrame({'Sequence': sequences, 'Annotation': annotations})
            # Diviser les données en ensembles d'entraînement et de test
            train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
            train_dataframes.append(train_df)
            test_dataframes.append(test_df)
    train_data = pd.concat(train_dataframes)
    test_data = pd.concat(test_dataframes)
    return train_data, test_data

def extract_test_data(test_data, num_files):
    test_dataframes = []
    start_index = 0
    for _ in range(num_files):
        end_index = start_index + len(test_data) // num_files
        test_dataframes.append(test_data.iloc[start_index:end_index])
        start_index = end_index
    return test_dataframes


# Encodage one-hot des séquences
def one_hot_encoding(sequence):
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    one_hot = np.zeros((len(sequence), len(amino_acids)))
    for i, aa in enumerate(sequence):
        if aa in amino_acids:
            one_hot[i, amino_acids.index(aa)] = 1
    return one_hot.flatten()

# Extraction des caractéristiques
def extract_features(df, p=13, q=2):
    features = []
    labels = []
    for index, row in df.iterrows():
        sequence = row['Sequence']
        annotation = row['Annotation']
        for i in range(len(sequence) - p - q + 1):
            neighborhood = sequence[i:i+p+q]
            encoded_neighborhood = one_hot_encoding(neighborhood)
            label = 1 if 'C' in annotation[i:i+p+q] else 0
            features.append(encoded_neighborhood)
            labels.append(label)
    return features, labels


In [4]:

# Entraînement et évaluation
def train_and_evaluate(X_train, X_test, y_train, y_test, kernel='rbf'):
    svm = SVC(kernel=kernel)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1


In [5]:
# Optimisation des paramètres
def optimize_parameters( df, kernels, p_values, q_values):
    best_accuracy = 0
    best_params = {'kernel': None, 'p': None, 'q': None}
    
    for p in p_values:
        for q in q_values:
            
            features, labels = extract_features(df, p, q)
            X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=42)
            for kernel in kernels:
                accuracy, _, _, _= train_and_evaluate(X_train, X_test, y_train, y_test, kernel)
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params = {'kernel': kernel, 'p': p, 'q': q}
          
        

    return best_params, best_accuracy

In [6]:
# Chemins vers les fichiers .red
files =  ["/home/ouattara_aboubakar/PROJET 442/p2-protein-cleavage/data/EUKSIG_13.red", "/home/ouattara_aboubakar/PROJET 442/p2-protein-cleavage/data/GRAM-SIG_13.red", "/home/ouattara_aboubakar/PROJET 442/p2-protein-cleavage/data/GRAM+SIG_13.red"]

# Prétraitement des données
df_train, df_test = preprocess_data(files)

num_files = len(files)  # Nombre de fichiers
test_dataframes = extract_test_data(df_test, num_files)

# Valeurs de p et q à tester
p_plage = range(7,18, 2)
q_plage = range(2,7, 1)

# Noyaux SVM à tester
kernels = ['linear', 'poly', 'rbf']

In [7]:
# Optimisation des paramètres

best_params, best_accuracy = optimize_parameters(df_train, kernels, p_values=p_plage, q_values=q_plage)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
print("Meilleur kernel:", best_params['kernel'])
print("Meilleur p:", best_params['p'])
print("Meilleur q:", best_params['q'])
print("Meilleure précision:", best_accuracy)

Meilleur kernel: poly
Meilleur p: 7
Meilleur q: 2
Meilleure précision: 0.8383437322745321


In [9]:
def predict_single_site(sequence, model, p, q):
    site = None
    for i in range(len(sequence) - p - q + 1):
        neighborhood = sequence[i:i+p+q]
        encoded_neighborhood = one_hot_encoding(neighborhood)
        prediction = model.predict([encoded_neighborhood])[0]
        if prediction == 1:
            site = i + p + 2  # Ajouter le décalage pour obtenir la position du site de clivage
            break  # Arrêter dès qu'un site de clivage est trouvé
    return site

In [10]:
X, y = extract_features(df_train, best_params['p'], best_params['q'])

In [34]:
best_kernel =  best_params['kernel'] # Choisissez le meilleur noyau en fonction des résultats précédents
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
best_model = SVC(kernel=best_kernel)
start_time = time.time()
best_model.fit(X_train, y_train)
end_time = time.time()
training_time = end_time - start_time

y_pred = best_model .predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


print('time = ', training_time)
print("accuracy train", accuracy)
print("precision", precision)
print("recall", recall)
print("f1", f1)


time =  0.9442927837371826
accuracy train 0.8612593383137673
precision 0.7857142857142857
recall 0.2716049382716049
f1 0.4036697247706422


In [35]:
file_names = ["Euk", "Gneg", "Gpos"] 
# Boucle sur les précisions et les noms des organismes  correspondants
for i in range(num_files):
    X, y = extract_features(test_dataframes[i], best_params['p'], best_params['q'])
    y_pred = best_model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    print(f"Accuracy for {file_names[i]}: {accuracy}")
    print(f"Precision for {file_names[i]}: {precision}")
    print(f"Recall for {file_names[i]}: {recall}")
    print(f"F1-score for {file_names[i]}: {f1}")
    print("--------------------------------------------------------------")
    print("\n")

Accuracy for Euk: 0.805119926199262
Precision for Euk: 0.5084745762711864
Recall for Euk: 0.03546099290780142
F1-score for Euk: 0.06629834254143646
--------------------------------------------------------------


Accuracy for Gneg: 0.8020713463751439
Precision for Gneg: 0.38333333333333336
Recall for Gneg: 0.027186761229314422
F1-score for Gneg: 0.05077262693156733
--------------------------------------------------------------


Accuracy for Gpos: 0.9692439128577531
Precision for Gpos: 0.9821428571428571
Recall for Gpos: 0.8451536643026005
F1-score for Gpos: 0.9085133418043202
--------------------------------------------------------------




In [51]:
sequence = 'MKKKTLSLFVGLMLLIGLLFSGSLPYNPNAAEASSSASVKGDVIYQIIIDRFYDGDTTNNNPA'
site = predict_single_site(sequence, best_model, best_params['p'], best_params['q'])
print("Site de clivage prédit:", site)

Site de clivage prédit: 34


In [52]:
mot = 'SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSC'
print(len(mot))

34


In [20]:
print(len(range(15)))

15
