MACHINE LEARNING ANALYSIS

In [None]:
# importing pandas and numpy
import pandas as pd
import numpy as np

# Point the kernel at the data directory so that the relative CSV paths used
# by the later cells resolve.
import os

# The default is the original machine-specific location; set the
# ML_ANALYSIS_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
DATA_DIR = os.environ.get('ML_ANALYSIS_DATA_DIR', '/home/researchlab/Downloads/CODES/Data')

# changing the working directory
os.chdir(DATA_DIR)

print(os.getcwd())

# checking the files in the directory (shown as the cell output)
os.listdir()


In [None]:
# Load the dataset CSV from the current working directory and report its
# (rows, columns) shape.
dataset_file = "9_dataset_subset_unique.csv"
df_union = pd.read_csv(dataset_file)
print(df_union.shape)

# Keep the column index; the validation cell selects these same columns.
df_union_columns = df_union.columns
print(df_union_columns)

In [None]:
# Build the model inputs: every column except ID, Label and Sequence is a
# feature; the 'Label' column, flattened to a 1-D array, is the target.
non_feature_columns = ['ID', 'Label', 'Sequence']
X_union = df_union.drop(columns=non_feature_columns)
Y_union = np.ravel(df_union['Label'])
print(X_union.shape, Y_union.shape)

In [None]:
# Bar chart of the maximum value of each feature column.
column_maxima = X_union.max()
column_maxima.plot(kind='bar')

In [None]:
# Rescale every feature whose maximum exceeds 1.0 by dividing it by 100
# (presumably these columns are percentages on a 0-100 scale -- TODO confirm).
#
# NOTE: this cell modifies X_union. Running it a second time would rescale
# again any column whose maximum is still above 1.0, so run it exactly once
# after X_union has been built.

# df1 is a snapshot of the selected columns as they were BEFORE rescaling;
# its column index records which features were rescaled.
scale_mask = X_union.max() > 1.0
df1 = X_union.loc[:, scale_mask].copy()

# Vectorised equivalent of dividing each selected column by 100 in a loop.
X_union[df1.columns] = df1.div(100)

# Only the last expression of a cell is displayed, so print the summaries
# explicitly to make the before/after comparison visible.
print(df1.describe())
print(X_union.describe())

# plotting X max values for each column after rescaling
X_union.max().plot(kind='bar')

In [None]:
# Hold out 20% of the dataset as a test set; the fixed random_state makes the
# split repeatable.
from sklearn.model_selection import train_test_split

union_split = train_test_split(X_union, Y_union, test_size=0.2, random_state=99)
X_union_train, X_union_test, Y_union_train, Y_union_test = union_split
print(*(part.shape for part in union_split))

In [None]:
# Load the external validation file, restrict it to the columns of the
# training dataset, and build X_val / Y_val the same way X_union / Y_union
# were built.
df_validation = pd.read_csv("Validation.csv")
print(df_validation.shape)

# Fail early, naming the offending columns, if the validation file lacks any
# column of the training dataset.
missing_columns = [col for col in df_union_columns if col not in df_validation.columns]
if missing_columns:
    raise KeyError(f"Validation.csv is missing columns: {missing_columns}")

# creating a subset of the validation dataset using the columns stored in df_union_columns
df_validation = df_validation[df_union_columns]
print(df_validation.shape)

# creating X_val and Y_val by removing the ID, Label and Sequence columns
X_val = df_validation.drop(['ID','Label','Sequence'],axis=1)
Y_val = np.ravel(df_validation['Label'])

# Apply the same /100 rescaling that X_union received (df1.columns holds the
# rescaled feature names). Without this the models, trained on rescaled
# features, would be scored on features in different units.
# NOTE(review): assumes Validation.csv stores these columns in the same raw
# units as the training CSV -- confirm it has not been rescaled already.
X_val[df1.columns] = X_val[df1.columns].div(100)

# checking the shape of X_val and Y_val
print(X_val.shape)
print(Y_val.shape)

# splitting the validation dataset into train and test
X_val_train, X_val_test, Y_val_train, Y_val_test = train_test_split(X_val, Y_val, test_size=0.2, random_state=99)
print(X_val_train.shape, X_val_test.shape, Y_val_train.shape, Y_val_test.shape)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Defining the classifiers: display name -> unfitted estimator.
# Every SVC sets probability=True so that predict_proba is available.
classifiers = {
    # scikit-learn parameter names are lowercase snake_case; random_state is
    # fixed like the other seeded models so the forest is reproducible.
    'Random Forest': RandomForestClassifier(
        max_depth=15,
        min_samples_leaf=2,
        n_estimators=100,
        max_features=15,
        random_state=42,
    ),
    'Logistic Regression': LogisticRegression(max_iter=100),

    'Support Vector Machine poly': SVC(kernel='poly', C=1.0, gamma='scale', random_state=42, probability=True),
    'Support Vector Machine rbf': SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42, probability=True),
    'Support Vector Machine Linear': SVC(kernel='linear', C=1.0, random_state=42, probability=True),
    'Support Vector Machine Sigmoid': SVC(kernel='sigmoid', C=1.0, gamma='scale',random_state=42, probability=True),
    # using XGBoost classifier
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
    'Gaussian Naive Bayes': GaussianNB()
}

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn import metrics

# Fit every classifier on the training split, score it on the held-out test
# split and on the external validation set, run stratified 10-fold
# cross-validation on the full dataset, and collect one result row per model.
results_list = []

# The splitter is identical for every classifier, so build it once. With a
# fixed integer random_state it yields the same folds on every use.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Loop through each classifier
for clf_name, clf in classifiers.items():
    print(f"\nTraining and evaluating {clf_name}...\n")

    # Training the model on the training set
    clf.fit(X_union_train, Y_union_train)

    # Test-set predictions, plus the score of the second class column used
    # for AUROC (assumes binary labels 0/1 -- TODO confirm).
    Y_union_pred = clf.predict(X_union_test)
    Y_union_score = clf.predict_proba(X_union_test)[:, 1]

    # Model evaluation on the held-out test split
    accuracy_test = metrics.accuracy_score(Y_union_test, Y_union_pred)
    precision_test = metrics.precision_score(Y_union_test, Y_union_pred)
    recall_test = metrics.recall_score(Y_union_test, Y_union_pred)
    # Specificity is the recall of the negative class (label 0).
    specificity_test = metrics.recall_score(Y_union_test, Y_union_pred, pos_label=0)
    auroc_test = metrics.roc_auc_score(Y_union_test, Y_union_score)
    mcc_test = metrics.matthews_corrcoef(Y_union_test, Y_union_pred)
    f1_test = metrics.f1_score(Y_union_test, Y_union_pred)

    # Evaluation on the external validation set
    Y_val_pred = clf.predict(X_val)
    accuracy_val = metrics.accuracy_score(Y_val, Y_val_pred)

    # Stratified 10-fold cross-validation on the full dataset.
    cv_scores = cross_val_score(clf, X_union, Y_union, cv=cv, scoring='accuracy')
    # Rounded copy for display only; mean and std below use the raw scores so
    # they are not distorted by the rounding.
    Stratified_accuracy_scores = cv_scores.round(2)

    # Save this classifier's results as one row
    results_list.append({
        'Classifier': clf_name,
        'Testing Accuracy': accuracy_test,
        'Testing Precision': precision_test,
        'Testing Recall': recall_test,
        'Testing Specificity': specificity_test,
        'Testing AUROC': auroc_test,
        'Testing MCC': mcc_test,
        'Testing F1 Score': f1_test,
        'Validation Accuracy': accuracy_val,
        'Stratified 10-fold CV Accuracy Scores': Stratified_accuracy_scores,
        'Mean Stratified CV Accuracy': np.mean(cv_scores),
        'Std Stratified CV Accuracy': np.std(cv_scores)
    })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results_list)

# Display the results DataFrame
print("\nResults DataFrame:")
print(results_df)
# head must be called; printing the attribute shows only the bound method.
print(results_df.head())

In [None]:
# Round the numeric columns of the results table to two decimal places.
results_df = results_df.round(decimals=2)


In [None]:
# Display every row of the results table. There is one row per classifier and
# eight classifiers are defined, so a 7-row head would hide the last one.
results_df

In [None]:
# Write the results table to a CSV file in the current working directory,
# leaving out the row index.
results_csv_name = 'results_df.csv'
results_df.to_csv(results_csv_name, index=False)


In [None]:
# Serialise the Random Forest estimator from the classifiers dict (fitted in
# the evaluation loop) to a pickle file in the current working directory.
import pickle

# The context manager closes the file handle, so the data is flushed to disk
# even if pickling raises.
with open('random_forest.pkl', 'wb') as model_file:
    pickle.dump(classifiers['Random Forest'], model_file)
