In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = "Final_Dataset.csv"  # Replace with your file's path
data = pd.read_csv(file_path)

# Define the input (X) and output (y) features
input_features = [
    'Stage', 'Age', 'Male_or_Female', 'Hair_Color', 'Skin_Tone', 
    'Height', 'Color_Blindness', 'Weight', 'Eye_Colour', 'Eye_Sight', 
    'Vericose_Vein', 'Blood Pressure', 'Thyroid', 'Blood_Group', 
    'Allergy', 'Migrane', 'Nerve_Problem'
]

output_features = ['Sample_ID', 'Condition', 'RNA_Sequence', 'Varied_RNA_Sequence']

# Split dataset into input (X) and output (y)
X = data[input_features]
y = data[output_features]

# Split into training and testing datasets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print("Training input shape:", X_train.shape)
print("Testing input shape:", X_test.shape)
print("Training output shape:", y_train.shape)
print("Testing output shape:", y_test.shape)


Training input shape: (320, 17)
Testing input shape: (80, 17)
Training output shape: (320, 4)
Testing output shape: (80, 4)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Encode categorical data (necessary for models to work)
X_encoded = X.copy()
y_encoded = y['Condition'].copy()  # Assuming Condition is the target variable

# Encode input features
label_encoders = {}
for col in X_encoded.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col])
    label_encoders[col] = le

# Encode output feature (if categorical)
y_le = LabelEncoder()
y_encoded = y_le.fit_transform(y_encoded)

# Splitting again with encoded data
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC()
}

# Results storage
results = []

# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train_enc, y_train_enc)
    
    # Predict on test data
    y_pred = model.predict(X_test_enc)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test_enc, y_pred)
    precision = precision_score(y_test_enc, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test_enc, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test_enc, y_pred, average='weighted')
    
    # Store results
    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

# Create a DataFrame of results
results_df = pd.DataFrame(results)

# Find the best model based on F1 score
best_model = results_df.loc[results_df['F1 Score'].idxmax()]

print("Model Performance Metrics:")
print(results_df)
print("\nBest Model Based on F1 Score:")
print(best_model)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Performance Metrics:
                 Model  Accuracy  Precision  Recall  F1 Score
0  Logistic Regression    0.3125   0.326507  0.3125  0.311492
1        Random Forest    0.3375   0.348263  0.3375  0.322371
2        Decision Tree    0.4125   0.427209  0.4125  0.416061
3                  SVM    0.3000   0.090000  0.3000  0.138462

Best Model Based on F1 Score:
Model        Decision Tree
Accuracy            0.4125
Precision         0.427209
Recall              0.4125
F1 Score          0.416061
Name: 2, dtype: object


In [5]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle

# Define the model
multi_output_model = MultiOutputClassifier(RandomForestClassifier())

# Fit the model on the training data
multi_output_model.fit(X_train_enc, y_train[output_features])

# Save the model to a file
model_file_path = "multi_output_model.pkl"
with open(model_file_path, 'wb') as file:
    pickle.dump(multi_output_model, file)

print(f"Model saved to {model_file_path}")


Model saved to multi_output_model.pkl


In [6]:
# Load the saved model
with open(model_file_path, 'rb') as file:
    loaded_model = pickle.load(file)

# User input
user_input = {
    'Stage': 'Blastocyst',
    'Age': 25,
    'Male_or_Female': 'Male',
    'Hair_Color': 'Brown',
    'Skin_Tone': 'Dark',
    'Height': 161,
    'Color_Blindness': 'No',
    'Weight': 62,
    'Eye_Colour': 'Blue',
    'Eye_Sight': 'Poor',
    'Vericose_Vein': 'No',
    'Blood Pressure': 'Low',
    'Thyroid': 'No',
    'Blood_Group': 'O+',
    'Allergy': 'Yes',
    'Migrane': 'Yes',
    'Nerve_Problem': 'Yes'
}

# Convert user input to DataFrame
user_input_df = pd.DataFrame([user_input])

# Encode user input
for col in user_input_df.select_dtypes(include=['object']).columns:
    if col in label_encoders:
        user_input_df[col] = label_encoders[col].transform(user_input_df[col])

# Predict the output
predicted_output = loaded_model.predict(user_input_df)

# Decode the predicted output if necessary
predicted_df = pd.DataFrame(predicted_output, columns=output_features)
predicted_df['Sample_ID'] = y['Sample_ID']  # Sample_ID is directly copied as it may not need prediction

print("Predicted Output:")
print(predicted_df)


Predicted Output:
  Sample_ID         Condition  \
0  Sample_1  Moderate Quality   

                                        RNA_Sequence  \
0  AAGUAGGAGUUAACCUUUGUGUUAAAGGUGAUUCAGGUAAUCCAAA...   

                                 Varied_RNA_Sequence  
0  AAGUAGGAGUAAGGUUUGUGUUAAGGUGAUGAGGUUGGAAAGGAGA...  
