<a href="https://colab.research.google.com/github/Ademiday00/FYP/blob/master/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch scikit-learn




In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from transformers import BertTokenizer, BertModel
import torch.nn.functional as F

# Small hand-written sample table (five rows) used as the dataset.
# Every column is a list of equal length; the dict keys become the column names.
data = dict(
    Age=[50, 65, 58, 45, 60],
    Gender=['Male', 'Female', 'Male', 'Female', 'Male'],
    BMI=[30, 28, 35, 32, 31],
    Pain_Score=[6, 4, 7, 3, 5],
    Medical_History=['Previous knee injury', 'No injury', 'Surgery', 'No injury', 'Previous knee injury'],
    Joint_Mobility=[110, 90, 95, 100, 105],
    Physical_Activity=['Active', 'Sedentary', 'Active', 'Active', 'Sedentary'],
    Radiographic_Grade=[3, 2, 4, 1, 2],
    Joint_Space_Width=[3.5, 4.2, 2.9, 3.8, 3.2],
    Osteophyte_Presence=['Yes', 'No', 'Yes', 'No', 'Yes'],
    Sclerosis=['No', 'Yes', 'Yes', 'No', 'Yes'],
)

# Column-oriented construction: one DataFrame column per dict entry.
df = pd.DataFrame.from_dict(data)

# 1. Preprocess Categorical Features using LabelEncoder
# LabelEncoder assigns codes in sorted class order (e.g., Female = 0, Male = 1).
#
# 'Medical_History' and 'Physical_Activity' are embedded with BioBERT further
# down, so their raw text must survive this step: overwriting them with integer
# codes would make BioBERT embed the strings "0", "1", ... instead of the
# clinical text. An encoder is still fitted for them so `label_encoders` keeps
# one entry per listed column, but the DataFrame column is left untouched.
text_columns = {'Medical_History', 'Physical_Activity'}

label_encoders = {}
for col in ['Gender', 'Medical_History', 'Physical_Activity', 'Osteophyte_Presence', 'Sclerosis']:
    le = LabelEncoder()
    if col in text_columns:
        le.fit(df[col])  # fit only; keep the original text in df[col]
    else:
        df[col] = le.fit_transform(df[col])  # replace the labels with integer codes
    label_encoders[col] = le

# 2. Standardize the continuous columns in place (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split below, so test rows influence the scaling statistics.
numeric_columns = ['Age', 'BMI', 'Pain_Score', 'Joint_Mobility', 'Joint_Space_Width']
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# 3. Load the BioBERT tokenizer and encoder used to embed
#    'Medical_History' and 'Physical_Activity'.
# Both objects are loaded from the same checkpoint name.
BIOBERT_CHECKPOINT = 'dmis-lab/biobert-base-cased-v1.1'
tokenizer = BertTokenizer.from_pretrained(BIOBERT_CHECKPOINT)
biobert = BertModel.from_pretrained(BIOBERT_CHECKPOINT)

def get_biobert_embeddings(text):
    """Return a mean-pooled BioBERT embedding for `text`.

    Args:
        text: A string (or a list of strings) accepted by the module-level
            `tokenizer`. Inputs longer than 64 tokens are truncated.

    Returns:
        A tensor that carries no autograd history. For a single string the
        result is 1-D (one value per hidden unit); singleton dimensions are
        squeezed away, as before.
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=64)

    # Embedding extraction is inference only: skip building an autograd graph
    # through the encoder, which costs memory and forces callers to detach.
    with torch.no_grad():
        hidden = biobert(**inputs).last_hidden_state

        # Average over real tokens only. When several texts are padded to a
        # common length, a plain mean would mix padding positions into the
        # embedding of the shorter texts.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled = (hidden * mask).sum(dim=1) / token_counts

    return pooled.squeeze()

# Apply BioBERT for text embeddings
# Maps each source column to the column that receives its embeddings.
embedding_columns = {
    'Medical_History': 'Medical_History_Embeddings',
    'Physical_Activity': 'Physical_Activity_Embeddings',
}


def _embed_cell(value):
    """Embed one cell (cast to str) and return the result as a NumPy array."""
    return get_biobert_embeddings(str(value)).detach().numpy()


# Each cell of the target column ends up holding one NumPy array.
for source_col, target_col in embedding_columns.items():
    df[target_col] = df[source_col].apply(_embed_cell)

# 4. Prepare Features (X) and Target (y)
# The target and the two raw text columns are removed from the feature table.
# The remaining *_Embeddings columns hold one NumPy array per cell, so the
# resulting X is an object-dtype array rather than a plain float matrix.
non_feature_columns = ['Radiographic_Grade', 'Medical_History', 'Physical_Activity']
X = df.drop(columns=non_feature_columns).to_numpy()
y = df['Radiographic_Grade'].to_numpy()

# Split data into training and testing (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5. Define the Hybrid Model (BioBERT + Bi-LSTM + CNN)
class HybridModel(nn.Module):
    """Bi-LSTM + 1-D CNN classifier over a sequence of embedding vectors.

    Data flow: embeddings -> Bi-LSTM -> Conv1d over the Bi-LSTM outputs ->
    global max-pool over the sequence axis -> two fully connected layers ->
    5 logits (Radiographic Grade 0-4).

    Args:
        input_size: Kept for backward compatibility and stored on the
            instance; no layer is sized from it.
        embedding_dim: Size of each input embedding vector (768 by default).
        lstm_hidden_dim: Hidden size of each LSTM direction.
    """

    def __init__(self, input_size, embedding_dim=768, lstm_hidden_dim=128):
        super(HybridModel, self).__init__()
        self.input_size = input_size
        self.embedding_dim = embedding_dim

        # Bi-LSTM over the embedding sequence; its output has
        # 2 * lstm_hidden_dim features per step (forward + backward).
        self.bilstm = nn.LSTM(input_size=embedding_dim, hidden_size=lstm_hidden_dim, num_layers=1, bidirectional=True, batch_first=True)

        # The CNN reads the Bi-LSTM outputs, so its input channels match the
        # Bi-LSTM feature size. padding=1 preserves the sequence length, which
        # keeps sequences shorter than the kernel (1 or 2 steps) valid.
        self.conv1 = nn.Conv1d(in_channels=2 * lstm_hidden_dim, out_channels=64, kernel_size=3, padding=1)
        # Global max-pool: always reduces the sequence axis to length 1, so the
        # flattened size is 64 regardless of the input sequence length.
        self.pool = nn.AdaptiveMaxPool1d(1)

        # Fully connected layers for classification
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, 5)  # Output 5 classes (Radiographic Grade 0-4)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Float tensor of shape (batch, sequence_length, embedding_dim)
                with sequence_length >= 1.

        Returns:
            Tensor of shape (batch, 5) holding unnormalized class scores.

        Raises:
            ValueError: If `x` is not 3-D, has an empty sequence axis, or its
                last dimension differs from `embedding_dim`.
        """
        if x.dim() != 3 or x.size(1) == 0 or x.size(-1) != self.embedding_dim:
            raise ValueError(
                f"Expected input of shape (batch, seq_len >= 1, {self.embedding_dim}), "
                f"got {tuple(x.shape)}"
            )

        # Bi-LSTM for sequential learning of the embeddings
        lstm_out, _ = self.bilstm(x)

        # CNN for feature extraction from the LSTM output.
        # Conv1d expects (Batch, Channels, Sequence Length).
        features = lstm_out.permute(0, 2, 1)
        features = self.pool(F.relu(self.conv1(features)))

        features = features.flatten(start_dim=1)  # (batch, 64)
        hidden = F.relu(self.fc1(features))
        return self.fc2(hidden)

# 6. Training the Model
GRADE_LABELS = [0, 1, 2, 3, 4]  # every Radiographic Grade, used to fix the confusion-matrix axes

# Seed before the model is built so weight initialisation is repeatable.
torch.manual_seed(42)


def rows_to_embedding_tensor(rows):
    """Stack the embedding vectors found in each feature row into one tensor.

    Each row of the feature matrix mixes scalar cells with cells that hold a
    NumPy embedding vector, which is why the matrix has object dtype and cannot
    be handed to `torch.tensor` directly. Only the array-valued cells are
    collected here, in column order.

    Args:
        rows: Iterable of feature rows (e.g. `X_train`).

    Returns:
        float32 tensor of shape (n_samples, n_embedding_cells, embedding_dim).

    Raises:
        ValueError: If `rows` is empty, a row holds no embedding vector, or
            the vectors cannot be stacked into a regular array.
    """
    sequences = []
    for index, row in enumerate(rows):
        vectors = [
            np.asarray(cell, dtype=np.float32).ravel()
            for cell in row
            if isinstance(cell, np.ndarray)
        ]
        if not vectors:
            raise ValueError(f"Sample {index} contains no embedding vectors")
        sequences.append(np.stack(vectors))
    if not sequences:
        raise ValueError("Cannot build a tensor from an empty set of samples")
    return torch.from_numpy(np.stack(sequences))


# Convert to torch tensors. No sample is ever dropped, so features and labels
# stay aligned row for row.
# The scalar columns of X are not passed to the model: HybridModel.forward
# takes a single tensor of embedding vectors.
# NOTE(review): each sample yields only as many sequence steps as it has
# embedding columns (two here) - confirm HybridModel's conv/pool stack accepts
# sequences that short.
X_train_tensor = rows_to_embedding_tensor(X_train)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = rows_to_embedding_tensor(X_test)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Build the model exactly once, then the optimizer from that model's
# parameters, so the optimizer updates the model that is trained and evaluated.
cnn_model = HybridModel(input_size=X_train.shape[1])

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)

# Training loop (full-batch gradient descent)
epochs = 10
for epoch in range(epochs):
    cnn_model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = cnn_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    # Backward pass
    loss.backward()
    optimizer.step()

    # Print loss every 2 epochs
    if (epoch + 1) % 2 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# 7. Evaluation
cnn_model.eval()
with torch.no_grad():
    outputs = cnn_model(X_test_tensor)
    _, predicted = torch.max(outputs, 1)

# Accuracy
accuracy = accuracy_score(y_test, predicted.numpy())
print(f"Accuracy of the hybrid model: {accuracy:.4f}")

# 8. Visualization: Confusion Matrix
# Passing `labels` forces a 5x5 matrix even when the test split contains only
# some of the grades, so the matrix always matches the five tick labels.
conf_matrix = confusion_matrix(y_test, predicted.numpy(), labels=GRADE_LABELS)
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=GRADE_LABELS, yticklabels=GRADE_LABELS, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

Sample 0 has shape (10,)
Sample 1 has shape (10,)
Sample 2 has shape (10,)
Sample 3 has shape (10,)


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [None]:
# Colab only: attach Google Drive to the runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)