In [None]:
# **1. Data Loading and Overview**

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt_tab')
from sklearn.model_selection import train_test_split


# Load dataset and give the two columns the names used by the rest of the notebook.
# rename() returns a new frame here instead of mutating in place, so the cell can be
# re-run without depending on what a previous run did to `data`.
data = pd.read_csv('NLP_Dataset_Extended.csv').rename(
    columns={'Input': 'Health_Record', 'Prediction': 'Risk_Level'}
)

print("\n✅ Dataset Overview:")
# DataFrame.info() writes its summary to stdout itself and returns None; wrapping it
# in print() would emit an extra "None" line after the summary.
data.info()

# Class distribution
plt.figure(figsize=(8, 5))
sns.countplot(x='Risk_Level', data=data, palette='coolwarm')
plt.title('Class Distribution')
plt.show()

# Train-validation-test split, stratified on the label at both stages:
# 20% of all rows are held out for test, then 10% of the remainder for validation.
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['Risk_Level'], random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, stratify=train_data['Risk_Level'], random_state=42)


In [None]:
# **2. Preprocessing**

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the preprocessing step (tokenizer models,
# stop-word lists, WordNet data), in the same order as before.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared objects read by preprocess_text()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one health record into a space-joined string of lemmatised tokens.

    The text is lower-cased, every character that is not an ASCII letter, digit
    or whitespace is replaced by a space, whitespace runs are collapsed, and the
    result is tokenised with ``word_tokenize``. Tokens found in the module-level
    ``stop_words`` set are dropped unless they are listed in ``important_terms``;
    the survivors are lemmatised with the module-level ``lemmatizer``.

    Args:
        text: Raw record text. ``None`` and float NaN are treated as an empty
            record; any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned tokens joined by single spaces ('' for a missing record).
    """
    # Missing values have no .lower(); pandas typically hands a missing CSV cell
    # to .apply() as float NaN, which is the only float not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    # Terms that must survive stop-word removal even if they appear in stop_words.
    important_terms = {"diabetes", "cholesterol", "blood", "pressure", "heart", "disease", "smoker"}
    tokens = [word for word in tokens if word not in stop_words or word in important_terms]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply to all splits.
# assign() builds a new frame for each split rather than writing a column into
# the frames that train_test_split derived from `data`; this avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
train_data = train_data.assign(processed_text=train_data['Health_Record'].apply(preprocess_text))
val_data = val_data.assign(processed_text=val_data['Health_Record'].apply(preprocess_text))
test_data = test_data.assign(processed_text=test_data['Health_Record'].apply(preprocess_text))


In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

def preprocess_text(text):
    """Sentence-aware variant of the record cleaner.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    lower-cased, stripped of every character that is not an ASCII letter, digit
    or whitespace, whitespace-collapsed, tokenised with ``word_tokenize``,
    filtered against the module-level ``stop_words`` set (tokens listed in
    ``important_terms`` are always kept) and lemmatised with the module-level
    ``lemmatizer``. Tokens from all sentences are concatenated in order.

    Once this cell runs, this definition replaces the earlier
    ``preprocess_text`` in the notebook namespace. It relies on ``re``,
    ``stop_words`` and ``lemmatizer`` created by the preprocessing cell above.

    Args:
        text: Raw record text. ``None`` and float NaN are treated as an empty
            record; any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned tokens joined by single spaces ('' for a missing record).
    """
    # Missing values cannot be tokenised; float NaN is the only float that is
    # not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Built once per call instead of once per sentence.
    important_terms = {"diabetes", "cholesterol", "blood", "pressure", "heart", "disease", "smoker"}

    all_tokens = []
    # Sentence Tokenization
    for sentence in sent_tokenize(str(text)):
        # Word Tokenization and other preprocessing steps
        cleaned = sentence.lower()
        cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', cleaned)
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        tokens = word_tokenize(cleaned)
        tokens = [word for word in tokens if word not in stop_words or word in important_terms]
        # Add the lemmatised tokens from this sentence
        all_tokens.extend(lemmatizer.lemmatize(word) for word in tokens)

    return ' '.join(all_tokens)

In [None]:
from transformers import BertTokenizer, TFBertModel
# ... (Import statements and other code)

# Load the tokenizer and the TensorFlow BERT encoder from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Fine-tuning BERT: mark the encoder's layers as trainable.
# NOTE(review): a later cell assigns `bert_model` again via from_pretrained(),
# so this flag is set on an object that gets replaced - confirm it is still needed.
bert_model.trainable = True
# ... (Rest of your model training code)

In [None]:
def get_bert_embeddings(texts, tokenizer, model, max_len=128):
    """Return one mean-pooled BERT vector per input text.

    The previous body was a placeholder: it read ``outputs`` without ever
    tokenising the texts or calling the model, so any call raised NameError.
    The tokenisation and model call below mirror the CLS-based variant defined
    later in the notebook.

    Args:
        texts: List of strings to embed.
        tokenizer: Callable tokenizer; called with ``padding=True``,
            ``truncation=True``, ``return_tensors="tf"`` and ``max_length``.
            Its result must expose an ``"attention_mask"`` entry.
        model: Callable encoder whose result has a ``last_hidden_state`` with a
            ``.numpy()`` method - assumed shape (batch, tokens, hidden).
        max_len: Maximum number of tokens kept per text.

    Returns:
        numpy.ndarray: Array with one row per text and one column per hidden unit.
    """
    import numpy as np  # local import: this cell can run before the notebook's numpy import

    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="tf", max_length=max_len)
    outputs = model(tokens)

    # Average token embeddings for sentence-level representation.
    # Padding positions are masked out so that short texts are not averaged
    # together with the padding tokens added to match the longest text.
    hidden = outputs.last_hidden_state.numpy()
    mask = np.asarray(tokens["attention_mask"], dtype=hidden.dtype)[:, :, np.newaxis]
    token_counts = np.maximum(mask.sum(axis=1), 1)  # guard against an all-padding row
    return (hidden * mask).sum(axis=1) / token_counts

In [None]:
# @title
# **3. BERT Embeddings**

from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import numpy as np

# Tokenizer and TensorFlow encoder are both loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts, tokenizer, model, max_len=128, batch_size=32):
    """Return the first-token (CLS) embedding of every text as a NumPy array.

    Texts are encoded in chunks of ``batch_size`` so that the whole corpus is
    never pushed through the model in a single forward pass. Each chunk is
    padded to its own longest text and truncated to ``max_len`` tokens.

    Args:
        texts: Sequence of strings to embed.
        tokenizer: Callable tokenizer; called with ``padding=True``,
            ``truncation=True``, ``return_tensors="tf"`` and ``max_length``.
        model: Callable encoder accepting ``training=False`` whose result has a
            ``last_hidden_state`` that supports ``[:, 0, :]`` and ``.numpy()``.
            Its ``config.hidden_size`` is read only when ``texts`` is empty.
        max_len: Maximum number of tokens kept per text.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        numpy.ndarray: One row per text, in input order. For empty input the
        result has shape ``(0, model.config.hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = list(texts)
    if not texts:
        # Nothing to encode; return an empty matrix with the right width.
        return np.zeros((0, model.config.hidden_size), dtype=np.float32)

    chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="tf", max_length=max_len)
        outputs = model(tokens, training=False)  # inference only
        chunks.append(outputs.last_hidden_state[:, 0, :].numpy())  # CLS token
    return np.concatenate(chunks, axis=0)

# Embed the cleaned text of each split (train, validation, test - in that order).
X_train_embeddings, X_val_embeddings, X_test_embeddings = (
    get_bert_embeddings(split['processed_text'].tolist(), tokenizer, bert_model)
    for split in (train_data, val_data, test_data)
)


In [None]:


from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, LSTM, Bidirectional, Flatten, Input
from keras.callbacks import EarlyStopping

# Encode target labels (the encoder is fitted on the training split only)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['Risk_Level'])
y_val = label_encoder.transform(val_data['Risk_Level'])
y_test = label_encoder.transform(test_data['Risk_Level'])

# Reshape for CNN/LSTM input: (samples, features) -> (samples, 1, features)
X_train = X_train_embeddings[:, np.newaxis, :]
X_val = X_val_embeddings[:, np.newaxis, :]
X_test = X_test_embeddings[:, np.newaxis, :]

# Derived from the data rather than hard-coded, so a different encoder width
# does not silently break the model definition; (1, 768) for 768-wide embeddings.
input_shape = X_train.shape[1:]

# Class imbalance weights, computed from the encoded training labels: the most
# frequent class gets 1.0 and every other class gets (largest count / its count).
# With 308 vs 92 samples for classes 0 and 1 this gives {0: 1.0, 1: 308 / 92}.
class_counts = np.bincount(y_train)
class_weights = {cls: float(class_counts.max() / count) for cls, count in enumerate(class_counts)}

# Stop after 3 epochs without val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

def cnn_bilstm_model(input_shape):
    """Build and compile the CNN + BiLSTM binary classifier.

    Layers, in order: an explicit ``Input`` layer, a 1-wide ``Conv1D`` with 64
    ReLU filters, a size-1 ``MaxPooling1D``, a bidirectional ``LSTM`` with 100
    units per direction (dropout and recurrent dropout 0.2), and a single
    sigmoid output unit. The model is compiled with the Adam optimizer,
    binary cross-entropy loss and the accuracy metric.

    The input shape is declared with the ``Input`` layer (already imported in
    this cell) instead of the ``input_shape=`` keyword on the first layer.

    Args:
        input_shape: Per-sample input shape as ``(timesteps, features)``.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential([
        Input(shape=input_shape),
        Conv1D(filters=64, kernel_size=1, activation='relu'),
        MaxPooling1D(pool_size=1),
        Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build the classifier, then fit it with class weighting and early stopping;
# the validation split is what the early-stopping callback monitors.
model = cnn_bilstm_model(input_shape)
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=20,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping],
)


In [None]:
# **5. Evaluation**

from sklearn.metrics import accuracy_score, classification_report

# Score the held-out test split; model outputs are rounded once to 0/1 and the
# rounded array is reused for both metrics.
preds = model.predict(X_test)
rounded_preds = preds.round()

acc = accuracy_score(y_test, rounded_preds)
print(f"\n✅ BERT + CNN-BiLSTM Accuracy: {acc:.4f}")

report = classification_report(y_test, rounded_preds)
print("\nClassification Report:\n", report)


In [None]:
# **6. Save & Predict**

# Persist the trained classifier next to the notebook (HDF5 file, per the .h5 suffix).
model.save(filepath='bert_cnn_bilstm_model.h5')

# Predict new input
def predict_risk_level(text, tokenizer, bert_model, model):
    """Classify a single raw health record.

    The record is cleaned with ``preprocess_text``, embedded with
    ``get_bert_embeddings``, reshaped to ``(1, 1, 768)`` and passed to
    ``model.predict``.

    Args:
        text: Raw record text.
        tokenizer: Tokenizer forwarded to ``get_bert_embeddings``.
        bert_model: Encoder forwarded to ``get_bert_embeddings``.
        model: Trained classifier exposing ``predict``.

    Returns:
        tuple: ``(label, score)`` where ``label`` is "low-risk" or "high-risk"
        and ``score`` is the raw model output for the record.
    """
    cleaned = preprocess_text(text)
    features = get_bert_embeddings([cleaned], tokenizer, bert_model).reshape(1, 1, 768)
    pred = model.predict(features)

    score = pred[0][0]
    class_index = int(pred.round()[0][0])

    # NOTE(review): this mapping is hard-coded; confirm it matches the class
    # order LabelEncoder assigned to the Risk_Level values during training.
    names = {0: "low-risk", 1: "high-risk"}
    return names[class_index], score

# Smoke-test the full inference path on one hand-written record.
example = "Patient is a 60-year-old smoker with high cholesterol and blood pressure."
label, probability = predict_risk_level(
    text=example,
    tokenizer=tokenizer,
    bert_model=bert_model,
    model=model,
)
print(f"Predicted Class: {label} (Probability: {probability:.2f})")
