In [None]:
!pip install tensorflow scikit-learn
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Fixed sequence length used for every description; tune to the corpus
maxlen = 100

# Inputs: the description text column and the MI_Incident target column
X = df['Processed_Description'].values
y = df['MI_Incident'].values

# Learn the word index from the corpus; num_words caps how many of the most
# frequent words are kept when texts are encoded (adjust to the vocabulary size)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)

# Encode each description as a list of integer word ids, then pad at the end
# (padding='post') so that all rows share the same length `maxlen`
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=maxlen, padding='post')


In [None]:
# Prepare word embeddings
# Option 1: pre-trained GloVe vectors
# Download glove.6B.100d.txt from https://nlp.stanford.edu/projects/glove/
embedding_dim = 100  # must equal the number of coefficients per line in the file

# Parse the file into {word: vector}; each line is a word followed by its
# coefficients, separated by whitespace.
# The encoding is given explicitly: the file contains non-ASCII tokens, and the
# platform default encoding (e.g. cp1252 on Windows) can fail to decode them.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# One row per tokenizer word id. The 5000 bound matches the num_words passed to
# the Tokenizer; ids without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((5000, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < 5000:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            

In [None]:
# Option 2: trainable embeddings (no pre-trained vectors)
# None marks "no pre-trained weights"; running this cell replaces any
# embedding_matrix that was built earlier in the session.
embedding_matrix = None
embedding_dim = 100  # width of each embedding vector

In [None]:
# Build and compile the model

# Pass pre-trained weights only when a matrix exists. With trainable embeddings
# (embedding_matrix is None) the previous `weights=[None]` was not a valid
# weight list, so the argument is omitted and Keras initialises the layer itself.
use_pretrained = embedding_matrix is not None
embedding_kwargs = {'weights': [embedding_matrix]} if use_pretrained else {}

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=embedding_dim,
                    input_length=maxlen,
                    # Freeze pre-trained vectors; learn the embeddings otherwise
                    trainable=not use_pretrained,
                    **embedding_kwargs))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: one probability per sample for the binary target
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:

# Hold out 20% of the samples as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit for 5 epochs in batches of 32, keeping 20% of the training data for validation
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

In [None]:
# Evaluate on the held-out test set
# Threshold the model's outputs at 0.5 to turn them into 0/1 class labels
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Share of test samples whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
# Persist the trained model to disk in the current working directory
model.save(filepath='mi_classification_model.h5')