In [5]:
import pandas as pd
import numpy as np
import re
import json
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources used by the preprocessing step below.
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
# JSON is UTF-8 by specification, so the encoding is passed explicitly rather
# than left to the platform's locale default (which would mis-decode any
# non-ASCII characters in the notes on some systems).
with open("synthetic_dataset_with_notes.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# NOTE(review): presumably a list of records carrying at least 'note' and
# 'label' fields, since those are the columns read further down -- confirm
# against the dataset file.
df = pd.DataFrame(data)

# Preprocessing
# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize a free-text note into a space-separated string of lemmas.

    The text is lower-cased, every non-word character is replaced by a
    space, English stop words are dropped and each remaining token is
    lemmatized with the module-level ``lemmatizer``.

    Args:
        text: The raw note. Non-string values (for example ``None`` or the
            ``NaN`` pandas uses for a missing field) are treated as an
            empty note instead of raising ``AttributeError``.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        is left.
    """
    if not isinstance(text, str):
        return ''
    # Whitespace is itself matched by \W, so after this substitution the only
    # separators left are spaces; str.split() collapses runs of them, which
    # makes a separate whitespace-squeezing pass unnecessary.
    words = re.sub(r'\W', ' ', text.lower()).split()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

# The vocabulary size and sequence length are needed by the tokenizer, the
# padding step and the Embedding layer; naming them once keeps all three in
# agreement when either value is tuned.
MAX_VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 100

df['processed_note'] = df['note'].apply(preprocess_text)

# Tokenization and padding
# NOTE(review): the tokenizer is fitted on every note before the train/test
# split below, so its vocabulary also reflects the held-out notes.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(df['processed_note'])
sequences = tokenizer.texts_to_sequences(df['processed_note'])
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
y = df['label'].values

# Splitting the dataset
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Definition
# Embedding -> LSTM -> single sigmoid unit, i.e. a binary classifier.
model = Sequential([
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=128, input_length=MAX_SEQUENCE_LENGTH),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile with binary cross-entropy, the Adam optimizer and accuracy tracking.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Callbacks
# Monitor validation loss with a patience of 3 epochs and restore the best
# weights when training is stopped early.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Training
# validation_split holds out 20% of the training data for validation.
fit_options = {
    'batch_size': 64,
    'epochs': 20,
    'validation_split': 0.2,
    'callbacks': [early_stopping],
}
history = model.fit(X_train, y_train, **fit_options)

# Evaluation
# Turn the predicted probabilities into hard 0/1 labels at a 0.5 threshold.
# The array is flattened first and converted back to a plain list of ints.
predictions = (model.predict(X_test).ravel() > 0.5).astype(int).tolist()

accuracy = accuracy_score(y_test, predictions)
# average='binary' reports precision/recall/F1 for the positive class only;
# the fourth return value (support) is not used.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test,
    predictions,
    average='binary',
)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')

# joblib is used below but is not among this file's imports; import it here so
# saving does not fail with a NameError once training has already finished.
import joblib

# Save the model
# NOTE(review): this serializes a Keras model through joblib; Keras' own
# model.save() is the documented persistence route -- consider switching once
# the code that loads this file can change with it.
joblib.dump(model, 'autism_classifier.joblib')

# Save the fitted Keras Tokenizer. The file name says "tfidf_vectorizer", but
# the object written is the Tokenizer, not a TF-IDF vectorizer; the name is
# kept so existing loaders keep working.
joblib.dump(tokenizer, 'tfidf_vectorizer.joblib')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayesharahman1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ayesharahman1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Accuracy: 0.9980
Precision: 1.0000
Recall: 0.9959
F1-Score: 0.9980
INFO:tensorflow:Assets written to: ram://ba28d2ba-00d9-41b9-ab16-9fe483d9090c/assets


['tfidf_vectorizer.joblib']