In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from lime.lime_text import LimeTextExplainer

# Ensure necessary NLTK resources are downloaded.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on NLTK >= 3.8.2
# (it replaced the pickled 'punkt' models); 'punkt' is still fetched so the
# cell keeps working on older NLTK releases.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Load the dataset (semicolon-separated; the rest of the script reads the
# 'Sentence' and 'Label' columns from it).
data = pd.read_csv('../data/Medical_Random_Sentences_Dataset.csv', delimiter=";")  # Update the path to your CSV file

# Preprocess the text
# Lazily-filled cache so the stop-word corpus is read from disk only once.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Lower-case *text*, strip ASCII punctuation and drop English stop words.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower()
    # Remove every character listed in string.punctuation in one pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    # Set membership instead of re-reading and scanning the stop-word list
    # for every single token.
    stop_words = _english_stopwords()
    return ' '.join(word for word in tokens if word not in stop_words)

# Clean every sentence once; the model is trained on this normalised text.
data['Preprocessed'] = data['Sentence'].apply(preprocess_text)
y = data['Label']

# Split the *text* before vectorizing so the TF-IDF vocabulary and IDF weights
# are learned from the training rows only (fitting the vectorizer on the whole
# corpus would leak test-set statistics into training).
text_train, text_test, y_train, y_test = train_test_split(
    data['Preprocessed'], y, test_size=0.3, random_state=42
)

# Vectorize the text: fit on the training split, only transform the test split.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Bundle the already-fitted vectorizer and model so that predictions can be
# made straight from (preprocessed) text, which is what LIME needs.
pipeline = make_pipeline(vectorizer, model)

# Predict and evaluate on the held-out preprocessed sentences.
y_pred = pipeline.predict(text_test)
print(classification_report(y_test, y_pred))


def predict_proba_raw(texts):
    """Return class probabilities for an iterable of *raw* sentences.

    The pipeline was trained on preprocessed text, so each raw sentence
    (including the perturbed variants LIME generates) is passed through
    ``preprocess_text`` before being scored.
    """
    return pipeline.predict_proba([preprocess_text(text) for text in texts])


# Create a LIME Text Explainer.
# NOTE(review): class_names must be in the same order as model.classes_;
# the order below is assumed, confirm it against the 'Label' values.
explainer = LimeTextExplainer(class_names=['medical', 'random'])

# Choose a particular instance to explain
idx = 5  # Positional index of the sentence you want to explain
raw_sentence = data.iloc[idx]['Sentence']
exp = explainer.explain_instance(raw_sentence, predict_proba_raw, num_features=10)

# Show the explanation
print('Document ID: %d' % idx)
print('Probability(random) =', predict_proba_raw([raw_sentence])[0, 1])
print('True class: %s' % data.iloc[idx]['Label'])
exp.show_in_notebook(text=True)

ModuleNotFoundError: No module named 'nltk'