In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Load the dataset
df = pd.read_csv("/Users/zhengfeibian/Desktop/5630final/MyOwnChooseDataSets/Datasets_SYSEN5630FinalProject_Zhengfei/Symptom_Forgetfulness_Datasets/alzheimers_abstracts__symptoms.csv")  # 修改成你实际路径

# Randomly sample 20 abstracts, using a fixed random seed for reproducibility
sample_df = df.sample(n=20, random_state=42)  
sample_texts = sample_df["Abstract"].tolist() # Extract the abstract texts
sample_labels = sample_df["MentionsSymptom"].tolist() # Extract the corresponding labels (0 or 1)

# Split the data into a training set and a temporary set (for dev/test)
from sklearn.model_selection import train_test_split
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df["MentionsSymptom"]) 

X_train = train_df["Abstract"] # Training features (abstract texts)
y_train = train_df["MentionsSymptom"] # Training labels (mention symptom or not)

# Convert the text into TF-IDF features, keeping only the top 5000 frequent words
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)

# Initialize and fit the model, allowing up to 200 iterations for convergence
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_vec, y_train)

# Vectorize the sampled abstracts using the trained TF-IDF vectorizer
X_sample_vec = vectorizer.transform(sample_texts)

# Predict the labels for the sampled abstracts
sample_preds = clf.predict(X_sample_vec)

# Merge the original abstracts, true labels, and predicted labels into a single DataFrame
results_df = pd.DataFrame({
    "Abstract Snippet": sample_texts,
    "True Label (MentionsSymptom)": sample_labels,
    "Logistic Regression Prediction": sample_preds
})

# Display the resulting DataFrame
pd.set_option('display.max_colwidth', 200) 
print(results_df)


                                                                                                                                                                                           Abstract Snippet  \
0   Macrophages accumulate lipid droplets (LDs) under stress and inflammatory conditions. Despite the presence of LD-loaded macrophages in many tissues, including the brain, their contribution to neur...   
1   The endoplasmic reticulum (ER) plays a fundamental role in maintaining cellular homeostasis by ensuring proper protein folding, lipid metabolism, and calcium regulation. However, disruptions to ER...   
2   C-truncating variants in the charged multivesicular body protein 2B (CHMP2B) gene are a rare cause of frontotemporal lobar degeneration (FTLD), previously identified only in Denmark, Belgium, and ...   
3   Sex differences in patterns of cortical thickness and neuropsychiatric symptom (NPS) burden were examined among individuals with Alzheimer's disease (AD) and two copies