In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.preprocessing import LabelEncoder
from scipy.stats import shapiro
from statsmodels.graphics.gofplots import qqplot
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import precision_recall_curve, auc, accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings

# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# --- Synthetic Alzheimer's-style dataset -------------------------------------
# Builds a balanced, fully synthetic table with one row per subject, writes it
# to 'alzheimer_dataset.csv' and previews the first rows.
#
# NOTE(review): SES, MMSE, CDR and nWBV are drawn from ranges that do not
# overlap between the two groups, so each of them alone perfectly separates
# 'Demented' from 'Nondemented'. Any classifier trained on this table will
# look perfect; that reflects how the data is generated, not model quality.

# Fixed seed so that Restart & Run All regenerates the exact same dataset.
SEED = 42
rng = np.random.default_rng(SEED)

# Number of samples in each group
num_demented = 2250
num_nondemented = 2250
n_total = num_demented + num_nondemented

# Rows are ordered: all Demented subjects first, then all Nondemented ones.
# Every per-group column below is concatenated in that same order.
group = ['Demented'] * num_demented + ['Nondemented'] * num_nondemented

# Group-independent columns (upper bounds of integer draws are exclusive).
gender = rng.choice(['M', 'F'], size=n_total)
age = rng.integers(30, 90, size=n_total)        # 30-89
education = rng.integers(6, 20, size=n_total)   # 6-19

# SES values based on group
ses_demented = rng.integers(1, 7, size=num_demented)         # 1-6
ses_nondemented = rng.integers(7, 11, size=num_nondemented)  # 7-10

# MMSE values based on group
mmse_demented = rng.integers(17, 25, size=num_demented)        # 17-24
mmse_nondemented = rng.integers(26, 31, size=num_nondemented)  # 26-30
mmse = np.concatenate((mmse_demented, mmse_nondemented))

# CDR values based on group; concatenating with the 0 / 0.5 values makes the
# combined column floating point.
cdr_demented = rng.choice([1, 2, 3], size=num_demented)
cdr_nondemented = rng.choice([0, 0.5], size=num_nondemented)
cdr = np.concatenate((cdr_demented, cdr_nondemented))

eTIV = rng.integers(1000, 2000, size=n_total)   # 1000-1999

# nWBV values based on group
nwbv_demented = rng.uniform(0.4, 0.6, size=num_demented)
nwbv_nondemented = rng.uniform(0.7, 0.9, size=num_nondemented)
nwbv = np.concatenate((nwbv_demented, nwbv_nondemented))

asf = rng.uniform(0.5, 2.0, size=n_total)

# Create a DataFrame
data = pd.DataFrame({
    'Group': group,
    'M/F': gender,
    'Age': age,
    'EDUC': education,
    'SES': np.concatenate((ses_demented, ses_nondemented)),
    'MMSE': mmse,
    'CDR': cdr,
    'eTIV': eTIV,
    'nWBV': nwbv,
    'ASF': asf
})

# Cheap invariants: one row per subject, groups in the expected order.
assert len(data) == n_total
assert (data['Group'].iloc[:num_demented] == 'Demented').all()
assert (data['Group'].iloc[num_demented:] == 'Nondemented').all()

# Save the DataFrame to a CSV file (row index is not written).
data.to_csv('alzheimer_dataset.csv', index=False)
data.head()

Unnamed: 0,Group,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,Demented,M,34,12,2,23,1.0,1252,0.431528,1.284409
1,Demented,F,38,6,5,19,1.0,1199,0.562259,1.1899
2,Demented,M,79,11,2,18,1.0,1833,0.577515,0.681508
3,Demented,M,46,16,2,18,3.0,1149,0.425874,1.268671
4,Demented,M,64,6,6,19,1.0,1307,0.468773,0.898102
