In [None]:
pip install opendatasets



In [None]:
!git clone https://github.com/AndrewDiv/FCALC

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import opendatasets as od
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

In [None]:
od.download("https://www.kaggle.com/datasets/rashikrahmanpritom/heart-attack-analysis-prediction-dataset")
od.download("https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset")
od.download("https://www.kaggle.com/datasets/adityakadiwal/water-potability")

In [None]:
# `od.download` above is called without a target directory, so each dataset
# lands in a folder under the current working directory. Addressing the CSVs
# relative to it keeps the notebook runnable outside Colab's '/content' root.
file1 = './heart-attack-analysis-prediction-dataset/heart.csv'
heart_df = pd.read_csv(file1)

file2 = './stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
stroke_df = pd.read_csv(file2)

file3 = './water-potability/water_potability.csv'
water_df = pd.read_csv(file3)

In [None]:
from FCALC import fcalc

# **Preprocessing**

In [None]:
heart_df.head()

In [None]:
heart_df.isnull().sum()

In [None]:
stroke_df.head()

In [None]:
# Drop the 'id' column so it is not used as a feature; errors='ignore' keeps
# this cell re-runnable after the column has already been removed.
stroke_df.drop(columns='id', inplace=True, errors='ignore')

In [None]:
stroke_df.isnull().sum()

In [None]:
# Imputing missing BMI values and encoding the categorical variables in the stroke dataset
stroke_df['bmi'] = stroke_df['bmi'].fillna(stroke_df['bmi'].mean())

# Two-valued string columns become 0/1. The explicit astype(int) pins an integer
# dtype instead of relying on replace() to downcast the object column implicitly
# (that implicit downcast is deprecated in recent pandas releases).
stroke_df['ever_married'] = stroke_df['ever_married'].replace({'No': 0, 'Yes': 1}).astype(int)
stroke_df['Residence_type'] = stroke_df['Residence_type'].replace({'Rural': 0, 'Urban': 1}).astype(int)

# One-hot encode the remaining multi-valued categorical columns
stroke_df = pd.get_dummies(stroke_df, columns=['gender', 'work_type', 'smoking_status'])

In [None]:
# Remove the 'stroke' target column from the frame and keep it aside for re-insertion
strk = stroke_df.pop('stroke')

In [None]:
# Re-attach the target as the last column. The position is taken from the current
# column count rather than a hard-coded index, so it stays valid if the number of
# one-hot columns produced earlier ever changes.
stroke_df.insert(len(stroke_df.columns), 'stroke', strk)

In [None]:
water_df.head()

In [None]:
water_df.isnull().sum()

In [None]:
# Filling the missing values in the water dataset with the mean of each affected column
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    water_df[column] = water_df[column].fillna(water_df[column].mean())

# **Decision Tree**

In [None]:
def tune_decision_tree(dataset, target_column_name):
    """Grid-search a decision tree on ``dataset`` and print its scores.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus the target column.
    target_column_name : str
        Name of the column in ``dataset`` that holds the class labels.

    Returns
    -------
    None
        The best hyper-parameters, the mean 5-fold cross-validation accuracy
        and weighted F1 on the training split, and the accuracy and weighted
        F1 on the held-out 20% test split are printed.
    """
    # Split the data into features (x) and the target variable (y)
    x = dataset.drop(columns=[target_column_name])
    y = dataset[target_column_name]

    # Split the data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # Seed the tree so that the selected parameters and the reported scores
    # are the same on every run (same seed as the train/test split)
    classifier = DecisionTreeClassifier(random_state=42)

    # Define a parameter grid for tuning
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Create a GridSearchCV object with 5-fold cross-validation
    grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')

    # Fit the GridSearchCV object to the training data
    grid_search.fit(x_train, y_train)

    # Get the best parameters and estimator from the grid search.
    # GridSearchCV refits the best configuration on the whole training split
    # by default, so best_estimator_ is ready for prediction without another fit.
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Assess the model using 5-fold cross-validation and print accuracy and F1 scores
    accuracy_scores = cross_val_score(best_estimator, x_train, y_train, cv=5, scoring='accuracy')
    f1_scores = cross_val_score(best_estimator, x_train, y_train, cv=5, scoring='f1_weighted')

    print("Best Parameters:", best_params)
    print("Cross-Validation Accuracy:", round(np.mean(accuracy_scores),3))
    print("Cross-Validation F1 Score:", round(np.mean(f1_scores),3))

    # Evaluate the model on the test set
    y_pred = best_estimator.predict(x_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    test_f1_score = f1_score(y_test, y_pred, average='weighted')

    print("Test Set Accuracy:", round(test_accuracy,3))
    print("Test Set F1 Score:", round(test_f1_score,3))

In [None]:
tune_decision_tree(heart_df, 'output')

In [None]:
tune_decision_tree(stroke_df, 'stroke')

In [None]:
tune_decision_tree(water_df, 'Potability')

# **Random Forest**

In [None]:
def tune_random_forest(dataset, target_column_name):
    """Grid-search a random forest on ``dataset`` and print its scores.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus the target column.
    target_column_name : str
        Name of the column in ``dataset`` that holds the class labels.

    Returns
    -------
    None
        The best hyper-parameters, the mean 5-fold cross-validation accuracy
        and weighted F1 on the training split, and the accuracy and weighted
        F1 on the held-out 20% test split are printed.
    """
    # Split the data into features (x) and the target variable (y)
    x = dataset.drop(columns=[target_column_name])
    y = dataset[target_column_name]

    # Split the data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # Seed the forest: bootstrapping and feature sub-sampling are random, so an
    # unseeded forest reports different parameters and scores on every run
    classifier = RandomForestClassifier(random_state=42)

    # Define a parameter grid for tuning
    param_grid = {
        'n_estimators': [25, 50, 100],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Create a GridSearchCV object with 5-fold cross-validation
    grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')

    # Fit the GridSearchCV object to the training data
    grid_search.fit(x_train, y_train)

    # Get the best parameters and estimator from the grid search.
    # GridSearchCV refits the best configuration on the whole training split
    # by default, so best_estimator_ is ready for prediction without another fit.
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Assess the model using 5-fold cross-validation and print accuracy and F1 scores
    accuracy_scores = cross_val_score(best_estimator, x_train, y_train, cv=5, scoring='accuracy')
    f1_scores = cross_val_score(best_estimator, x_train, y_train, cv=5, scoring='f1_weighted')

    print("Best Parameters:", best_params)
    print("Cross-Validation Accuracy:", round(np.mean(accuracy_scores),3))
    print("Cross-Validation F1 Score:", round(np.mean(f1_scores),3))

    # Evaluate the model on the test set
    y_pred = best_estimator.predict(x_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    test_f1_score = f1_score(y_test, y_pred, average='weighted')

    print("Test Set Accuracy:", round(test_accuracy,3))
    print("Test Set F1 Score:", round(test_f1_score,3))

In [None]:
tune_random_forest(heart_df, 'output')

In [None]:
tune_random_forest(stroke_df, 'stroke')

In [None]:
tune_random_forest(water_df, 'Potability')

# **Logistic Regression**

In [None]:
def tune_logistic_regression(dataset, target_column_name):
    """Grid-search a logistic regression on standardized features and print its scores.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus the target column.
    target_column_name : str
        Name of the column in ``dataset`` that holds the class labels.

    Returns
    -------
    None
        The best hyper-parameters, the mean 5-fold cross-validation accuracy
        and weighted F1 on the training split, and the accuracy and weighted
        F1 on the held-out 20% test split are printed.
    """
    # Split the data into features (x) and the target variable (y)
    x = dataset.drop(columns=[target_column_name])
    y = dataset[target_column_name]

    # Split the data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # Scaling the data: the scaler learns mean/variance from the training split
    # only and the test split is transformed with those same statistics.
    # Re-fitting on the test split would put the two splits on different scales
    # and leak test-set statistics into the evaluation.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Create a LogisticRegression classifier (seeded, since some of the solvers
    # in the grid shuffle the data)
    classifier = LogisticRegression(random_state=42)

    # Define a parameter grid for tuning.
    # NOTE(review): not every penalty/solver pair in this grid is a supported
    # combination; unsupported pairs are expected to fail to fit and be scored
    # as NaN by GridSearchCV rather than be selected - confirm against the
    # installed scikit-learn version.
    param_grid = {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': [500, 750, 1000]
    }

    # Create a GridSearchCV object with 5-fold cross-validation
    grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')

    # Fit the GridSearchCV object to the training data
    grid_search.fit(x_train, y_train)

    # Get the best parameters and estimator from the grid search
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Assess the model using 5-fold cross-validation and print accuracy and F1 scores
    accuracy_scores = cross_val_score(best_estimator, x_train, y_train, cv=5, scoring='accuracy')
    f1_scores = cross_val_score(best_estimator, x_train, y_train, cv=5, scoring='f1_weighted')

    print("Best Parameters:", best_params)
    print("Cross-Validation Accuracy:", round(np.mean(accuracy_scores),3))
    print("Cross-Validation F1 Score:", round(np.mean(f1_scores),3))

    # Fit the best model on the entire training set
    best_estimator.fit(x_train, y_train)

    # Evaluate the model on the test set
    y_pred = best_estimator.predict(x_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    test_f1_score = f1_score(y_test, y_pred, average='weighted')

    print("Test Set Accuracy:", round(test_accuracy,3))
    print("Test Set F1 Score:", round(test_f1_score,3))

In [None]:
tune_logistic_regression(heart_df, 'output')

In [None]:
tune_logistic_regression(stroke_df, 'stroke')

In [None]:
tune_logistic_regression(water_df, 'Potability')

# **k-NN**

In [None]:
def tune_knn(dataset, target_column_name, scaling=True, k_values=[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]):
    """Grid-search a k-nearest-neighbours classifier and print its scores.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature columns plus the target column.
    target_column_name : str
        Name of the column in ``dataset`` that holds the class labels.
    scaling : bool, default True
        When true, features are standardized with a scaler fitted on the
        training split; when false, the raw feature values are used.
    k_values : sequence of int
        Candidate values for ``n_neighbors``. The default list is only read,
        never modified.

    Returns
    -------
    None
        The best hyper-parameters, the mean 5-fold cross-validation accuracy
        and weighted F1 on the training split, and the accuracy and weighted
        F1 on the held-out 20% test split are printed.
    """
    # Split the data into features (x) and the target variable (y)
    x = dataset.drop(columns=[target_column_name])
    y = dataset[target_column_name]

    # Split the data into training and testing sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # Scaling the data (only when requested). The scaler is fitted on the
    # training split alone and then applied unchanged to the test split, so
    # both splits share one scale and no test statistics leak into the model.
    if scaling:
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

    # Create a KNeighborsClassifier
    classifier = KNeighborsClassifier()

    # Define a parameter grid for tuning
    param_grid = {
        'n_neighbors': list(k_values),  # Try different k values (copied so the caller's sequence is untouched)
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
    }

    # Create a GridSearchCV object with 5-fold cross-validation
    grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')

    # Fit the GridSearchCV object to the training data
    grid_search.fit(x_train, y_train)

    # Get the best parameters and estimator from the grid search
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Assess the model using 5-fold cross-validation and print accuracy and F1 scores
    accuracy_scores = cross_val_score(best_estimator, x_train, y_train, cv=5, scoring='accuracy')
    f1_scores = cross_val_score(best_estimator, x_train, y_train, cv=5, scoring='f1_weighted')

    print("Best Parameters:", best_params)
    print("Cross-Validation Accuracy:", round(np.mean(accuracy_scores),3))
    print("Cross-Validation F1 Score:", round(np.mean(f1_scores),3))

    # Fit the best model on the entire training set
    best_estimator.fit(x_train, y_train)

    # Evaluate the model on the test set
    y_pred = best_estimator.predict(x_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    test_f1_score = f1_score(y_test, y_pred, average='weighted')

    print("Test Set Accuracy:", round(test_accuracy,3))
    print("Test Set F1 Score:", round(test_f1_score,3))

In [None]:
tune_knn(heart_df, 'output')

In [None]:
tune_knn(stroke_df, 'stroke')

In [None]:
tune_knn(water_df, 'Potability')

# **FCA Preprocessing**

In [None]:
# Creating a copy of the dataset for FCA binarization
heart_df_fca = heart_df.copy()

In [None]:
# Every feature below is binarized from the untouched heart_df and written into
# heart_df_fca. Reading the source values from heart_df (instead of from the
# heart_df_fca columns being overwritten) makes this cell safe to re-run: a
# second run no longer re-binarizes values that are already 0/1.

# Binarizing age based on position around the mean (1 = at or above the mean)
heart_df_fca['age'] = (heart_df['age'] >= heart_df['age'].mean()).astype(int)

# Binarizing chest pain type (cp): 1 for type 0, 0 for types 1, 2, 3
heart_df_fca['cp'] = heart_df['cp'].isin([0]).astype(int)

# Binarizing resting blood pressure (trtbps): 1 when above 120 mm Hg
heart_df_fca['trtbps'] = (heart_df['trtbps'] > 120).astype(int)

# Binarizing cholesterol (chol): 1 when at or above 200 mg/dL
heart_df_fca['chol'] = (heart_df['chol'] >= 200).astype(int)

# Binarizing resting electrocardiographic results (restecg): 1 for values 0 or 2, 0 otherwise
heart_df_fca['restecg'] = heart_df['restecg'].isin([0, 2]).astype(int)

# Binarizing maximum heart rate achieved (thalachh): 1 when above the theoretical maximum (220 - age)
heart_df_fca['thalachh'] = (heart_df['thalachh'] > (220 - heart_df['age'])).astype(int)

# Binarizing oldpeak based on position around the mean (1 = at or above the mean)
heart_df_fca['oldpeak'] = (heart_df['oldpeak'] >= heart_df['oldpeak'].mean()).astype(int)

# Binarizing slope (slp): 0 for values 1 or 2, 1 otherwise
heart_df_fca['slp'] = (~heart_df['slp'].isin([1, 2])).astype(int)

# Binarizing number of major vessels (caa): 0 for values 1, 2, 3 and 1 otherwise
# NOTE(review): any value outside 1-3 (not only 0) is encoded as 1 - confirm this is intended
heart_df_fca['caa'] = (~heart_df['caa'].isin([1, 2, 3])).astype(int)

# Binarizing thalassemia (thall): 1 for values 0, 1, 3 and 0 for value 2
heart_df_fca['thall'] = heart_df['thall'].isin([0, 1, 3]).astype(int)

In [None]:
heart_df_fca.head()

In [None]:
# Creating a copy of the dataset for FCA binarization
stroke_df_fca = stroke_df.copy()

In [None]:
# The thresholds are applied to the original stroke_df values and the result is
# stored in stroke_df_fca, so re-running this cell does not re-binarize columns
# that are already 0/1.

# Binarizing age based on position around the mean (1 = at or above the mean)
stroke_df_fca['age'] = (stroke_df['age'] >= stroke_df['age'].mean()).astype(int)

# Binarizing average glucose level based on position around the mean (1 = above the mean)
stroke_df_fca['avg_glucose_level'] = (stroke_df['avg_glucose_level'] > stroke_df['avg_glucose_level'].mean()).astype(int)

# Binarizing bmi around the max normal limit 24.9 (1 = above the limit)
stroke_df_fca['bmi'] = (stroke_df['bmi'] > 24.9).astype(int)

In [None]:
stroke_df_fca.head()

In [None]:
# Creating a copy of the dataset for FCA binarization
water_df_fca = water_df.copy()

In [None]:
# All thresholds are applied to the original water_df values and the result is
# stored in water_df_fca, so re-running this cell does not re-binarize columns
# that are already 0/1.

# Binarizing pH based on pH range of drinking water between 6.5 and 8.5 (1 = strictly inside the range)
water_df_fca['ph'] = ((water_df['ph'] > 6.5) & (water_df['ph'] < 8.5)).astype(int)

# Binarizing the remaining measurements based on position around the mean (1 = below the column mean)
for feature in ['Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
                'Organic_carbon', 'Trihalomethanes', 'Turbidity']:
    water_df_fca[feature] = (water_df[feature] < water_df[feature].mean()).astype(int)

In [None]:
water_df_fca.head()

# **FCA Lazy Binary Classification**

In [None]:
def tune_LazyBinaryClassifier(dataset, target_column_name, method='standard'):
  """Cross-validate and test FCALC's BinarizedBinaryClassifier on a binarized dataset.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Binarized feature columns plus the target column.
  target_column_name : str
      Name of the column in ``dataset`` that holds the class labels.
  method : str, default 'standard'
      Forwarded unchanged to ``BinarizedBinaryClassifier``.

  Returns
  -------
  None
      The mean accuracy and weighted F1 over 5 stratified folds of the
      training split, and the accuracy and weighted F1 on the held-out 20%
      test split, are printed.
  """
  # Define the number of folds for cross-validation
  num_folds = 5
  kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

  # Split the data into features (x) and the target variable (y)
  x = dataset.drop(columns=[target_column_name])
  y = dataset[target_column_name]

  # Split the data into training and testing sets
  x_tr, x_ts, y_tr, y_ts = train_test_split(x, y, test_size=0.2, random_state=42)

  # Initialize lists to store results
  accuracies = []
  weighted_f1_scores = []

  # Perform k-fold cross-validation
  for train_index, val_index in kf.split(x_tr, y_tr):
    x_train, x_val = x_tr.iloc[train_index], x_tr.iloc[val_index]
    y_train, y_val = y_tr.iloc[train_index], y_tr.iloc[val_index]

    # Build the classifier from this fold's training part
    bin_cls = fcalc.classifier.BinarizedBinaryClassifier(x_train.values, y_train.values, method=method)

    # Predict on the validation part; the results are read from `predictions`
    bin_cls.predict(x_val.values)

    # Record accuracy and weighted F1 score for this fold
    accuracies.append(accuracy_score(y_val, bin_cls.predictions))
    weighted_f1_scores.append(f1_score(y_val, bin_cls.predictions, average='weighted'))

  # Calculate and print average metrics over all folds
  average_accuracy = sum(accuracies) / num_folds
  average_weighted_f1 = sum(weighted_f1_scores) / num_folds

  print("Cross-Validation Accuracy:", round(average_accuracy, 3))
  print("Cross-Validation F1 Score:", round(average_weighted_f1, 3))

  # Build a fresh classifier from the whole training split for the hold-out
  # evaluation. Re-using the classifier left over from the last fold would
  # score the test set with a model that saw only 4/5 of the training data and
  # would call predict a second time on an already-used object
  # (NOTE(review): confirm whether fcalc keeps state between predict calls).
  bin_cls = fcalc.classifier.BinarizedBinaryClassifier(x_tr.values, y_tr.values, method=method)

  # Predict on a separate test set
  bin_cls.predict(x_ts.values)

  # Evaluate the model on the test set
  test_accuracy = accuracy_score(y_ts, bin_cls.predictions)
  test_f1_score = f1_score(y_ts, bin_cls.predictions, average='weighted')

  print("Test Set Accuracy:", round(test_accuracy, 3))
  print("Test Set F1 Score:", round(test_f1_score, 3))

In [None]:
tune_LazyBinaryClassifier(heart_df_fca, 'output')

In [None]:
tune_LazyBinaryClassifier(stroke_df_fca.iloc[0:2000], 'stroke', method='ratio-support')

In [None]:
tune_LazyBinaryClassifier(water_df_fca.iloc[500:4000], 'Potability', method='ratio-support')

# **FCA Pattern Classifier**

In [None]:
def tune_LazyPatternClassifier(dataset, target_column_name, cat_list=None, method='standard'):
  """Cross-validate and test FCALC's PatternBinaryClassifier on ``dataset``.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Feature columns plus the target column.
  target_column_name : str
      Name of the column in ``dataset`` that holds the class labels.
  cat_list : array-like or None, default None
      Forwarded unchanged as ``categorical`` to ``PatternBinaryClassifier``.
  method : str, default 'standard'
      Forwarded unchanged to ``PatternBinaryClassifier``.

  Returns
  -------
  None
      The mean accuracy and weighted F1 over 5 stratified folds of the
      training split, and the accuracy and weighted F1 of a classifier built
      on the whole training split and applied to the 20% test split, are printed.
  """
  def _fit_and_score(fit_x, fit_y, eval_x, eval_y):
    # Build a classifier from (fit_x, fit_y), predict eval_x and score against eval_y
    model = fcalc.classifier.PatternBinaryClassifier(fit_x.values, fit_y.to_numpy(), categorical=cat_list, method=method)
    model.predict(eval_x.values)
    return (
      accuracy_score(eval_y, model.predictions),
      f1_score(eval_y, model.predictions, average='weighted'),
    )

  # Separate the predictors from the label column
  features = dataset.drop(columns=[target_column_name])
  labels = dataset[target_column_name]

  # Hold out 20% of the rows for the final evaluation
  x_tr, x_ts, y_tr, y_ts = train_test_split(features, labels, test_size=0.2, random_state=42)

  # Stratified, shuffled 5-fold cross-validation over the training split
  n_splits = 5
  folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
  fold_scores = [
    _fit_and_score(x_tr.iloc[fit_idx], y_tr.iloc[fit_idx], x_tr.iloc[val_idx], y_tr.iloc[val_idx])
    for fit_idx, val_idx in folds.split(x_tr, y_tr)
  ]

  # Average each metric over the folds
  cv_accuracy = sum(acc for acc, _ in fold_scores) / n_splits
  cv_weighted_f1 = sum(f1 for _, f1 in fold_scores) / n_splits

  print("Cross-Validation Accuracy:", round(cv_accuracy, 3))
  print("Cross-Validation F1 Score:", round(cv_weighted_f1, 3))

  # Final evaluation: classifier built on the full training split, scored on the test split
  holdout_accuracy, holdout_f1 = _fit_and_score(x_tr, y_tr, x_ts, y_ts)

  print("Test Set Accuracy:", round(holdout_accuracy, 3))
  print("Test Set F1 Score:", round(holdout_f1, 3))

In [None]:
tune_LazyPatternClassifier(heart_df, 'output', np.array([1,2,5,6,8,10,11,12]), method='ratio-support')

In [None]:
tune_LazyPatternClassifier(stroke_df.iloc[0:1000], 'stroke', np.array([1,2,3,4,7,8,9,10,11,12,13,14,15,16,17,18]), method='standard-support')

In [None]:
tune_LazyPatternClassifier(water_df.iloc[500:1500], 'Potability', method='standard')