<a href="https://colab.research.google.com/github/Bilal-Hijazi/Colab-Code/blob/main/mushrooms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Read the mushroom table and discard every row that contains a missing
# value (imputing them would be the alternative to dropping).
data = pd.read_csv("mushrooms.csv").dropna()

def convert_to_binary(df):
    """Encode every column of ``df`` as a 0/1 integer indicator.

    Columns with more than two distinct values become 1 where the value
    equals the column's most frequent value and 0 elsewhere. Columns with at
    most two distinct values become 1 where the value equals the value in the
    first row and 0 elsewhere, so that encoding depends on row order.

    Args:
        df: DataFrame whose columns should be encoded. It is not modified.

    Returns:
        A new DataFrame with the same index and columns holding 0/1 integers.
        A frame without rows or columns is returned as an unchanged copy.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    encoded = df.copy()

    # With no rows there is no first value to use as a reference (iloc[0]
    # would raise IndexError), and there is nothing to encode anyway.
    if encoded.empty:
        return encoded

    for col in encoded.columns:
        column = encoded[col]

        if column.nunique() > 2:
            # value_counts() sorts by descending frequency, so index[0] is
            # the most frequent value.
            reference = column.value_counts().index[0]
        else:
            # Use the first row's value as the reference.
            reference = column.iloc[0]

        # Vectorised comparison instead of a per-element Python lambda.
        encoded[col] = column.eq(reference).astype(int)

    return encoded

# Encode every column (features and the "class" target) as 0/1 indicators.
data = convert_to_binary(data)

X = data.drop("class", axis=1)
y = data["class"]

# Hold out 30% of the rows for testing. Note that the split happens AFTER the
# binary encoding above, so the encoding references are chosen on the full data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seeded generator so the positive-class subsample (and therefore every
# reported metric) is reproducible, like the seeded split and classifier.
rng = np.random.default_rng(42)

# Positional indices of each class in the training set; these do not depend
# on r, so compute them once instead of on every iteration.
positive_indices = np.where(y_train == 1)[0]
negative_indices = np.where(y_train == 0)[0]

# For each r, create an imbalanced dataset, train, and record precision, recall, and F1 score
# Classifier used: random forest
results = {}
for r in np.arange(0.9, 0, -0.1):
    # Keep a random fraction r of the positive (label 1) training rows
    n_selected = int(r * len(positive_indices))
    selected_positive_indices = rng.permutation(positive_indices)[:n_selected]

    # Imbalanced training set: subsampled positives plus all negatives
    imbalanced_train_indices = np.concatenate((selected_positive_indices, negative_indices))
    X_train_imbalanced = X_train.iloc[imbalanced_train_indices]
    y_train_imbalanced = y_train.iloc[imbalanced_train_indices]

    # Train the Random Forest Classifier
    clf = RandomForestClassifier(random_state=42, max_depth=5, min_samples_split=3)
    clf.fit(X_train_imbalanced, y_train_imbalanced)

    # Evaluate on the untouched test set
    y_pred = clf.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Key by r rounded to one decimal so float noise from arange does not leak into the table
    results[round(r, 1)] = {'Precision': round(precision, 3), 'Recall': round(recall, 3), 'F1 Score': round(f1, 3)}

# Display the results (one column per r)
pd.DataFrame(results)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Load the mushroom CSV, keeping only rows without missing values
# (dropping is a simple choice; imputing is the alternative).
data = pd.read_csv("mushrooms.csv").dropna()

def convert_to_binary(df):
    """Encode every column of ``df`` as a 0/1 integer indicator.

    Columns with more than two distinct values become 1 where the value
    equals the column's most frequent value and 0 elsewhere. Columns with at
    most two distinct values become 1 where the value equals the value in the
    first row and 0 elsewhere, so that encoding depends on row order.

    Args:
        df: DataFrame whose columns should be encoded. It is not modified.

    Returns:
        A new DataFrame with the same index and columns holding 0/1 integers.
        A frame without rows or columns is returned as an unchanged copy.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    encoded = df.copy()

    # With no rows there is no first value to use as a reference (iloc[0]
    # would raise IndexError), and there is nothing to encode anyway.
    if encoded.empty:
        return encoded

    for col in encoded.columns:
        column = encoded[col]

        if column.nunique() > 2:
            # value_counts() sorts by descending frequency, so index[0] is
            # the most frequent value.
            reference = column.value_counts().index[0]
        else:
            # Use the first row's value as the reference.
            reference = column.iloc[0]

        # Vectorised comparison instead of a per-element Python lambda.
        encoded[col] = column.eq(reference).astype(int)

    return encoded

# Encode every column (features and the "class" target) as 0/1 indicators.
data = convert_to_binary(data)

X = data.drop("class", axis=1)
y = data["class"]

# Split the data loaded in THIS cell. Without this the loop below would use
# whatever X_train/X_test/y_train/y_test an earlier cell left behind, or
# fail with NameError in a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Start from an empty table so no other classifier's numbers are carried over.
results = {}

# Seeded generator so the positive-class subsample is reproducible.
rng = np.random.default_rng(42)

# Positional indices of each class in the training set; independent of r.
positive_indices = np.where(y_train == 1)[0]
negative_indices = np.where(y_train == 0)[0]

# For each r, create an imbalanced dataset, train, and record precision, recall, and F1 score
# Classifier used: XGBoost
for r in np.arange(0.9, 0, -0.1):
    # Keep a random fraction r of the positive (label 1) training rows
    n_selected = int(r * len(positive_indices))
    selected_positive_indices = rng.permutation(positive_indices)[:n_selected]

    # Imbalanced training set: subsampled positives plus all negatives
    imbalanced_train_indices = np.concatenate((selected_positive_indices, negative_indices))
    X_train_imbalanced = X_train.iloc[imbalanced_train_indices]
    y_train_imbalanced = y_train.iloc[imbalanced_train_indices]

    # Train the XGB Classifier (seeded like the split and the subsampling)
    xgb = XGBClassifier(random_state=42)
    xgb.fit(X_train_imbalanced, y_train_imbalanced)

    # Predictions and recording precision, recall, and F1 score
    y_pred = xgb.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[round(r, 1)] = {'Precision': round(precision, 3), 'Recall': round(recall, 3), 'F1 Score': round(f1, 3)}

# Display the results (one column per r)
pd.DataFrame(results)

In [None]:
from sklearn import svm
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd


# Load the dataset and drop incomplete rows in one step
# (optional; the missing values could be imputed instead).
data = pd.read_csv("mushrooms.csv").dropna()

def convert_to_binary(df):
    """Encode every column of ``df`` as a 0/1 integer indicator.

    Columns with more than two distinct values become 1 where the value
    equals the column's most frequent value and 0 elsewhere. Columns with at
    most two distinct values become 1 where the value equals the value in the
    first row and 0 elsewhere, so that encoding depends on row order.

    Args:
        df: DataFrame whose columns should be encoded. It is not modified.

    Returns:
        A new DataFrame with the same index and columns holding 0/1 integers.
        A frame without rows or columns is returned as an unchanged copy.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    encoded = df.copy()

    # With no rows there is no first value to use as a reference (iloc[0]
    # would raise IndexError), and there is nothing to encode anyway.
    if encoded.empty:
        return encoded

    for col in encoded.columns:
        column = encoded[col]

        if column.nunique() > 2:
            # value_counts() sorts by descending frequency, so index[0] is
            # the most frequent value.
            reference = column.value_counts().index[0]
        else:
            # Use the first row's value as the reference.
            reference = column.iloc[0]

        # Vectorised comparison instead of a per-element Python lambda.
        encoded[col] = column.eq(reference).astype(int)

    return encoded

# Encode every column (features and the "class" target) as 0/1 indicators.
data = convert_to_binary(data)

X = data.drop("class", axis=1)
y = data["class"]

results = {}

# Splitting the dataset into training and testing sets (30% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seeded generator so the positive-class subsample is reproducible, matching
# the seeded split and classifier.
rng = np.random.default_rng(42)

# Positional indices of each class in the training set; these do not depend
# on r, so compute them once instead of on every iteration.
positive_indices = np.where(y_train == 1)[0]
negative_indices = np.where(y_train == 0)[0]

# For each r, create an imbalanced dataset, train, and record precision, recall, and F1 score
# Classifier used: Support Vector Machine (SVM)
for r in np.arange(0.9, 0, -0.1):
    # Keep a random fraction r of the positive (label 1) training rows
    n_selected = int(r * len(positive_indices))
    selected_positive_indices = rng.permutation(positive_indices)[:n_selected]

    # Imbalanced training set: subsampled positives plus all negatives
    imbalanced_train_indices = np.concatenate((selected_positive_indices, negative_indices))
    X_train_imbalanced = X_train.iloc[imbalanced_train_indices]
    y_train_imbalanced = y_train.iloc[imbalanced_train_indices]

    # Train a linear SVM on the imbalanced data
    svmClassifier = svm.LinearSVC(random_state=42)
    svmClassifier.fit(X_train_imbalanced, y_train_imbalanced)

    # Predictions and recording precision, recall, and F1 score
    y_pred = svmClassifier.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[round(r, 1)] = {'Precision': round(precision, 3), 'Recall': round(recall, 3), 'F1 Score': round(f1, 3)}

# Print the results (one column per r)
print(pd.DataFrame(results))

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd





# Load the dataset and drop rows with missing values, as the other
# experiments do, so every classifier is evaluated on the same rows.
data = pd.read_csv("mushrooms.csv").dropna()


def convert_to_binary(df):
    """Encode every column of ``df`` as a 0/1 integer indicator.

    Columns with more than two distinct values become 1 where the value
    equals the column's most frequent value and 0 elsewhere. Columns with at
    most two distinct values become 1 where the value equals the value in the
    first row and 0 elsewhere, so that encoding depends on row order.

    Args:
        df: DataFrame whose columns should be encoded. It is not modified.

    Returns:
        A new DataFrame with the same index and columns holding 0/1 integers.
        A frame without rows or columns is returned as an unchanged copy.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    encoded = df.copy()

    # With no rows there is no first value to use as a reference (iloc[0]
    # would raise IndexError), and there is nothing to encode anyway.
    if encoded.empty:
        return encoded

    for col in encoded.columns:
        column = encoded[col]

        if column.nunique() > 2:
            # value_counts() sorts by descending frequency, so index[0] is
            # the most frequent value.
            reference = column.value_counts().index[0]
        else:
            # Use the first row's value as the reference.
            reference = column.iloc[0]

        # Vectorised comparison instead of a per-element Python lambda.
        encoded[col] = column.eq(reference).astype(int)

    return encoded

# Encode every column (features and the "class" target) as 0/1 indicators.
data = convert_to_binary(data)

X = data.drop("class", axis=1)
y = data["class"]

results = {}

# Splitting the dataset into training and testing sets (30% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seeded generator so the positive-class subsample is reproducible, matching
# the seeded split and classifier.
rng = np.random.default_rng(42)

# Positional indices of each class in the training set; these do not depend
# on r, so compute them once instead of on every iteration.
positive_indices = np.where(y_train == 1)[0]
negative_indices = np.where(y_train == 0)[0]

# For each r, create an imbalanced dataset, train, and record precision, recall, and F1 score
# Classifier used: multi-layer perceptron (MLPClassifier)
for r in np.arange(0.9, 0, -0.1):
    # Keep a random fraction r of the positive (label 1) training rows
    n_selected = int(r * len(positive_indices))
    selected_positive_indices = rng.permutation(positive_indices)[:n_selected]

    # Imbalanced training set: subsampled positives plus all negatives
    imbalanced_train_indices = np.concatenate((selected_positive_indices, negative_indices))
    X_train_imbalanced = X_train.iloc[imbalanced_train_indices]
    y_train_imbalanced = y_train.iloc[imbalanced_train_indices]

    # Initialize and train the MLPClassifier
    mlp = MLPClassifier(random_state=42, max_iter=500)
    mlp.fit(X_train_imbalanced, y_train_imbalanced)

    # Predictions and recording precision, recall, and F1 score
    y_pred = mlp.predict(X_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[round(r, 1)] = {'Precision': round(precision, 3), 'Recall': round(recall, 3), 'F1 Score': round(f1, 3)}

# Print the results (one column per r)
print(pd.DataFrame(results))


In [None]:
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd






# NOTE: this cell does not load or split the data itself; it reuses
# X_train, X_test, y_train and y_test left behind by an earlier cell.

# Start from an empty table so a partial run cannot leave another
# classifier's numbers mixed into this experiment's output.
results = {}

# Seed both the subsampling and TensorFlow so runs are repeatable.
rng = np.random.default_rng(42)
tf.random.set_seed(42)

# Positional indices of each class in the training set; independent of r.
positive_indices = np.where(y_train == 1)[0]
negative_indices = np.where(y_train == 0)[0]

# For each r, create an imbalanced dataset, train, and record precision, recall, and F1 score
# Classifier used: fully connected Keras network (two hidden layers of 64 units)
for r in np.arange(0.9, 0, -0.1):
    # Keep a random fraction r of the positive (label 1) training rows
    n_selected = int(r * len(positive_indices))
    selected_positive_indices = rng.permutation(positive_indices)[:n_selected]

    # Imbalanced training set: subsampled positives plus all negatives
    imbalanced_train_indices = np.concatenate((selected_positive_indices, negative_indices))
    X_train_imbalanced = X_train.iloc[imbalanced_train_indices]
    y_train_imbalanced = y_train.iloc[imbalanced_train_indices]

    # A fresh model per r so no weights carry over between runs
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_imbalanced.shape[1],)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Train the model
    model.fit(X_train_imbalanced, y_train_imbalanced, epochs=10, batch_size=32, verbose=0)

    # Threshold the sigmoid output at 0.5 and flatten the (n, 1) prediction
    # array to 1-D labels before handing it to the metric functions.
    y_pred = (model.predict(X_test, verbose=0) > 0.5).astype("int32").ravel()
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[round(r, 1)] = {'Precision': round(precision, 3), 'Recall': round(recall, 3), 'F1 Score': round(f1, 3)}

# Print the results (one column per r)
print(pd.DataFrame(results))
