ADASYN and Near Miss Resampling

In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

def sample_data(data, minority_class=1, target_column_name='is_legendary'):
    """
    Resample a DataFrame with ADASYN over-sampling followed by NearMiss under-sampling.

    Object-dtype feature columns are one-hot encoded before resampling, because
    both samplers operate on numeric feature matrices.

    Args:
        data: A Pandas DataFrame containing the feature columns and the target column.
        minority_class: Kept for backward compatibility. It is not used: both
            samplers run with sampling_strategy='auto'. (default: 1)
        target_column_name: Name of the target column in `data`.
            (default: 'is_legendary')

    Returns:
        A Pandas DataFrame holding the resampled rows: the one-hot encoded
        categorical columns first, then the remaining feature columns, and the
        target column last.

    Raises:
        KeyError: If `target_column_name` is not a column of `data`.
    """
    if target_column_name not in data.columns:
        raise KeyError(
            f"Target column '{target_column_name}' not found in data; "
            f"available columns: {list(data.columns)}"
        )

    # Reset both indexes so features and labels share the same 0..n-1 index
    # as the freshly built one-hot frame below.
    X = data.drop(columns=target_column_name).reset_index(drop=True)
    y = data[target_column_name].reset_index(drop=True)

    # Identify categorical/string columns
    categorical_col_names = X.columns[X.dtypes == object].tolist()

    # Only encode when there is something to encode; an all-numeric frame is
    # passed to the samplers unchanged.
    if categorical_col_names:
        encoder = OneHotEncoder(handle_unknown='ignore')
        encoded_values = encoder.fit_transform(X[categorical_col_names]).toarray()
        X_encoded = pd.DataFrame(
            encoded_values,
            columns=encoder.get_feature_names_out(categorical_col_names),
            index=X.index,
        )
        # Encoded columns go first, followed by the untouched numeric columns
        X = pd.concat([X_encoded, X.drop(columns=categorical_col_names)], axis=1)

    # ADASYN synthesises minority samples, then NearMiss-3 trims the majority class
    sampling_pipeline = Pipeline([
        ('adasyn', ADASYN(sampling_strategy='auto', random_state=42)),
        ('nearmiss', NearMiss(version=3, sampling_strategy='auto'))
    ])

    X_res, y_res = sampling_pipeline.fit_resample(X, y)

    # Rebuild the target as a positionally aligned Series so the column-wise
    # concat cannot misalign rows, whatever container the sampler returned.
    target = pd.Series(np.asarray(y_res), name=target_column_name)
    sampled_data = pd.concat([X_res.reset_index(drop=True), target], axis=1)
    return sampled_data

# Example usage: resample the Pokemon dataset and write it next to the notebook.
# The output file name is shared by the write and the success message so the
# two cannot drift apart; the modelling cell reads this same file.
INPUT_CSV = 'pokemon_modified.csv'
OUTPUT_CSV = 'pokemon_resampled.csv'

try:
    data = pd.read_csv(INPUT_CSV)
    print(data.columns)  # Print the column names to identify the target variable column

    # Work on a copy so the loaded frame stays untouched
    sampled_data = sample_data(data.copy())

    # Export the sampled data to a new CSV file
    sampled_data.to_csv(OUTPUT_CSV, index=False)
    print(f"Sampled data saved to '{OUTPUT_CSV}'.")
except FileNotFoundError:
    print("Error: Could not find the data file.")

Index(['against_bug', 'against_dark', 'against_dragon', 'against_electric',
       'against_fairy', 'against_fight', 'against_fire', 'against_flying',
       'against_ghost', 'against_grass', 'against_ground', 'against_ice',
       'against_normal', 'against_poison', 'against_psychic', 'against_rock',
       'against_steel', 'against_water', 'attack', 'base_egg_steps',
       'base_happiness', 'base_total', 'capture_rate', 'defense',
       'experience_growth', 'height_m', 'hp', 'percentage_male',
       'pokedex_number', 'sp_attack', 'sp_defense', 'speed', 'weight_kg',
       'generation', 'is_legendary', 'type1_en', 'type2_en'],
      dtype='object')
Sampled data saved to 'pokemon_resampled_data.csv'.




In [None]:
pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Collecting keras>=3.2.0 (from scikeras)
  Downloading keras-3.3.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>=1.4.2 (from scikeras)
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
Collecting namex (from keras>=3.2.0->scikeras)
  Downloading namex-0.0.8-py3-none-any.whl (5.8 kB)
Collecting optree (from keras>=3.2.0->scikeras)
  Downloading optree-0.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: namex, optree, scikit-learn, keras, scike

Ensemble model of Logistic Regression and LSTM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional, Concatenate, Input, Attention
from keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
from keras.callbacks import LearningRateScheduler

# Define a learning rate schedule function
def lr_schedule(epoch):
    """
    Step learning-rate schedule.

    Returns the base rate of 0.001 while the epoch index is 10 or lower,
    and half of the base rate for every epoch index above 10.
    """
    base_rate = 0.001
    # Single step down: halve the base rate once the epoch index exceeds 10
    return base_rate * 0.5 if epoch > 10 else base_rate

# Create LearningRateScheduler callback
lr_scheduler = LearningRateScheduler(lr_schedule)

# Read the resampled dataset into a pandas dataframe
df_raw = pd.read_csv("pokemon_resampled.csv")

# Handle missing values if any. The imputer returns a plain array, so the
# original column dtypes are restored afterwards; otherwise integer columns
# (including the target) would come back as floats/objects.
imputer = SimpleImputer(strategy='most_frequent')
df_imputed = pd.DataFrame(
    imputer.fit_transform(df_raw), columns=df_raw.columns
).astype(df_raw.dtypes.to_dict())

# Perform one-hot encoding for categorical variables on the IMPUTED frame, so
# the missing-value handling above actually reaches the models.
df = pd.get_dummies(df_imputed)
print(df.head())  # preview only; avoid dumping the entire frame
print(df.columns)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('is_legendary', axis=1),
    df['is_legendary'],
    test_size=0.33,
    random_state=17,
)

# Preprocess structured data using Logistic Regression.
# Any column not listed in one of the two groups below is dropped by the
# ColumnTransformer (its default remainder), so the numeric selector covers all
# numeric widths plus bool (e.g. bool indicator columns from pd.get_dummies).
numeric_features = X_train.select_dtypes(include=['number', 'bool']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the column mean, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' keeps predict() from failing on a category that
# appears in the test split but not in the training split.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor_lr = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define Logistic Regression model (preprocessing + classifier in one pipeline,
# so the scaler/imputer statistics are learned from the training split only)
lr_model = Pipeline(steps=[('preprocessor', preprocessor_lr), ('clf', LogisticRegression())])

# Train Logistic Regression model
lr_model.fit(X_train, y_train)

# Predict hard class labels for the test split
y_pred_lr = lr_model.predict(X_test)

# Preprocess sequential data using LSTM
preprocessor_lstm = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


def _to_lstm_input(features):
    """Turn a 2-D (samples, features) matrix into the 3-D float32 array
    (samples, features, 1) that the LSTM layer consumes."""
    dense = features.toarray() if hasattr(features, "toarray") else np.asarray(features)
    return dense.astype("float32").reshape(dense.shape[0], dense.shape[1], 1)


# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# and batch shuffling are repeatable across runs.
keras.utils.set_random_seed(42)

# Apply the preprocessor (fit on the training split only) so the network sees
# imputed, standardised inputs rather than the raw feature values.
X_train_lstm = _to_lstm_input(preprocessor_lstm.fit_transform(X_train))
X_test_lstm = _to_lstm_input(preprocessor_lstm.transform(X_test))

# Define the LSTM model input shape from the transformed data:
# (number_of_features, 1) -- each feature is treated as one timestep.
input_shape = X_train_lstm.shape[1:]

# Define the LSTM model; the shape is declared with an explicit Input layer
# instead of the `input_shape` layer argument.
lstm_model = Sequential([
    Input(shape=input_shape),
    LSTM(units=64),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid')
])

# Compile the model
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Now, fit the model with the callbacks and other parameters
lstm_model.fit(
    X_train_lstm,
    np.asarray(y_train).astype("float32"),
    epochs=30,
    batch_size=64,
    callbacks=[lr_scheduler],
    verbose=0,
)

# Predict using LSTM model: keep the sigmoid probabilities for the ensemble
# and derive the hard labels from them.
y_prob_lstm = lstm_model.predict(X_test_lstm).reshape(-1)
y_pred_lstm = (y_prob_lstm > 0.5).astype("int32")

# Positive-class probability from the Logistic Regression pipeline
# (column 1 corresponds to the larger class label, i.e. 1 for 0/1 targets).
y_prob_lr = lr_model.predict_proba(X_test)[:, 1]

# Print shapes for debugging
print("Shape of y_test:", y_test.shape)
print("Shape of y_pred_lr:", y_pred_lr.shape)
print("Shape of y_pred_lstm:", y_pred_lstm.shape)

# Concatenate probabilities from LR and LSTM models: one column per model
concatenated_predictions = np.column_stack((y_prob_lr, y_prob_lstm))

# Combine the two models by soft voting with equal weights.
# This replaces two untrained Dense layers ending in a softmax over a single
# unit: that softmax is always exactly 1, so the LR output received weight 1,
# the LSTM output weight 0, and the "ensemble" was just the LR prediction.
ensemble_weights = np.array([0.5, 0.5])
weighted_predictions = concatenated_predictions * ensemble_weights
ensemble_pred = np.sum(weighted_predictions, axis=1)

# Convert predictions to binary
ensemble_pred_binary = (ensemble_pred > 0.5).astype("int32")

# Print shapes for debugging
print("Shape of ensemble_pred:", ensemble_pred.shape)
print("Shape of ensemble_pred_binary:", ensemble_pred_binary.shape)

# Calculate accuracy and additional evaluation metrics of the ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_pred_binary)
roc_auc = roc_auc_score(y_test, ensemble_pred)

# Fix the label order so the matrix is always 2x2 laid out as
# [[TN, FP], [FN, TP]], even if one class is absent from the predictions.
conf_matrix = confusion_matrix(y_test, ensemble_pred_binary, labels=[0, 1])
print("Confusion matrix:\n", conf_matrix)

TN, FP, FN, TP = conf_matrix.ravel()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator / denominator) if denominator else 0.0


sensitivity = _safe_ratio(TP, TP + FN)
specificity = _safe_ratio(TN, TN + FP)

# Positive-class precision / recall / F1 derived directly from the confusion
# matrix, so they do not depend on how classification_report spells the class
# label key or orders its dictionary entries.
precision = _safe_ratio(TP, TP + FP)
recall = sensitivity
f1_score = _safe_ratio(2 * precision * recall, precision + recall)

print(f"Sensitivity: {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")

# Print evaluation metrics
print("Ensemble Accuracy:", ensemble_accuracy)
print("ROC AUC:", roc_auc)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("Classification report:\n", classification_report(y_test, ensemble_pred_binary))

     against_bug  against_dark  against_dragon  against_electric  \
0       1.000000      1.000000        1.000000          1.000000   
1       1.000000      1.000000        1.000000          1.000000   
2       1.000000      1.000000        1.000000          1.000000   
3       2.000000      0.500000        1.000000          1.000000   
4       0.500000      1.000000        1.000000          2.000000   
..           ...           ...             ...               ...   
856     0.500000      0.500000        0.000000          1.873853   
857     0.989650      0.989650        0.326434          1.673566   
858     1.339766      1.339766        0.559844          1.440156   
859     0.557922      0.557922        0.000000          1.884156   
860     0.688348      0.688348        0.000000          1.623305   

     against_fairy  against_fight  against_fire  against_flying  \
0              1.0       2.000000      1.000000        1.000000   
1              1.0       2.000000      2.000000  

  super().__init__(**kwargs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Shape of y_test: (285,)
Shape of y_pred_lr: (285,)
Shape of y_pred_lstm: (285,)
Shape of ensemble_pred: (285,)
Shape of ensemble_pred_binary: (285,)
Confusion matrix:
 [[ 34   2]
 [  3 246]]
Confusion matrix:
 [[ 34   2]
 [  3 246]]
Sensitivity: 0.9880
Specificity: 0.9444
Ensemble Accuracy: 0.9824561403508771
ROC AUC: 0.9991075412762159
Precision: 0.9919354838709677
Recall: 0.9879518072289156
F1 Score: 0.9899396378269618
Classification report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93        36
           1       0.99      0.99      0.99       249

    accuracy                           0.98       285
   macro avg       0.96      0.97      0.96       285
weighted avg       0.98      0.98      0.98       285

AUC: 0.9991075412762159


