In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import joblib

# Load your dataset
df = pd.read_csv('drugs.csv', nrows=200000)

# Drop rows with any missing values
df.dropna(inplace=True)

# Ensure ratings are numeric and filter out invalid entries
df = df[pd.to_numeric(df['Rating'], errors='coerce').notnull()]
df.loc[:, 'Rating'] = df['Rating'].astype(float)

# Convert text data into TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Content']).toarray()

# Target variable
y = df['Rating'].values

# Transform ratings to categorical labels for classification
y = np.where(y >= 6, 1, 0)  # Example: ratings >= 6 are considered positive (1), others are negative (0)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load pre-trained Keras model
mlp_model = load_model('mlp_model.h5')

# Wrap Keras model for use in scikit-learn
mlp_wrapper = KerasClassifier(model=mlp_model)

# Load pre-trained Naive Bayes model
nb_model = joblib.load('naive_bayes_model.joblib')

# Define meta-model with hyperparameters for tuning
param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'solver': ['lbfgs', 'liblinear']  # Optimization algorithm
}

meta_model = LogisticRegression()

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(estimator=meta_model, param_grid=param_grid, cv=3, verbose=2)
grid_search.fit(X_train, y_train)

# Get best parameters
best_params = grid_search.best_params_

# Display best parameters
print("Best Parameters:", best_params)

# Update meta-model with best parameters
meta_model.set_params(**best_params)

# Create stacking ensemble
estimators = [('mlp', mlp_wrapper), ('nb', nb_model)]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=meta_model)

# Train stacking model
stacking_model.fit(X_train, y_train)

# Save the stacking model
joblib.dump(stacking_model, 'stacking_model.joblib')

# Predict on the test set with the stacking model
y_pred_stacking = stacking_model.predict(X_test)

# Calculate metrics for the stacking model
conf_matrix = confusion_matrix(y_test, y_pred_stacking)
accuracy = accuracy_score(y_test, y_pred_stacking)
precision = precision_score(y_test, y_pred_stacking)
recall = recall_score(y_test, y_pred_stacking)
f1 = f1_score(y_test, y_pred_stacking)

# Display metrics
print("Confusion Matrix (Stacking):")
print(conf_matrix)
print("\nMetrics (Stacking):"   )
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)




Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] END ................................C=0.1, solver=lbfgs; total time=  24.6s
[CV] END ................................C=0.1, solver=lbfgs; total time=  25.5s
[CV] END ................................C=0.1, solver=lbfgs; total time=  26.0s
[CV] END ............................C=0.1, solver=liblinear; total time=  11.3s
[CV] END ............................C=0.1, solver=liblinear; total time=   6.5s
[CV] END ............................C=0.1, solver=liblinear; total time=   5.6s
[CV] END ..................................C=1, solver=lbfgs; total time=  37.1s
[CV] END ..................................C=1, solver=lbfgs; total time=  36.7s
[CV] END ..................................C=1, solver=lbfgs; total time=  47.9s
[CV] END ..............................C=1, solver=liblinear; total time=  10.4s
[CV] END ..............................C=1, solver=liblinear; total time=   8.2s
[CV] END ..............................C=1, solve

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .................................C=10, solver=lbfgs; total time= 1.1min
[CV] END .................................C=10, solver=lbfgs; total time=  39.7s
[CV] END .............................C=10, solver=liblinear; total time=  11.9s
[CV] END .............................C=10, solver=liblinear; total time=   8.4s
[CV] END .............................C=10, solver=liblinear; total time=   7.6s
Best Parameters: {'C': 1, 'solver': 'lbfgs'}


  saveable.load_own_variables(weights_store.get(inner_path))


[1m4628/4628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 5ms/step - accuracy: 0.7144 - loss: 1.5009


  saveable.load_own_variables(weights_store.get(inner_path))


[1m3703/3703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4ms/step - accuracy: 0.6959 - loss: 1.7420
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m3703/3703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4ms/step - accuracy: 0.6982 - loss: 1.7375
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
[1m3703/3703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5ms/step - accuracy: 0.6951 - loss: 1.7598
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m3703/3703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5ms/step - accuracy: 0.6855 - loss: 1.7924
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m3703/3703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5ms/step - accuracy: 0.6961 - loss: 1.7449
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m1157/1157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[