#Neural Network Classification - Multi-Label

In [None]:
#Mount the google drive connection to our dataset
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Tensorflow Notes/datasets/mLabel_tweets.csv')

In [None]:
df.head()

Unnamed: 0,ID,tweet,labels
0,1296010336907038720t,@cath__kath AstraZeneca is made with the kidne...,ingredients
1,1336808189677940736t,It begins. Please find safe alternatives to th...,side-effect
2,1329488407307956231t,"@PaolaQP1231 Well, I mean congratulations Covi...",side-effect
3,1364194604459900934t,@BorisJohnson for those of us that do not wish...,mandatory
4,1375938799247765515t,She has been trying to speak out: writing lett...,side-effect rushed


In [None]:
df.drop("ID", axis=1, inplace=True)

In [None]:
df.head()

Unnamed: 0,tweet,labels
0,@cath__kath AstraZeneca is made with the kidne...,ingredients
1,It begins. Please find safe alternatives to th...,side-effect
2,"@PaolaQP1231 Well, I mean congratulations Covi...",side-effect
3,@BorisJohnson for those of us that do not wish...,mandatory
4,She has been trying to speak out: writing lett...,side-effect rushed


In [None]:
df.shape

(9921, 2)

In [None]:
df["labels"].unique()

array(['ingredients', 'side-effect', 'mandatory', 'side-effect rushed',
       'ineffective mandatory', 'political', 'side-effect ineffective',
       'none', 'conspiracy', 'country', 'pharma', 'ineffective',
       'pharma ineffective', 'side-effect pharma', 'rushed',
       'side-effect mandatory political', 'unnecessary ineffective',
       'rushed mandatory side-effect', 'mandatory pharma',
       'unnecessary rushed mandatory', 'unnecessary',
       'pharma political unnecessary', 'rushed side-effect',
       'ingredients pharma side-effect', 'mandatory ineffective',
       'unnecessary mandatory', 'mandatory unnecessary',
       'ineffective side-effect', 'ineffective side-effect rushed',
       'side-effect unnecessary', 'ineffective rushed',
       'political rushed', 'religious', 'rushed political',
       'mandatory political', 'side-effect ingredients conspiracy',
       'pharma conspiracy', 'rushed unnecessary', 'pharma side-effect',
       'mandatory conspiracy', 'side-eff

In [None]:
df['labels'] = df['labels'].apply(lambda x: set(x.split()))

In [None]:
df['labels']

0               {ingredients}
1               {side-effect}
2               {side-effect}
3                 {mandatory}
4       {rushed, side-effect}
                ...          
9916            {side-effect}
9917                 {pharma}
9918                   {none}
9919            {side-effect}
9920              {political}
Name: labels, Length: 9921, dtype: object

#Let's train a Machine Learning Model for Multi-Label Classification

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np

# Use MultiLabelBinarizer to create binary label indicators
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['labels'])

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['tweet'])

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Multi-label classification model using RandomForestClassifier
classifier = MultiOutputClassifier(RandomForestClassifier(n_estimators=100))
classifier.fit(X_train, y_train)

# Evaluation
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

              precision    recall  f1-score   support

  conspiracy       0.57      0.04      0.08        91
     country       1.00      0.05      0.10        39
 ineffective       0.75      0.44      0.56       342
 ingredients       0.87      0.29      0.43       114
   mandatory       0.84      0.54      0.65       155
        none       0.38      0.09      0.14       115
      pharma       0.70      0.36      0.48       258
   political       0.72      0.12      0.20       110
   religious       0.00      0.00      0.00        13
      rushed       0.81      0.52      0.63       298
 side-effect       0.82      0.74      0.78       764
 unnecessary       0.69      0.16      0.26       160

   micro avg       0.79      0.46      0.58      2459
   macro avg       0.68      0.28      0.36      2459
weighted avg       0.76      0.46      0.54      2459
 samples avg       0.50      0.48      0.48      2459



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#Let's train a Neural Network for multi-Label Classification:

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization and Padding
max_words = 1000
maxlen = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['tweet'])
X = tokenizer.texts_to_sequences(df['tweet'])
X = pad_sequences(X, maxlen=maxlen)

# Multi-label binary encoding
label_classes = np.unique([label for sublist in df['labels'] for label in sublist])
y = np.zeros((len(df), len(label_classes)), dtype=int)
for i, labels in enumerate(df['labels']):
    for label in labels:
        y[i, np.where(label_classes == label)] = 1

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model definition
model = Sequential()
model.add(Embedding(max_words, 50, input_length=maxlen))
model.add(LSTM(64))
model.add(Dense(len(label_classes), activation='softmax'))  # Softmax for multi-class classification

# Compile the model with categorical crossentropy loss
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training the model
model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.1)

# Evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.38992443680763245


In [None]:
# Model definition
model2 = Sequential()
model2.add(Embedding(max_words, 50, input_length=maxlen))
model2.add(LSTM(64))
model2.add(Dense(len(label_classes), activation='softmax'))  # Softmax for multi-class classification

# Compile the model with categorical crossentropy loss
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training the model
model2.fit(X_train, y_train, epochs=50, batch_size=16, validation_split=0.1)

# Evaluation
loss, accuracy = model2.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 0.35818639397621155


#Let's imp. Early Stopping:

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.activations import sigmoid
from tensorflow.keras.callbacks import EarlyStopping

# Model definition
model3 = Sequential()
model3.add(Embedding(max_words, 50, input_length=maxlen))
model3.add(LSTM(64))
model3.add(Dense(len(label_classes), activation='sigmoid'))  # Using 'sigmoid' activation for binary classification

# Compile the model with binary crossentropy loss for logistic activation
model3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Define early stopping criteria
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Training the model with early stopping
model3.fit(X_train, y_train, epochs=30, batch_size=16, validation_split=0.1, callbacks=[early_stopping])

# Evaluation
loss, accuracy = model3.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Test Accuracy: 0.36423173546791077


In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Assuming 'df' is your DataFrame containing 'tweet' and 'labels' columns
# Replace 'df' with your actual DataFrame name

# Use MultiLabelBinarizer to create binary label indicators
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['labels'])

# Text vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['tweet'])

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the base classifier (RandomForestClassifier)
base_classifier = RandomForestClassifier(random_state=42)

# Set up parameter grid for GridSearchCV
param_grid = {
    'estimator__n_estimators': [100, 200],  # Number of trees in the forest
    'estimator__max_depth': [None, 10, 20],  # Maximum depth of each tree
    'estimator__min_samples_split': [2, 5, 10]  # Minimum number of samples required to split a node
}

# Initialize MultiOutputClassifier with base classifier
classifier = MultiOutputClassifier(base_classifier)

# Initialize GridSearchCV
grid_search = GridSearchCV(classifier, param_grid, cv=3, verbose=2)

# Fit the grid search model
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_classifier = grid_search.best_estimator_

# Evaluation
y_pred = best_classifier.predict(X_test)

# Inverse transform to get labels back from binary indicators
y_pred_labels = mlb.inverse_transform(y_pred)
y_test_labels = mlb.inverse_transform(y_test)

# Print best parameters and classification report
print("Best Parameters:", grid_search.best_params_)
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

# Optionally, print some example predictions
for i in range(5):  # Adjust number of examples as needed
    print(f"Example {i+1}:")
    print(f"Predicted Labels: {y_pred_labels[i]}")
    print(f"True Labels: {y_test_labels[i]}")
    print()

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  27.8s
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  26.0s
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=100; total time=  26.6s
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=200; total time=  54.4s
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=200; total time=  52.2s
[CV] END estimator__max_depth=None, estimator__min_samples_split=2, estimator__n_estimators=200; total time=  52.9s
[CV] END estimator__max_depth=None, estimator__min_samples_split=5, estimator__n_estimators=100; total time=  24.3s
[CV] END estimator__max_depth=None, estimator__min_samples_split=5, estimator__n_estimators=100; total time=  23.3s
[CV] END es

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
