In [21]:
"""Step1: Data Cleaning and Preprocessing"""

import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
df = pd.read_csv("train.csv")
print("Original dataset shape:", df.shape)

# Check for missing values
print(df.isnull().sum())

# Fill missing values in 'keyword' or 'location'
df['text'] = df['text'].fillna("")

# Clean the tweet text
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\@w+|\#','', text)   # Remove mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation
    text = text.strip()
    return text

df['clean_text'] = df['text'].apply(clean_text)

# Tokenization
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_text'])
sequences = tokenizer.texts_to_sequences(df['clean_text'])

# Padding
max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

# Labels
labels = df['target'].values

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

Original dataset shape: (7613, 5)
id             0
keyword       61
location    2533
text           0
target         0
dtype: int64
Training data shape: (6090, 100)
Testing data shape: (1523, 100)


In [22]:
"""Step 2: Baseline Neural Network with Manual Tuning"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam

# Define hyperparameters
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 16
max_length = 100
dropout_rate = 0.5
dense_units = 24
learning_rate = 0.001
batch_size = 32
epochs = 10

# Build the baseline model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(dense_units, activation='relu'),
    Dropout(dropout_rate),
    Dense(1, activation='sigmoid')
])

# Compile the model
optimizer = Adam(learning_rate=learning_rate)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                    validation_data=(X_test, y_test), verbose=2)

# Evaluate on test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy:.4f}")

Epoch 1/10




191/191 - 2s - 10ms/step - accuracy: 0.5672 - loss: 0.6828 - val_accuracy: 0.5739 - val_loss: 0.6822
Epoch 2/10
191/191 - 1s - 3ms/step - accuracy: 0.5706 - loss: 0.6795 - val_accuracy: 0.5739 - val_loss: 0.6743
Epoch 3/10
191/191 - 1s - 4ms/step - accuracy: 0.5798 - loss: 0.6687 - val_accuracy: 0.6369 - val_loss: 0.6580
Epoch 4/10
191/191 - 1s - 4ms/step - accuracy: 0.6695 - loss: 0.6346 - val_accuracy: 0.6428 - val_loss: 0.6122
Epoch 5/10
191/191 - 1s - 4ms/step - accuracy: 0.7447 - loss: 0.5634 - val_accuracy: 0.7498 - val_loss: 0.5463
Epoch 6/10
191/191 - 1s - 4ms/step - accuracy: 0.7898 - loss: 0.4917 - val_accuracy: 0.7827 - val_loss: 0.5040
Epoch 7/10
191/191 - 1s - 4ms/step - accuracy: 0.8217 - loss: 0.4329 - val_accuracy: 0.7991 - val_loss: 0.4706
Epoch 8/10
191/191 - 1s - 4ms/step - accuracy: 0.8394 - loss: 0.3969 - val_accuracy: 0.8024 - val_loss: 0.4566
Epoch 9/10
191/191 - 1s - 4ms/step - accuracy: 0.8501 - loss: 0.3697 - val_accuracy: 0.7853 - val_loss: 0.4887
Epoch 10/10

In [23]:
"""Step 3: PSO-Optimized	Neural	Network	Implementation"""

import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Dataset split
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# PSO CONFIGURATION
SWARM_SIZE = 10
DIMENSIONS = 3  # [learning_rate, dense_units, dropout_rate]
INFORMANTS = 3
NUM_GENERATIONS = 10
W = 0.729
C1 = 1.49
C2 = 1.49
desired_precision = 1e-5

# BOUNDS for [learning_rate, dense_units, dropout_rate]
MIN_BOUNDARY = [0.0005, 16, 0.2]
MAX_BOUNDARY = [0.01, 128, 0.6]

# Fitness function (returns 1 - accuracy, as we want to minimize)
def fitness_function(position):
    lr = position[0]
    dense_units = int(position[1])
    dropout = float(position[2])
    
    model = Sequential([
        Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=48, input_length=100),
        GlobalAveragePooling1D(),
        Dense(dense_units, activation='relu'),
        Dropout(dropout),
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(optimizer=Adam(learning_rate=lr), loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train_split, y_train_split, epochs=5, batch_size=32,
                        validation_data=(X_val, y_val), verbose=0)
    acc = history.history['val_accuracy'][-1]
    return 1 - acc  # minimize error

# Particle class
class Particle:
    def __init__(self):
        self.position = [
            random.uniform(MIN_BOUNDARY[0], MAX_BOUNDARY[0]),
            random.uniform(MIN_BOUNDARY[1], MAX_BOUNDARY[1]),
            random.uniform(MIN_BOUNDARY[2], MAX_BOUNDARY[2])
        ]
        self.velocity = [random.uniform(-1, 1) for _ in range(DIMENSIONS)]
        self.fitness = fitness_function(self.position)
        self.best_position = list(self.position)
        self.best_fitness = self.fitness
        self.informants = random.sample(range(SWARM_SIZE), INFORMANTS)
        self.group_best_position = list(self.position)
        self.group_best_fitness = self.fitness

    def update_velocity(self):
        for d in range(DIMENSIONS):
            r1, r2 = random.random(), random.random()
            cognitive = C1 * r1 * (self.best_position[d] - self.position[d])
            social = C2 * r2 * (self.group_best_position[d] - self.position[d])
            self.velocity[d] = W * self.velocity[d] + cognitive + social

    def update_position(self):
        for d in range(DIMENSIONS):
            self.position[d] += self.velocity[d]
            self.position[d] = max(MIN_BOUNDARY[d], min(MAX_BOUNDARY[d], self.position[d]))
        self.fitness = fitness_function(self.position)

    def update_group_best(self, swarm):
        best_informant = min(self.informants, key=lambda i: swarm[i].best_fitness)
        if swarm[best_informant].best_fitness < self.group_best_fitness:
            self.group_best_fitness = swarm[best_informant].best_fitness
            self.group_best_position = list(swarm[best_informant].best_position)

# Initialize swarm
swarm = [Particle() for _ in range(SWARM_SIZE)]
global_best = min(swarm, key=lambda p: p.best_fitness)
global_best_position = list(global_best.best_position)
global_best_fitness = global_best.best_fitness

# PSO loop
for gen in range(NUM_GENERATIONS):
    for particle in swarm:
        particle.update_group_best(swarm)
        particle.update_velocity()
        particle.update_position()
        if particle.fitness < particle.best_fitness:
            particle.best_fitness = particle.fitness
            particle.best_position = list(particle.position)
    best_particle = min(swarm, key=lambda p: p.best_fitness)
    if best_particle.best_fitness < global_best_fitness:
        global_best_fitness = best_particle.best_fitness
        global_best_position = list(best_particle.best_position)

    print(f"Generation {gen+1}: Best Accuracy = {1 - global_best_fitness:.4f}")

    if global_best_fitness < desired_precision:
        print("Desired precision reached.")
        break

print("\nOptimization Complete!")
print(f"Best Learning Rate: {global_best_position[0]:.5f}")
print(f"Best Dense Units: {int(global_best_position[1])}")
print(f"Best Dropout Rate: {global_best_position[2]:.4f}")
print(f"Validation Accuracy: {1 - global_best_fitness:.4f}")


Generation 1: Best Accuracy = 0.8021
Generation 2: Best Accuracy = 0.8021
Generation 3: Best Accuracy = 0.8071
Generation 4: Best Accuracy = 0.8103
Generation 5: Best Accuracy = 0.8103
Generation 6: Best Accuracy = 0.8103
Generation 7: Best Accuracy = 0.8103
Generation 8: Best Accuracy = 0.8112
Generation 9: Best Accuracy = 0.8112
Generation 10: Best Accuracy = 0.8112

Optimization Complete!
Best Learning Rate: 0.01000
Best Dense Units: 55
Best Dropout Rate: 0.4466
Validation Accuracy: 0.8112


In [27]:
"""Step 4: Performance	Comparison	and	Analysis"""

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score
from scipy.stats import ttest_rel

# Reassign the baseline model
baseline_model = model  # this was already trained earlier

# Rebuild the best PSO-optimized model from global_best_position
best_embedding_dim = 48  # fixed in PSO
best_dense_units = int(global_best_position[1])
best_dropout = float(global_best_position[2])
best_lr = float(global_best_position[0])

best_model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=best_embedding_dim, input_length=100),
    GlobalAveragePooling1D(),
    Dense(best_dense_units, activation='relu'),
    Dropout(best_dropout),
    Dense(1, activation='sigmoid')
])

best_model.compile(optimizer=Adam(learning_rate=best_lr),
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

# Train the PSO model on full training data
best_model.fit(X_train, y_train, epochs=10, batch_size=32,
               validation_data=(X_test, y_test), verbose=0)

# Predictions
baseline_probs = baseline_model.predict(X_test)
pso_probs = best_model.predict(X_test)

y_pred_baseline = (baseline_probs > 0.5).astype(int).flatten()
y_pred_pso = (pso_probs > 0.5).astype(int).flatten()

# Accuracy comparison
acc_baseline = accuracy_score(y_test, y_pred_baseline)
acc_pso = accuracy_score(y_test, y_pred_pso)

print(" Model Accuracy Comparison:")
print(f"Baseline Model Accuracy     : {acc_baseline:.4f}")
print(f"PSO-Optimized Model Accuracy: {acc_pso:.4f}")

# Paired t-test
t_stat, p_val = ttest_rel(y_pred_pso, y_pred_baseline)
print("\n Paired t-test Results:")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value    : {p_val:.5f}")

if p_val < 0.05:
    print(" Difference is statistically significant — PSO model performs significantly better!")
else:
    print(" No statistically significant difference — results are close.")




[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
 Model Accuracy Comparison:
Baseline Model Accuracy     : 0.7997
PSO-Optimized Model Accuracy: 0.8089

 Paired t-test Results:
t-statistic: 9.8123
p-value    : 0.00000
 Difference is statistically significant — PSO model performs significantly better!
