### GloVe Embeddings

In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences





In [2]:
# Load the pre-trained GloVe vectors into a {token: float32 vector} lookup.
# Each line of the .txt file is a token followed by its coefficients.
glove_file_path = "D:/SIBMB/SEM 4/DL/WORD2VEC/glove.6B/glove.6B.300d.txt"
GLOVE_DIM = 300  # number of coefficients per token in the 300d file
embeddings_index = {}
with open(glove_file_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        # Skip blank or truncated lines instead of failing on values[0] or
        # storing a vector whose length does not match the embedding matrix.
        if len(values) <= GLOVE_DIM:
            continue
        # The last GLOVE_DIM fields are the coefficients; everything before them is
        # the token, so a token containing whitespace cannot spill into the vector.
        word = " ".join(values[:-GLOVE_DIM])
        coefs = np.asarray(values[-GLOVE_DIM:], dtype='float32')
        embeddings_index[word] = coefs


In [3]:
# Load the training CSV into a DataFrame; later cells read its 'text' and
# 'target' columns.
df = pd.read_csv(
    "D:/SIBMB/SEM 4/DL/PROJECT/Disaster Dataset/train.csv",
    encoding='latin-1',
)


In [4]:
# Convert every text into a list of integer word ids, then pad all lists to the
# length of the longest one so they form a rectangular array.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
# One extra slot on top of the learned vocabulary: pad_sequences fills with 0,
# and that id needs its own row in the embedding table.
vocab_size = 1 + len(tokenizer.word_index)
max_length = max(map(len, sequences))
# padding='post' appends the zeros after the real tokens.
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_length)


In [5]:
# Build the (vocab_size, 300) weight table for the Embedding layer: row i holds
# the GloVe vector of the word whose tokenizer id is i. Words without a GloVe
# entry keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
for token, row in tokenizer.word_index.items():
    if token in embeddings_index:
        embedding_matrix[row] = embeddings_index[token]


In [6]:
# Splitting the data into training and testing sets (80/20, fixed seed so the
# split is reproducible).
# The target is one-hot encoded with an explicit float32 dtype: without it,
# get_dummies() returns uint8 or bool columns depending on the installed pandas
# version, and the loss expects numeric targets.
one_hot_targets = pd.get_dummies(df['target'], dtype='float32').to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, one_hot_targets, test_size=0.2, random_state=42
)


In [7]:
# Building the ANN model: frozen pre-trained embeddings -> flatten -> small dense
# hidden layer -> softmax over the classes.
# The softmax width is taken from the one-hot label matrix that is actually passed
# to fit(), so the output always matches the targets. Counting unique() values of
# the raw column would also count NaN, which get_dummies() drops.
num_classes = y_train.shape[1]
model = Sequential()
# trainable=False keeps the pre-trained vectors fixed during training.
model.add(Embedding(input_dim=vocab_size, output_dim=300, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Flatten())
model.add(Dense(units=6, activation='relu'))
model.add(Dense(units=num_classes, activation='softmax'))





In [8]:
# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot targets, and accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)





In [9]:
# Train for 50 epochs in mini-batches of 32; the held-out split is scored after
# every epoch. The call is left as the cell's last expression, so the returned
# History object is displayed.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_test, y_test),
)



Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1882215da90>

In [10]:
# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
# Report the accuracy as a percentage with two decimals.
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 75.64%


### FastText

In [11]:
from gensim.models import KeyedVectors

# Specify the path to the FastText .vec file (word2vec text format)
fasttext_file_path = "D:/SIBMB/SEM 4/DL/WORD2VEC/FastText/wiki-news-300d-1M.vec"

# Load the FastText vectors by giving gensim the path itself; gensim opens and
# closes the file internally. This avoids `from smart_open import open`, which
# rebinds the builtin open() for every cell executed afterwards, and avoids
# passing an already-open handle where a file name is expected.
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_file_path, binary=False)

In [12]:
# Rebuild the tokenizer, the integer sequences and the padded array from df for
# the FastText section (same preprocessing steps as in the GloVe section).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
# One extra slot on top of the learned vocabulary: pad_sequences fills with 0,
# and that id needs its own row in the embedding table.
vocab_size = 1 + len(tokenizer.word_index)
max_length = max(map(len, sequences))
# padding='post' appends the zeros after the real tokens.
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_length)

In [13]:
# Build the (vocab_size, EMBEDDING_DIM) weight table for the Embedding layer
# from the FastText vectors; row i belongs to the word with tokenizer id i.
EMBEDDING_DIM = 300  # Adjust based on the dimension of your FastText embeddings
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for token, row in tokenizer.word_index.items():
    # Words missing from the FastText vocabulary keep their all-zero row.
    if token not in fasttext_model:
        continue
    embedding_matrix[row] = fasttext_model[token]

In [14]:
# Splitting the data into training and testing sets (80/20, fixed seed so the
# split is reproducible).
# The target is one-hot encoded with an explicit float32 dtype: without it,
# get_dummies() returns uint8 or bool columns depending on the installed pandas
# version, and the loss expects numeric targets.
one_hot_targets = pd.get_dummies(df['target'], dtype='float32').to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, one_hot_targets, test_size=0.2, random_state=42
)


In [15]:
# Building the ANN model: frozen pre-trained embeddings -> flatten -> small dense
# hidden layer -> softmax over the classes.
# The softmax width is taken from the one-hot label matrix that is actually passed
# to fit(), so the output always matches the targets. Counting unique() values of
# the raw column would also count NaN, which get_dummies() drops.
num_classes = y_train.shape[1]
model = Sequential()
# trainable=False keeps the pre-trained vectors fixed during training.
model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Flatten())
model.add(Dense(units=6, activation='relu'))
model.add(Dense(units=num_classes, activation='softmax'))


In [16]:
# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot targets, and accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [17]:
# Train for 50 epochs in mini-batches of 32; the held-out split is scored after
# every epoch. The call is left as the cell's last expression, so the returned
# History object is displayed.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_test, y_test),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x18861eefed0>

In [18]:
# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)

# Report the accuracy as a percentage with two decimals.
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 74.20%
