In [22]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models

# Read the CSV file into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/train.csv')

# Print the first few rows of the DataFrame (head)
print("First few rows of the DataFrame:")
print(df.head())

# Print the shape of the DataFrame
print("\nDataFrame shape:")
print(df.shape)

# Print the number of entries labeled as 'Disaster' (target == 1)
print("\nNumber of entries labeled as 'Disaster':")
print((df.target == 1).sum())

# Print the number of entries labeled as 'No Disaster' (target == 0)
print("\nNumber of entries labeled as 'No Disaster':")
print((df.target == 0).sum())

# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')

# Convert text in 'text' column to lowercase
df['text'] = df['text'].apply(lambda x: x.lower())

# Define preprocessing functions
def remove_URL(text):
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

def remove_punct(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.isalnum()]
    return ' '.join(filtered_words)

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords.words('english')]
    return ' '.join(filtered_words)

# Apply preprocessing
df["text"] = df.text.map(remove_URL)
df["text"] = df.text.map(remove_punct)
df["text"] = df.text.map(remove_stopwords)

# Function to count unique words
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count

counter = counter_word(df['text'])

print("")
print("Total number of unique words:", len(counter))
print("\nMost Common words:", counter.most_common(5))

# Define features and labels
X = df['text'].values
y = df['target'].values

# Split data
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    X, y, test_size=0.2, random_state=42)
print("\nTrain and Test sets are:")
print(train_sentences.shape, val_sentences.shape)

# Tokenizer setup
num_unique_words = len(counter)
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences)

word_index = tokenizer.word_index
print("\nSample of word index dictionary (first 10):")
print(dict(list(word_index.items())[:10]))

train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

# Padding
max_length = 20
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")

print("\nShape of padded training sequences:", train_padded.shape)
print("Shape of padded validation sequences:", val_padded.shape)

# Reverse word index for decoding
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

def decode(sequence):
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

print("\nExample original sentence:")
print(train_sentences[10])
print("\nSequence representation:")
print(train_sequences[10])
print("\nPadded sequence:")
print(train_padded[10])
print("\nDecoded back:")
print(decode(train_sequences[10]))

# Build the model
model = models.Sequential()
model.add(layers.Embedding(input_dim=num_unique_words, output_dim=32, input_shape=(max_length,)))
model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.summary()

# Compile the model
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ["accuracy"]
model.compile(loss=loss, optimizer=optim, metrics=metrics)

# Train the model
model.fit(
    train_padded,
    train_labels,
    epochs=20,
    validation_data=(val_padded, val_labels),
    verbose=2
)


# Make predictions using the trained model on the padded training sequences
predictions = model.predict(train_padded)

# Convert predicted probabilities to binary labels using a threshold of 0.5
predictions = [1 if p > 0.5 else 0 for p in predictions]

First few rows of the DataFrame:
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  

DataFrame shape:
(7613, 5)

Number of entries labeled as 'Disaster':
3271

Number of entries labeled as 'No Disaster':
4342


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Total number of unique words: 15950

Most Common words: [('like', 346), ('amp', 344), ('fire', 249), ('get', 228), ('new', 223)]

Train and Test sets are:
(6090,) (1523,)

Sample of word index dictionary (first 10):
{'like': 1, 'amp': 2, 'fire': 3, 'get': 4, 'via': 5, 'new': 6, 'people': 7, 'news': 8, 'one': 9, 'video': 10}

Shape of padded training sequences: (6090, 20)
Shape of padded validation sequences: (1523, 20)

Example original sentence:
one direction pick fan army directioners x1402

Sequence representation:
[9, 992, 653, 536, 101, 1606, 5375]

Padded sequence:
[   9  992  653  536  101 1606 5375    0    0    0    0    0    0    0
    0    0    0    0    0    0]

Decoded back:
one direction pick fan army directioners x1402


  super().__init__(**kwargs)


Epoch 1/20
191/191 - 7s - 37ms/step - accuracy: 0.7061 - loss: 0.5552 - val_accuracy: 0.7912 - val_loss: 0.4618
Epoch 2/20
191/191 - 5s - 24ms/step - accuracy: 0.8783 - loss: 0.3075 - val_accuracy: 0.7965 - val_loss: 0.4879
Epoch 3/20
191/191 - 4s - 22ms/step - accuracy: 0.9386 - loss: 0.1784 - val_accuracy: 0.7859 - val_loss: 0.5193
Epoch 4/20
191/191 - 4s - 22ms/step - accuracy: 0.9619 - loss: 0.1270 - val_accuracy: 0.7768 - val_loss: 0.6821
Epoch 5/20
191/191 - 5s - 25ms/step - accuracy: 0.9691 - loss: 0.0992 - val_accuracy: 0.7768 - val_loss: 0.7209
Epoch 6/20
191/191 - 7s - 36ms/step - accuracy: 0.9752 - loss: 0.0843 - val_accuracy: 0.7748 - val_loss: 0.6546
Epoch 7/20
191/191 - 5s - 27ms/step - accuracy: 0.9767 - loss: 0.0702 - val_accuracy: 0.7669 - val_loss: 0.7837
Epoch 8/20
191/191 - 3s - 18ms/step - accuracy: 0.9782 - loss: 0.0572 - val_accuracy: 0.7689 - val_loss: 1.1326
Epoch 9/20
191/191 - 4s - 22ms/step - accuracy: 0.9806 - loss: 0.0475 - val_accuracy: 0.7630 - val_loss: