In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import nltk
import re

nltk.download('stopwords')
nltk.download('wordnet')

# Preprocessing function
def preprocess_text(text):
    lemma = nltk.WordNetLemmatizer()
    text = re.sub('[^a-zA-Z]', ' ', text).lower()
    text = ' '.join([lemma.lemmatize(word) for word in text.split() if word not in nltk.corpus.stopwords.words('english')])
    return text

# Load dataset
df = pd.read_csv('trainDisaster.csv')

# Fill missing values
df['text'] = df['text'].apply(preprocess_text)
df['keyword'] = df['keyword'].fillna('unknown')
df['location'] = df['location'].fillna('unknown')


print(df['target'].value_counts())

df['target'] = df['target'].astype(int)

# Prepare tokenizer and word sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
word_index = tokenizer.word_index


X = tokenizer.texts_to_sequences(df['text'])
y = df['target']

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = dict(enumerate(class_weights))
print("Class weights:", class_weights)


max_len = 30
X = pad_sequences(X, maxlen=max_len)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Sequential()
model.add(Embedding(input_dim=len(word_index)+1, output_dim=100))
model.add(LSTM(200, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


earlystopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2,  callbacks=[earlystopping])

# Save the trained model
# model.save('disaster_rnn_model.h5')

def preprocess_input(news):
    words = news.lower().split()
    encoded_review = [word_index.get(word, 2) for word in words]  # Using '2' for out-of-vocabulary words
    padded_review = pad_sequences([encoded_review], maxlen=max_len)
    return padded_review

def predict_news(news):
    preprocessed_text = preprocess_input(news)
    prediction = model.predict(preprocessed_text)
    sentiment = 'DisasterRelated' if prediction[0][0] > 0.5 else 'Not Related'
    return sentiment, prediction[0][0]


news = "earthquake shakes the city, people are trapped"
result, score = predict_news(news)
print(f'Result: {result}')
print(f'Score: {score}')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\negia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\negia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    4342
1    3271
Name: target, dtype: int64
Class weights: {0: 0.8766697374481806, 1: 1.1637114032405993}
Epoch 1/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - accuracy: 0.6109 - loss: 0.6495 - val_accuracy: 0.7479 - val_loss: 0.5088
Epoch 2/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 0.8692 - loss: 0.3653 - val_accuracy: 0.7898 - val_loss: 0.4941
Epoch 3/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 0.9389 - loss: 0.2316 - val_accuracy: 0.7775 - val_loss: 0.6260
Epoch 4/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step - accuracy: 0.9756 - loss: 0.0898 - val_accuracy: 0.7750 - val_loss: 0.9435
Epoch 5/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 39ms/step - accuracy: 0.9870 - loss: 0.0430 - val_accuracy: 0.7586 - val_loss: 1.1285
Epoch 6/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0

In [27]:
news = "In 2022, flooding and landslides in the northeastern state of Assam killed at least 192 people|"
result, score = predict_news(news)
print(f'Result: {result}')
print(f'Score: {score}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Result: DisasterRelated
Score: 0.960098147392273


In [18]:
!pip install redis

Collecting redis
  Downloading redis-5.2.0-py3-none-any.whl.metadata (9.1 kB)
Downloading redis-5.2.0-py3-none-any.whl (261 kB)
   ---------------------------------------- 0.0/261.4 kB ? eta -:--:--
   - -------------------------------------- 10.2/261.4 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/261.4 kB 445.2 kB/s eta 0:00:01
   ------------------ --------------------- 122.9/261.4 kB 1.2 MB/s eta 0:00:01
   ---------------------------------------  256.0/261.4 kB 2.0 MB/s eta 0:00:01
   ---------------------------------------  256.0/261.4 kB 2.0 MB/s eta 0:00:01
   ---------------------------------------- 261.4/261.4 kB 1.3 MB/s eta 0:00:00
Installing collected packages: redis
Successfully installed redis-5.2.0


In [3]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.utils.class_weight import compute_class_weight
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.utils import pad_sequences
# from tensorflow.keras.callbacks import EarlyStopping
# import nltk
# import re

# # Ensure necessary NLTK resources are downloaded
# nltk.download('stopwords')
# nltk.download('wordnet')

# # Preprocessing function
# def preprocess_text(text):
#     lemma = nltk.WordNetLemmatizer()
#     text = re.sub('[^a-zA-Z]', ' ', text).lower()
#     text = ' '.join([lemma.lemmatize(word) for word in text.split() if word not in nltk.corpus.stopwords.words('english')])
#     return text

# # Load dataset
# df = pd.read_csv('trainDisaster.csv')

# # Fill missing values
# df['text'] = df['text'].apply(preprocess_text)
# df['keyword'] = df['keyword'].fillna('unknown')
# df['location'] = df['location'].fillna('unknown')

# # Check target distribution
# print("Target distribution:")
# print(df['target'].value_counts())

# # Convert target to integers
# df['target'] = df['target'].astype(int)

# # Prepare tokenizer and word sequences
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(df['text'])
# word_index = tokenizer.word_index

# # Prepare input data (X) and target data (y)
# X = tokenizer.texts_to_sequences(df['text'])
# y = df['target']

# # Compute class weights
# class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
# class_weights = dict(enumerate(class_weights))
# print("Class weights:", class_weights)

# # Pad sequences
# max_len = 30
# X = pad_sequences(X, maxlen=max_len)

# # Split dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Build the RNN model with LSTM layers
# model = Sequential()
# model.add(Embedding(input_dim=len(word_index)+1, output_dim=200)) 
# model.build(input_shape=(None, max_len))# Using 200-dimensional embeddings
# model.add(LSTM(256, return_sequences=True))  # First LSTM layer with 256 units
# model.add(Dropout(0.3))  # Dropout to prevent overfitting
# model.add(LSTM(128))  # Second LSTM layer with 128 units
# model.add(Dropout(0.3))  # Additional Dropout layer
# model.add(Dense(1, activation='sigmoid'))  # Binary classification with sigmoid activation
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])  # Binary loss and optimizer

# # Display model summary
# model.summary()

# # Early stopping callback
# earlystopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# # Train the model with class weights
# model.fit(X_train, y_train, 
#           epochs=20, 
#           batch_size=32, 
#           validation_split=0.2, 
# #           class_weight=class_weights, 
#           callbacks=[earlystopping])

# # Save the trained model
# # model.save('disaster_rnn_model.h5')

# # Function to preprocess input news
# def preprocess_input(news):
#     words = news.lower().split()
#     encoded_review = [word_index.get(word, 2) for word in words]  # Using '2' for out-of-vocabulary words
#     return pad_sequences([encoded_review], maxlen=max_len)

# # Function to predict disaster-related news
# def predict_news(news):
#     preprocessed_text = preprocess_input(news)
#     prediction = model.predict(preprocessed_text)
#     sentiment = 'DisasterRelated' if prediction[0][0] > 0.5 else 'Not Related'
#     return sentiment, prediction[0][0]

# # Example usage
# news = "Earthquake shakes the city, people are trapped"
# result, score = predict_news(news)
# print(f'Result: {result}')
# print(f'Score: {score:.2f}')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\negia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\negia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Target distribution:
0    4342
1    3271
Name: target, dtype: int64
Class weights: {0: 0.8766697374481806, 1: 1.1637114032405993}


Epoch 1/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 89ms/step - accuracy: 0.6621 - loss: 0.6095 - val_accuracy: 0.7923 - val_loss: 0.4498
Epoch 2/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 86ms/step - accuracy: 0.9072 - loss: 0.2408 - val_accuracy: 0.7775 - val_loss: 0.5796
Epoch 3/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 84ms/step - accuracy: 0.9755 - loss: 0.0773 - val_accuracy: 0.7644 - val_loss: 0.8183
Epoch 4/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 86ms/step - accuracy: 0.9863 - loss: 0.0475 - val_accuracy: 0.7471 - val_loss: 0.8924
Epoch 5/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 85ms/step - accuracy: 0.9940 - loss: 0.0247 - val_accuracy: 0.7521 - val_loss: 1.0152
Epoch 6/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 91ms/step - accuracy: 0.9935 - loss: 0.0207 - val_accuracy: 0.7529 - val_loss: 1.1461
[1m1/1[0m [32

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
import nltk
import re

nltk.download('stopwords')
nltk.download('wordnet')


def preprocess_text(text):
    lemma = nltk.WordNetLemmatizer()
    text = re.sub('[^a-zA-Z]', ' ', text).lower()
    text = ' '.join([lemma.lemmatize(word) for word in text.split() if word not in nltk.corpus.stopwords.words('english')])
    return text


df = pd.read_csv('trainDisaster.csv')


df['text'] = df['text'].fillna('').apply(preprocess_text)
df['keyword'] = df['keyword'].fillna('unknown')
df['location'] = df['location'].fillna('unknown')


df['target'] = df['target'].astype(int)

# Tokenizer setup
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
word_index = tokenizer.word_index


X = tokenizer.texts_to_sequences(df['text'])
y = df['target']


class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights = dict(enumerate(class_weights))


max_len = 50
X = pad_sequences(X, maxlen=max_len)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=100, input_length=max_len),
    LSTM(128, return_sequences=True, kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    LSTM(64, kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Sigmoid for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


earlystopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5)

model.fit(
    X_train, y_train,
    epochs=25,
    batch_size=32,
    validation_split=0.2,
#     class_weight=class_weights,
    callbacks=[earlystopping, reduce_lr]
)




def preprocess_input(news):
    words = news.lower().split()
    encoded_review = [word_index.get(word, 2) for word in words]
    padded_review = pad_sequences([encoded_review], maxlen=max_len)
    return padded_review

def predict_news(news):
    preprocessed_text = preprocess_input(news)
    prediction = model.predict(preprocessed_text)[0][0]
    sentiment = 'DisasterRelated' if prediction > 0.5 else 'Not Related'
    return sentiment, prediction

news = "BJP leader Nalin Kohli echoed the sentiment, highlighting Gat"
result, score = predict_news(news)
print(f'Result: {result}')
print(f'Score: {score:.2f}')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\negia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\negia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/25




[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 61ms/step - accuracy: 0.6217 - loss: 2.2215 - val_accuracy: 0.7693 - val_loss: 0.5458 - learning_rate: 0.0010
Epoch 2/25
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 60ms/step - accuracy: 0.6743 - loss: 0.7186 - val_accuracy: 0.5616 - val_loss: 0.7286 - learning_rate: 0.0010
Epoch 3/25
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 59ms/step - accuracy: 0.5627 - loss: 0.7184 - val_accuracy: 0.5616 - val_loss: 0.6989 - learning_rate: 0.0010
Epoch 4/25
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 59ms/step - accuracy: 0.5583 - loss: 0.7031 - val_accuracy: 0.5616 - val_loss: 0.6962 - learning_rate: 0.0010
Epoch 5/25
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 58ms/step - accuracy: 0.5823 - loss: 0.6886 - val_accuracy: 0.5616 - val_loss: 0.6905 - learning_rate: 5.0000e-04
Epoch 6/25
[1m  5/153[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8s[0m 54m

In [11]:
news = "Welcoming Mr. Jha to the part, the former Delhi CM said, “Anil Jha, who works for the backward people in Delhi will strengthen"
result, score = predict_news(news)
print(f'Result: {result}')
print(f'Score: {score:.2f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Result: DisasterRelated
Score: 0.97
