In [52]:
!pip install wikipedia

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import nltk
import re
import spacy
import wikipedia
from geopy.geocoders import Nominatim
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.optimizers import Adam

# Load spaCy's small English pipeline once; pos_tag_text and the location-filter
# cell below both call this module-level `nlp` object directly.
nlp = spacy.load("en_core_web_sm")

# NLTK resources: 'stopwords' backs stopwords.words('english') and 'punkt' backs
# word_tokenize in remove_stopwords.
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): nothing in the visible notebook references the tagger or the
# tagsets resource - confirm they are still needed.
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [2]:
def remove_stopwords(text):
    """Return `text` lowercased with stopwords and non-word tokens removed.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``. A token
    is kept only if it is not an English stopword, consists solely of ASCII
    letters, and is not the literal token ``http``. Surviving tokens are joined
    with single spaces.
    """
    english_stops = set(stopwords.words('english'))

    kept = []
    for token in word_tokenize(text.lower()):
        if token in english_stops:
            continue
        # Discards numbers, punctuation and anything containing a non-letter.
        if not re.match(r'^[a-zA-Z]+$', token):
            continue
        if token == 'http':
            continue
        kept.append(token)

    return ' '.join(kept).lower()

def extract_city_country(text):
    """Split a ``"city, country"`` string into a ``(city, country)`` tuple.

    Only the first two comma-separated fields are used; each is stripped of
    surrounding whitespace. When there is no comma, the country is ``""``.
    """
    pieces = text.split(',')
    city = pieces[0].strip()
    # No comma present: there is nothing to use as the country.
    if len(pieces) < 2:
        return city, ""
    return city, pieces[1].strip()

def infer_city_country(text):
    """Geocode `text` with Nominatim and return a ``(city, country)`` tuple.

    The last two comma-separated components of the geocoded address are
    returned, stripped of surrounding whitespace. The result is always a
    2-tuple of strings, so callers can safely unpack it:

    * no geocoding match        -> ``("", "")``
    * single-component address  -> ``("", <that component>)``
    * otherwise                 -> ``(<second-to-last>, <last>)``

    NOTE(review): the second-to-last address component is treated as the
    "city" here, but it may be a state, region or postcode depending on the
    place - confirm this is precise enough for the intended use.
    """
    geolocator = Nominatim(user_agent="my_app")
    location = geolocator.geocode(text)
    if not location:
        return "", ""

    # Strip each component: splitting on ',' leaves a leading space on every
    # part after the first.
    components = [part.strip() for part in location.address.split(',')]
    if len(components) < 2:
        # Only one component (e.g. a bare country name): keep the 2-tuple shape.
        return "", components[-1]
    return components[-2], components[-1]

def format_location(city, country):
    """Return `city` and `country` joined as ``"<city>, <country>"``."""
    location = f"{city}, {country}"
    return location

def plot_binary_class_distribution(data):
    """Draw a bar chart of how many rows fall in each ``target`` class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``target`` column holding the class labels.

    Each bar is annotated with its count. The chart is shown with
    ``plt.show()``; nothing is returned.
    """
    class_counts = data['target'].value_counts()

    # Bars are positioned at the class labels themselves (e.g. x=0 and x=1).
    plt.bar(class_counts.index, class_counts.values)
    plt.xlabel('Target Class')
    plt.ylabel('Count')
    plt.title('Binary Class Distribution')

    # Annotate each bar at its own x position. value_counts() orders classes by
    # frequency, so the enumeration position is not a valid x coordinate: it
    # would put the labels on the wrong bars whenever class 1 outnumbers class 0.
    for label, count in class_counts.items():
        plt.text(label, count, str(count), ha='center', va='bottom')

    plt.show()

def pos_tag_text(text):
    """Return `text` with every token rewritten as ``<token>_<POS>``.

    The text is run through the module-level spaCy pipeline ``nlp``; each
    token's text and its ``pos_`` attribute are joined with an underscore, and
    the tagged tokens are joined with single spaces.
    """
    tagged_tokens = []
    for token in nlp(text):
        tagged_tokens.append(f"{token.text}_{token.pos_}")
    return " ".join(tagged_tokens)

def tok_seq_pad(sentences, num_words=10000, oov_token="<OOV>"):
    """Tokenize `sentences` and pad the integer sequences to a common length.

    Parameters
    ----------
    sentences : iterable of str
        Texts used both to fit the tokenizer and to build the sequences.
    num_words : int, optional
        Passed to the Keras ``Tokenizer``. Defaults to 10000, the value that
        was previously hard-coded.
    oov_token : str, optional
        Passed to the Keras ``Tokenizer`` as the out-of-vocabulary token.
        Defaults to ``"<OOV>"``.

    Returns
    -------
    tuple
        ``(padded, maxlength, total_words, tokenizer)`` where ``padded`` is the
        output of ``pad_sequences`` (post-padded, post-truncated),
        ``maxlength`` is the length of the longest sequence, ``total_words``
        is ``len(tokenizer.word_index)`` and ``tokenizer`` is the fitted
        tokenizer.

    NOTE(review): ``total_words`` is the size of ``word_index``, which is
    presumably not capped by ``num_words`` - verify before using it as an
    Embedding vocabulary size.
    """
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(sentences)
    total_words = len(tokenizer.word_index)

    seq = tokenizer.texts_to_sequences(sentences)
    # Pad every sequence up to the longest one so the batch is rectangular.
    maxlength = max(len(sentence) for sentence in seq)
    padded = pad_sequences(seq, maxlen=maxlength, padding="post", truncating="post")

    return padded, maxlength, total_words, tokenizer


In [None]:
# NOTE(review): this cell reads `train`, which is only assigned by the
# pd.read_csv cell that follows it; on a fresh kernel that cell must run first.
plot_binary_class_distribution(train)

In [3]:
# Folder that holds train.csv and test.csv. Kept in a single constant so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/DisasterTweets"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [4]:
# Replace missing locations with the most frequent location, then discard any
# row that still has a missing value in some other column.
train = train.assign(
    location=train['location'].fillna(train['location'].mode()[0])
).dropna()

In [5]:
# Overwrite the raw tweet text with its lowercased, stopword-free form.
train['text'] = train['text'].map(remove_stopwords)

In [6]:
# Keep only rows whose location string contains at least one entity that spaCy
# labels 'GPE' (geopolitical entity).
#
# The index labels to drop are taken from the frame itself. The previous
# version tracked them with a manual counter starting at 31, which is only
# correct while the index happens to be contiguous from 31; otherwise it drops
# the wrong rows or raises KeyError (for instance when this cell is re-run).
rows_to_drop = [
    row_label
    for row_label, location in zip(train.index, train['location'])
    if not any(ent.label_ == 'GPE' for ent in nlp(location).ents)
]

# Drop the rows with indices in rows_to_drop
train = train.drop(rows_to_drop)

In [7]:
# Add a POS-annotated copy of the cleaned text ("token_POS token_POS ...").
# NOTE(review): no later visible cell reads 'tagged_text' - confirm it is needed.
train['tagged_text'] = train['text'].map(pos_tag_text)

In [31]:
# Tokenize and pad the cleaned tweet text. `padded` becomes the model input X
# and `maxlength` the Embedding input_length in the cells below.
padded, maxlength, total_words, tokenizer = tok_seq_pad(train['text'])

In [49]:
# Features are the padded token sequences; labels are the 'target' column.
X, y = padded, train['target']

# Hold out 15% of the rows for validation/evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [66]:
# Adam optimizer (learning rate 0.0003) that the next cell passes to model.compile.
adam = Adam(learning_rate=0.0003)

In [67]:
# Binary classifier: embedding -> bidirectional LSTM -> two ReLU dense layers ->
# one sigmoid unit. The Embedding vocabulary size (10000) matches the
# num_words value used when the tokenizer is built in tok_seq_pad.
model = Sequential([
    Embedding(10000, 16, input_length=maxlength),
    Bidirectional(LSTM(25)),
    Dense(100, activation="relu"),
    Dense(20, activation="relu"),
    Dense(1, activation="sigmoid"),
])

model.compile(loss="binary_crossentropy", optimizer=adam, metrics=['accuracy'])

# NOTE(review): steps_per_epoch=16 limits every epoch to 16 batches, which may
# cover only part of X_train - confirm this is intentional.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    steps_per_epoch=16,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f23680d3dc0>

In [68]:
# Score the trained model on the 15% hold-out split. With the compile settings
# used above, the result is the binary cross-entropy loss followed by accuracy.
model.evaluate(X_test, y_test)



[0.5175769329071045, 0.802955687046051]