In [1]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [2]:
train_df = pd.read_csv('/home/karen/Documents/Datasets/Disaster_tweets/train.csv')
test_df = pd.read_csv('/home/karen/Documents/Datasets/Disaster_tweets/test.csv')
submission_df = pd.read_csv('/home/karen/Documents/Datasets/Disaster_tweets/sample_submission.csv')

In [3]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [6]:
#Example of disaster tweet
train_df[train_df['target'] == 1]['text'].values[1]

'Forest fire near La Ronge Sask. Canada'

In [7]:
#Non disaster tweet
train_df[train_df['target'] == 0]['text'].values[1]

'I love fruits'

### Vectorization using count_vectorizer

In [8]:
#Build vectors 
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()

#get counts for the first 5 tweets in the data
example_vectors = count_vectorizer.fit_transform(train_df['text'][:5])

In [9]:
# we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print(example_vectors[0].todense().shape)
print(example_vectors[0].todense())

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [10]:
train_vectors = count_vectorizer.fit_transform(train_df["text"])

# note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])

In [11]:
from sklearn.linear_model import RidgeClassifier
clf = RidgeClassifier()

In [12]:
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
# scores

In [13]:
# clf.fit(train_vectors, train_df["target"])

### Vectorization usinf TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

In [15]:
tfidf_train_vectors = tfidf_vectorizer.fit_transform(train_df["text"])

tfidf_test_vectors = tfidf_vectorizer.transform(test_df["text"])

In [16]:
clf.fit(tfidf_train_vectors, train_df["target"])

In [17]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, tfidf_train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.63366337, 0.6122449 , 0.68407835])

### Using Random Forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

classifier.fit(tfidf_train_vectors, train_df["target"])

In [19]:
scores = cross_val_score(classifier, tfidf_train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.53655435, 0.47176913, 0.57849339])

### Using XG-Boost

In [20]:
import xgboost as xgb

xgb_cl = xgb.XGBClassifier()

In [21]:
xgb_cl.fit(tfidf_train_vectors, train_df["target"])

In [22]:
scores = cross_val_score(xgb_cl, tfidf_train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.5773433 , 0.51843934, 0.60697553])

### Using LSTM

In [25]:
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

2023-07-15 21:44:44.077883: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-15 21:44:49.463427: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-07-15 21:44:49.464121: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [40]:
max_len = 150
max_words = 1000
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(train_df["text"])
sequences = tok.texts_to_sequences(train_df["text"])
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [41]:
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 150)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 150, 50)           50000     
                                                                 
 lstm_1 (LSTM)               (None, 64)                29440     
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_2 (Activation)   (None, 256)               0         
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257 

In [42]:
model.fit(sequences_matrix, train_df["target"],batch_size=128,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.src.callbacks.History at 0x7fe17114a110>

In [44]:
test_sequences = tok.texts_to_sequences(test_df["text"])
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

### Submission

In [45]:
submission_df["target"] = model.predict(test_sequences_matrix)



In [46]:
submission_df

Unnamed: 0,id,target
0,0,0.596359
1,2,0.781721
2,3,0.846296
3,9,0.244475
4,11,0.851528
...,...,...
3258,10861,0.793688
3259,10865,0.784926
3260,10868,0.906668
3261,10874,0.878629


In [None]:
submission_df.to_csv("submission.csv", index=False)