## Natural Language Processing with Disaster Tweets

In [1]:
import pandas as pd 
import nltk

In [2]:
tweets = pd.read_csv("train.csv")
tweets.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
tweets.shape

(7613, 5)

### Data Preprocessing Steps 

**Missing Value Impuation**

In [4]:
# Checking Missing Vlaues

Missing_Value = tweets.isnull()
Missing_Value.sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [5]:
Imputation = tweets['keyword'].fillna(tweets['keyword'].mode()[0], inplace = True) 

In [6]:
location_Imputation = tweets['location'].fillna(tweets['location'].mode()[0], inplace = True) 

In [7]:
Missing_Value = tweets.isnull()
Missing_Value.sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

**Text to Lowercase**



**Purpose of converting text to lowercase**

* Normalization 
* Consistency 
* Reducing vocabulary size
* Improving Generalization 

In [8]:
lowercase = tweets["text"].str.lower() 

**Removing Punctuation**

In [9]:
import string 

In [10]:
tweets["text"] = tweets["text"].str.replace('[{}]'.format(string.punctuation), '') 

  tweets["text"] = tweets["text"].str.replace('[{}]'.format(string.punctuation), '')


**Removing Special Character and Number**

In [11]:
tweets["text"] = tweets["text"].str.replace('[^a-zA-Z\s]', '') 

  tweets["text"] = tweets["text"].str.replace('[^a-zA-Z\s]', '')


**Removing Stop Words:**

These words are frequently used in a language but typically do not carry much meaning by themselves.The removal of stop words is done to reduce the dimensionality of data.

In [12]:
from nltk.corpus import stopwords
import nltk 

# Download NLTK stop words
nltk.download('stopwords')

# Defining stop words
stop_words = set(stopwords.words('english'))

# Applying stop words
tweets["text"] = tweets['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### ***Tokenization***

**Tokenization:**

Tokenization is the process of breaking down a text into smaller units, typically words or subwords, referred to as tokens.

In [13]:
from nltk.tokenize import word_tokenize

tweets['text'] = tweets['text'].apply(word_tokenize) 

### ***Lemmatization***

**Lemmatization:**

Lemmatization is a process of reducing words into thier base word or canonical form. The output we will grt after lemmatization is called lemma.

In [14]:
from nltk.stem import WordNetLemmatizer

# Download NLTK WordNet
nltk.download('wordnet')

# Apply Lemmatization
lemmatizer = WordNetLemmatizer()
tweets['text'] = tweets['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
X = tweets['text'].astype(str)
Y = tweets['target']

In [14]:
# Split the data into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

### ***Vectorization***

**Word2Vec**

In [15]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
 
# Word2Vec model training
X_train_feature = Word2Vec(X_train, vector_size=100, window=5, min_count=1, workers=4) 
X_val_feature = Word2Vec(X_val, vector_size=100, window=5, min_count=1, workers=4) 

**TF - IDF**(Term Frequency - Inverse Document Frequency):



**TF:** It measures how often a term (word) appears in a document.

**IDF:** It measures how unique or rare a term is across a collection of documents(Corpus).



In [16]:
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(6090,)
(1523,)
(6090,)
(1523,)


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 

In [18]:
feature_extraction = TfidfVectorizer(min_df = 1)

X_train_features = feature_extraction.fit_transform(X_train)
X_val_features = feature_extraction.transform(X_val)

In [19]:
print(X_train_features.shape)
print(X_val_features.shape)

(6090, 18383)
(1523, 18383)


### ***Model Building***

### LogisticRegression

In [20]:
model = LogisticRegression()

In [21]:
model.fit(X_train_features, y_train)

In [22]:
prediction_on_training_data = model.predict(X_train_features) 

In [23]:
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)
print("Accuracy_on_training_data:",accuracy_on_training_data)

Accuracy_on_training_data: 0.8922824302134647


In [24]:
prediction_on_val_data = model.predict(X_val_features) 

In [26]:
accuracy_on_val_data = accuracy_score(y_val, prediction_on_val_data) 
print("Accuracy_on_test_data:",accuracy_on_val_data)

Accuracy_on_test_data: 0.8069599474720945


In [28]:
input_your_tweet = ['Boom Blast desiter on road']

input_data_feature = feature_extraction.transform(input_your_tweet)

prediction = model.predict(input_data_feature)
print(prediction)

[0]


### CNN

In [29]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
# Tokenize and pad the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [32]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_val)

In [33]:
vocab_size = len(tokenizer.word_index) + 1
max_length = max(max(len(seq) for seq in X_train_seq), max(len(seq) for seq in X_test_seq))

In [34]:
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

In [35]:
# Build a simple feedforward neural network
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_length),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [36]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [38]:
# Train the model
model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
  3/153 [..............................] - ETA: 12s - loss: 0.0351 - accuracy: 0.9896

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1ce934fa950>

In [43]:
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_pad, y_val)
print(f"\nVal Set Accuracy: {test_accuracy * 100:.2f}%")


Val Set Accuracy: 75.64%


#### MultinomialNB


In [45]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [46]:
# Train a Multinomial Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_features, y_train)

In [52]:
predictions = nb_model.predict(X_train_features)

In [53]:
# Evaluate the model
accuracy = accuracy_score(y_train, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8931034482758621


In [47]:
# Make predictions on the validation set
predictions = nb_model.predict(X_val_features)

In [51]:
# Evaluate the model
accuracy = accuracy_score(y_val, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7977675640183848


In [49]:
# Display classification report
print(classification_report(y_val, predictions))

              precision    recall  f1-score   support

           0       0.77      0.93      0.84       874
           1       0.87      0.62      0.72       649

    accuracy                           0.80      1523
   macro avg       0.82      0.77      0.78      1523
weighted avg       0.81      0.80      0.79      1523



#### SVC


In [55]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [56]:
# Train a Support Vector Machine (SVM) model
svm_model = SVC(kernel='linear')  # You can experiment with different kernels (linear, rbf, etc.)
svm_model.fit(X_train_features, y_train)

In [59]:
# Make predictions on the validation set
predictions = svm_model.predict(X_train_features)

In [61]:
accuracy = accuracy_score(y_train, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9361247947454844


In [57]:
# Make predictions on the validation set
predictions = svm_model.predict(X_val_features)

In [58]:
accuracy = accuracy_score(y_val, predictions)
print(f'Accuracy: {accuracy}')


Accuracy: 0.804333552199606


### Logistic Regression 

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


In [64]:
# Initialize Logistic Regression model
logistic_model = LogisticRegression()

In [68]:
# Perform cross-validation
cv_scores = cross_val_score(logistic_model, X_train_features, y_train, cv=5, scoring='accuracy')

In [69]:
# Print the cross-validated accuracy scores
print("Cross-validated Accuracy Scores:", cv_scores)
print("Mean Accuracy: ", cv_scores.mean())

Cross-validated Accuracy Scores: [0.79802956 0.80706076 0.80788177 0.77339901 0.78981938]
Mean Accuracy:  0.7952380952380953


In [70]:
# Perform cross-validation
cv_scores = cross_val_score(logistic_model, X_val_features, y_val, cv=5, scoring='accuracy')

In [71]:
# Print the cross-validated accuracy scores
print("Cross-validated Accuracy Scores:", cv_scores)
print("Mean Accuracy: ", cv_scores.mean())

Cross-validated Accuracy Scores: [0.7442623  0.77704918 0.74754098 0.70394737 0.79934211]
Mean Accuracy:  0.7544283865401209


### Test Data

In [2]:
import pandas as pd 
import nltk

In [3]:
tweets = pd.read_csv("test.csv")
tweets.head(5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
tweets.shape

(3263, 4)

In [5]:
# Checking Missing Vlaues

Missing_Value = tweets.isnull()
Missing_Value.sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [6]:
Imputation = tweets['keyword'].fillna(tweets['keyword'].mode()[0], inplace = True) 
location_Imputation = tweets['location'].fillna(tweets['location'].mode()[0], inplace = True) 

In [7]:
Missing_Value = tweets.isnull()
Missing_Value.sum()

id          0
keyword     0
location    0
text        0
dtype: int64

In [8]:
lowercase = tweets["text"].str.lower() 

In [9]:
import string 
tweets["text"] = tweets["text"].str.replace('[^a-zA-Z\s]', '') 

  tweets["text"] = tweets["text"].str.replace('[^a-zA-Z\s]', '')


In [10]:
from nltk.corpus import stopwords
import nltk 

# Download NLTK stop words
nltk.download('stopwords')

# Defining stop words
stop_words = set(stopwords.words('english'))

# Applying stop words
tweets["text"] = tweets['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
from nltk.tokenize import word_tokenize

tweets['text'] = tweets['text'].apply(word_tokenize) 

In [12]:
from nltk.stem import WordNetLemmatizer

# Download NLTK WordNet
nltk.download('wordnet')

# Apply Lemmatization
lemmatizer = WordNetLemmatizer()
tweets['text'] = tweets['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
X = tweets['text'].astype(str)

In [15]:
# Split the data into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_val = train_test_split(X, test_size=0.2, random_state=42)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
feature_extraction = TfidfVectorizer(min_df = 1)

X_train_features = feature_extraction.fit_transform(X_train)
X_val_features = feature_extraction.transform(X_val)

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


In [20]:
# Initialize Logistic Regression model
logistic_model = LogisticRegression()

In [21]:
# Perform cross-validation
cv_scores = cross_val_score(logistic_model, X_train_features, y_train, cv=5, scoring='accuracy')

NameError: name 'y_train' is not defined

In [None]:
# Print the cross-validated accuracy scores
print("Cross-validated Accuracy Scores:", cv_scores)
print("Mean Accuracy: ", cv_scores.mean())