## Natural Language Processing with Disaster Tweets

In [2]:
import pandas as pd 
import nltk

In [3]:
train_tweets = pd.read_csv("train.csv")
test_tweets = pd.read_csv("test.csv")

### Data Preprocessing Steps

1. Missing Value Impuation

In [11]:
# Checking Missing Vlaues

train_tweets_Missing_Value = train_tweets.isnull()
print(train_tweets_Missing_Value.sum())

print("-------")

test_tweets_missing_value = test_tweets.isnull()
print(test_tweets_missing_value.sum())

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64
-------
id             0
keyword       26
location    1105
text           0
dtype: int64


In [13]:
# Null value Imputation 

train_Imputation = train_tweets_Missing_Value['keyword'].fillna(train_tweets_Missing_Value['keyword'].mode()[0], inplace = True) 
train_location_Imputation = train_tweets_Missing_Value['location'].fillna(train_tweets_Missing_Value['location'].mode()[0], inplace = True) 

test_Imputation = test_tweets_missing_value['keyword'].fillna(test_tweets_missing_value['keyword'].mode()[0], inplace = True) 
test_location_Imputation = test_tweets_missing_value['location'].fillna(test_tweets_missing_value['location'].mode()[0], inplace = True) 


2. Text to Lowercase

    Purpose of converting text to lowercase
* Normalization
* Consistency
* Reducing vocabulary size
* Improving Generalization

In [15]:
# Lowering the text 

train_tweets_lowercase = train_tweets["text"].str.lower() 

test_tweets_lowercase = test_tweets["text"].str.lower()

3. Removing Punctuation

In [16]:
import string 

train_tweets["text"] = train_tweets["text"].str.replace('[{}]'.format(string.punctuation), '') 

test_tweets["text"] = test_tweets["text"].str.replace('[{}]'.format(string.punctuation), '') 

  train_tweets["text"] = train_tweets["text"].str.replace('[{}]'.format(string.punctuation), '')
  test_tweets["text"] = test_tweets["text"].str.replace('[{}]'.format(string.punctuation), '')


4. Removing Special Character and Number

In [17]:
# Removing the special character and number 

train_tweets["text"] = train_tweets["text"].str.replace('[^a-zA-Z\s]', '') 

test_tweets["text"] = test_tweets["text"].str.replace('[^a-zA-Z\s]', '') 

  train_tweets["text"] = train_tweets["text"].str.replace('[^a-zA-Z\s]', '')
  test_tweets["text"] = test_tweets["text"].str.replace('[^a-zA-Z\s]', '')


5. Removing Stop Words:

These words are frequently used in a language but typically do not carry much meaning by themselves.The removal of stop words is done to reduce the dimensionality of data.

In [18]:
# Download NLTK stop words

from nltk.corpus import stopwords

nltk.download('stopwords')

# Defining stop words
stop_words = set(stopwords.words('english'))

# Applying stop words
train_tweets["text"] = train_tweets["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

test_tweets["text"] = test_tweets["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


6. Tokenization:

Tokenization is the process of breaking down a text into smaller units, typically words or subwords, referred to as tokens.

In [19]:
from nltk.tokenize import word_tokenize

train_tweets["text"] = train_tweets["text"].apply(word_tokenize) 

test_tweets["text"] = test_tweets["text"].apply(word_tokenize) 


7. Lemmatization

Lemmatization is a process of reducing words into thier base word or canonical form. The output we will grt after lemmatization is called lemma.

In [20]:
from nltk.stem import WordNetLemmatizer

# Download NLTK WordNet
nltk.download('wordnet')

# Apply Lemmatization
lemmatizer = WordNetLemmatizer()

train_tweets["text"] = train_tweets["text"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

test_tweets["text"] = test_tweets["text"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abhic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


8. train validation split 

In [28]:
X = train_tweets["text"].astype(str)
Y = train_tweets["target"]

In [29]:
# Split the data into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.3, random_state=42)

9. Vectorization: 

vectorization refers to the process of converting textual data into a numerical format

TF - IDF(Term Frequency - Inverse Document Frequency):

TF: It measures how often a term (word) appears in a document.

IDF: It measures how unique or rare a term is across a collection of documents(Corpus).

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

feature_extraction = TfidfVectorizer(min_df = 1)

X_train_features = feature_extraction.fit_transform(X_train)
X_val_features = feature_extraction.transform(X_val)

## Model Building

### 1. Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression 

model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5)

model.fit(X_train_features, y_train)

In [34]:
# prediction on training data

prediction_on_training_data = model.predict(X_train_features) 

In [35]:
from sklearn.metrics import accuracy_score 

accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)
print("Accuracy_on_training_data:",accuracy_on_training_data)

Accuracy_on_training_data: 0.8217301557515482


In [37]:
# predication on validation data 

prediction_on_val_data = model.predict(X_val_features) 

In [38]:
accuracy_on_training_data = accuracy_score(y_val, prediction_on_val_data)
print("Accuracy_on_training_data:",accuracy_on_training_data)

Accuracy_on_training_data: 0.7911558669001751


### 2. Random Forest CLassifier 

In [79]:
from sklearn.ensemble import RandomForestClassifier

random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42, criterion='gini',max_depth=25)
random_forest_model.fit(X_train_features, y_train)

In [80]:
# Predicting on train data 

rfc_predictio_train = random_forest_model.predict(X_train_features)

In [81]:
# Evaluate the model on the traing set
accuracy = accuracy_score(y_train, rfc_predictio_train)
print(f"Validation Accuracy: {accuracy:.2%}")

Validation Accuracy: 74.03%


In [82]:
# Predict on the validation set
rfc_predictio_val = random_forest_model.predict(X_val_features)

In [83]:
# Evaluate the model on the validation set
accuracy = accuracy_score(y_val, rfc_predictio_val)
print(f"Validation Accuracy: {accuracy:.2%}")

Validation Accuracy: 71.54%


### 3. SVC (Support Vector Machine)

In [103]:
from sklearn.svm import SVC

svm_model = SVC(kernel='sigmoid')  
svm_model.fit(X_train_features, y_train)

In [104]:
# Make predictions on the traning set
train_svc_predictions = svm_model.predict(X_train_features)

In [105]:
accuracy = accuracy_score(y_train, train_svc_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9071120285231751


In [106]:
# Make predictions on the validation set
val_svc_predictions = svm_model.predict(X_val_features)

In [107]:
accuracy = accuracy_score(y_val, val_svc_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8029772329246935


#### 4. MultinomialNB


In [135]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB(alpha=1)
nb_model.fit(X_train_features, y_train)

In [136]:
MNB_train_predictions = nb_model.predict(X_train_features)

In [137]:
accuracy = accuracy_score(y_train, MNB_train_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9084255957965848


In [138]:
# Make predictions on the validation set
MNB_test_predictions = nb_model.predict(X_val_features)

In [139]:
# Evaluate the model
accuracy = accuracy_score(y_val, MNB_test_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8025394045534151


### CNN

In [140]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [145]:
# Convert sparse matrices to dense arrays
X_train_dense = X_train_features.toarray()
X_val_dense = X_val_features.toarray()


In [146]:
max_length = max(max(X_train_dense.shape[1], 0), max(X_val_dense.shape[1], 0))


In [147]:
# Pad sequences
X_train_pad = pad_sequences(X_train_dense, maxlen=max_length, padding='post')
X_val_pad = pad_sequences(X_val_dense, maxlen=max_length, padding='post')

In [None]:
# Build a simple feedforward neural network
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_length),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])