In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('disaster_tweets_data(DS).csv')

In [6]:
# Check for null values
df.isnull().sum()

# Drop rows with null values (if any)
df.dropna(inplace=True)

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stemmer and lemmatizer
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Function for text preprocessing
def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize the text
    words = text.split()

    # Remove stop words and apply stemming/lemmatization
    words = [lemmatizer.lemmatize(ps.stem(word)) for word in words if word not in stopwords.words('english')]

    return ' '.join(words)

# Apply the preprocessing to the tweets
df['cleaned_tweet'] = df['tweets'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Transform the text data into vectors
X = tfidf.fit_transform(df['cleaned_tweet']).toarray()
y = df['target']


In [9]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Initialize and train the model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_nb = nb_model.predict(X_test)

In [11]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = lr_model.predict(X_test)


In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize and train the model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Predict on the test set
y_pred_knn = knn_model.predict(X_test)


In [13]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix and classification report for Multinomial Naive Bayes
print("Multinomial Naive Bayes:")
print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

# Confusion matrix and classification report for Logistic Regression
print("Logistic Regression:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

# Confusion matrix and classification report for KNN
print("KNN:")
print(confusion_matrix(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))


Multinomial Naive Bayes:
[[770 104]
 [196 453]]
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       874
           1       0.81      0.70      0.75       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523

Logistic Regression:
[[761 113]
 [204 445]]
              precision    recall  f1-score   support

           0       0.79      0.87      0.83       874
           1       0.80      0.69      0.74       649

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523

KNN:
[[868   6]
 [497 152]]
              precision    recall  f1-score   support

           0       0.64      0.99      0.78       874
           1       0.96      0.23      0.38       649

    accuracy                           0.67      1523
   macro

In [14]:
print('The Model with best accuracy is produced by Multinomial Naive Bayes algorithm')

The Model with best accuracy is produced by Multinomial Naive Bayes algorithm
