In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [None]:
# Import the training dataset with only the important columns.
# Columns 3 and 4 are accessed later by name as `text` and `target`.
train_dataset = pd.read_csv("nlp-getting-started/train.csv", usecols=[3, 4])

# The test set needs the tweet text for prediction AND the id column,
# because the submission file is built from test_dataset['id'].
# Loading only the text column makes that lookup fail with a KeyError.
test_dataset = pd.read_csv("nlp-getting-started/test.csv", usecols=["id", "text"])

In [None]:
# Preview the first 5 rows of the training dataset
print("Training set: ", train_dataset.head(n=5))

In [None]:
# Preview the first 5 rows of the test dataset
print("Testing set: ", test_dataset.head(n=5))

In [None]:
# Check for duplicates: compare the `count` and `unique` rows of the summary
display(train_dataset["text"].describe())

In [None]:
# From the output we can see there are only 7503 unique values out of 7613 values,
# showing the dataset contains duplicates.
# Next we plot a pie chart to show the class distribution of the dataset.
counts = train_dataset.target.value_counts()

# Map the values 1 and 0 to 'Disaster' and 'Not Disaster' respectively.
# The labels are derived from counts.index so each label lines up with its own
# wedge: value_counts() orders by frequency, whereas unique() orders by first
# appearance, so mixing the two can swap the labels.
values = ['Disaster' if label == 1 else 'Not Disaster' for label in counts.index]

plt.pie(counts, labels=values, autopct='%1.1f%%')
plt.title("The Number of Tweets")
plt.show()

In [None]:
#We can see the data is somewhat evenly distributed

In [None]:
# Cleaning the dataset

# Drop rows that are exact duplicates of an earlier row (compared across all
# loaded columns), keeping the first occurrence. The frame is modified in place.
train_dataset.drop_duplicates(keep="first", inplace=True)


#Function for cleaning text
def clean_text(text):
    """Normalize a tweet string for vectorization.

    The text is lowercased, every character that is not an ASCII letter,
    digit or whitespace is deleted (not replaced by a space), runs of
    whitespace are collapsed to a single space, and leading/trailing
    whitespace is stripped.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = text.lower()

    # Delete punctuation and special characters outright
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)

    # Collapse whitespace runs, then trim both ends
    return re.sub(r'\s+', ' ', alnum_only).strip()

# Build a cleaned copy of the training data: the 'text' column is replaced by
# its cleaned version while train_dataset itself is left untouched.
cleaned_train = train_dataset.assign(text=train_dataset['text'].apply(clean_text))

# Print the cleaned dataset
print(cleaned_train)



In [None]:
# Count the missing values in each column of the cleaned training data
null_values = cleaned_train.isna().sum()
print(null_values)

In [None]:
# Plot word clouds for the two classes (disaster and non-disaster) side by side
fig, ax = plt.subplots(1, 2, figsize=(15, 7))

# Both clouds share the same rendering settings
cloud_settings = {'background_color': 'white', 'width': 800, 'height': 600}

is_disaster = cleaned_train['target'] == 1
is_not_disaster = cleaned_train['target'] == 0

# Each cloud is generated from all tweets of one class joined into one string
disaster_words = WordCloud(**cloud_settings).generate(
    " ".join(cleaned_train.loc[is_disaster, 'text']))
not_disaster_words = WordCloud(**cloud_settings).generate(
    " ".join(cleaned_train.loc[is_not_disaster, 'text']))

panels = (
    (disaster_words, '\nDisaster Words\n'),
    (not_disaster_words, '\nNot Disaster Words\n'),
)
for axis, (cloud, heading) in zip(ax, panels):
    axis.imshow(cloud, interpolation='bilinear')
    axis.set_title(heading, fontsize=16)
    axis.axis('off')
print('\n')

plt.show()

In [None]:
# Processing data for training our models

# Separate the cleaned tweets (features X) from their labels (y)
X, y = cleaned_train['text'].values, cleaned_train['target'].values

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Token counts are re-weighted into TF-IDF features
count_vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Fit both steps on the training split only...
X_train_counts = count_vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# ...and reuse the fitted steps, without refitting, on the held-out split
X_test_counts = count_vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)



In [None]:
# Train 3 different models on the preprocessed dataset
# and compare their metrics to pick the best one.

In [None]:
# Train a Logistic Regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train_tfidf, y_train)

# Train a Multinomial Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Train a Linear Support Vector Classifier (SVC) model.
# LinearSVC's solver uses a random number generator, so the seed is fixed
# (same value as the train/test split) to make the fitted model and its
# metrics repeatable across runs.
svc_model = LinearSVC(random_state=42)
svc_model.fit(X_train_tfidf, y_train)


In [None]:
# Predict on the held-out test split with each trained model
logreg_pred, nb_pred, svc_pred = (
    fitted.predict(X_test_tfidf)
    for fitted in (logreg_model, nb_model, svc_model)
)


In [None]:
# Compare the metrics of the models
def print_metrics(y_true, y_pred, model_name):
    """Print accuracy, precision, recall and F1 for one model's predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    model_name : str
        Name shown in the report header.

    Each score is printed with four decimal places, followed by a blank line.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    # Compute every score before printing anything
    results = [(label, scorer(y_true, y_pred)) for label, scorer in scorers]

    print(f"Metrics for {model_name}:")
    for label, value in results:
        print(f"{label}: {value:.4f}")
    print()

# Report each model exactly once (the calls were previously duplicated,
# which printed every report twice).
for predictions, display_name in (
    (logreg_pred, "Logistic Regression"),
    (nb_pred, "Multinomial Naive Bayes"),
    (svc_pred, "Linear SVC"),
):
    print_metrics(y_test, predictions, display_name)

In [None]:
# Use the selected model to classify the data in the test 
# dataset and map it to a .csv file for submission

In [None]:
# Transform the test dataset with the same cleaning and the already-fitted
# vectorizer / TF-IDF transformer used for training
print(test_dataset.keys())
test_dataset_cleaned = test_dataset.assign(
    text=test_dataset['text'].apply(clean_text)
)
X_submission = test_dataset_cleaned['text'].values
X_submission_counts = count_vectorizer.transform(X_submission)
X_submission_tfidf = tfidf_transformer.transform(X_submission_counts)

In [None]:
# Make predictions on the submission dataset
def model_classification(model, model_name="model"):
    """Predict the submission set with `model` and write the result to CSV.

    Relies on two notebook-level variables: `X_submission_tfidf` (the
    features passed to `model.predict`) and `test_dataset`, which must
    contain an 'id' column.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    model_name : str, default "model"
        Prefix of the output file, written to
        ``nlp-getting-started/<model_name>_submission.csv``.
    """
    predicted_targets = model.predict(X_submission_tfidf)

    # Pair every test id with its predicted label
    submission_df = pd.DataFrame({
        'id': test_dataset['id'],
        'target': predicted_targets,
    })

    # Write the submission file without the DataFrame index
    output_path = "".join(("nlp-getting-started/", model_name, "_submission.csv"))
    submission_df.to_csv(output_path, index=False)

In [None]:
# Write one submission file per trained model:
# Logistic Regression, Multinomial Naive Bayes and Linear SVC
selected_models = (
    (logreg_model, "logistic_regression_model"),
    (nb_model, "multinomial_naive_bayes_model"),
    (svc_model, "linear_support_vector_classifier_model"),
)
for fitted_model, file_prefix in selected_models:
    model_classification(fitted_model, file_prefix)