# Discovery One
## Twitter Disaster Prediction NLP

### Imports

In [None]:
# necessary imports
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import string
import re
import spacy
nlp = spacy.load("en_core_web_sm-2.3.1", disable=["tagger", "parser", "ner"])
import pandas as pd
from nltk.corpus import wordnet
import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
import pickle
import os

# Word Sense Disambiguation

## Load data

In [None]:
def load_data(path, truth=0):
    """Read tweet texts, and optionally their labels, from a CSV file.

    Args:
        path: Location of a CSV file with a "text" column (and a "target"
            column when ``truth`` is truthy).
        truth: When truthy, the "target" column is collected as well.

    Returns:
        A ``(texts, labels)`` pair of lists. ``labels`` is empty when
        ``truth`` is falsy.
    """
    frame = pd.read_csv(path)
    texts = [frame.at[row, "text"] for row in frame.index]
    labels = [frame.at[row, "target"] for row in frame.index] if truth else []
    return texts, labels

def load_data_indices(path, truth=0):
    """Read ``(id, text)`` pairs, and optionally labels, from a CSV file.

    Args:
        path: Location of a CSV file with "id" and "text" columns (and a
            "target" column when ``truth`` is truthy).
        truth: When truthy, the "target" column is collected as well.

    Returns:
        A ``(pairs, labels)`` tuple where ``pairs`` is a list of
        ``(id, text)`` tuples and ``labels`` is a list that stays empty when
        ``truth`` is falsy.
    """
    frame = pd.read_csv(path)
    pairs = [(frame.at[row, "id"], frame.at[row, "text"]) for row in frame.index]
    labels = []
    if truth:
        labels.extend(frame.at[row, "target"] for row in frame.index)
    return pairs, labels

# Training split: texts with their labels, plus the same texts paired with ids.
x_train, y_train = load_data("train.csv", truth=1)
x_train_indices, _ = load_data_indices("train.csv", truth=0)
# Test split: labels are not requested, so the discarded second value is empty.
x_test, _ = load_data("test.csv", truth=0)
x_test_indices, _ = load_data_indices("test.csv", truth=0)

### Create a y_test array
I inner-merge a test_predictions file with the ground truth file on `id`. This drops any ids in the ground truth file that do not appear in the predictions file, such as ids that are used in our training set.

Then I will take the target_groundtruth column from the merged dataframe, and place that array into the variable y_test

In [None]:
# Load the saved test-set predictions and the ground-truth labels.
df_test = pd.read_csv('test_predictions_cv.csv')
df_groundtruth = pd.read_csv('ground_truth.csv')

# Inner-join on 'id' so only ids present in both files are kept. Columns that
# share a name in both files receive the '_predicted' / '_groundtruth' suffixes.
merged_df = pd.merge(df_test, df_groundtruth, on='id', suffixes=('_predicted', '_groundtruth'))

# Select the ground-truth labels by name rather than by position, so an extra
# column in either CSV cannot silently shift which column is used.
# NOTE(review): assumes both files carry a 'target' column -- confirm.
y_test = merged_df['target_groundtruth']

### Strip punctuation and get word features

I am using the string package to strip punctuation marks, and a regex to get rid of any http/com data.

Then I clean the words, creating a new variable to hold the cleaned individual words, before combining into a final array of all the words in the data.

Finally, I flatten the 2D array in order to find the frequency of the words.

In [None]:
# Preprocessing function to clean HTML tags and perform basic cleanup
def preprocessor(text):
    """Strip HTML tags and non-word characters from ``text`` and lowercase it.

    Args:
        text: The value to clean. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        The lowercased string with every tag and every run of non-word
        characters removed, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # Match one tag at a time so that a lone tag such as "<b>" is removed
    # together with its name, not only tags that appear back to back.
    text = re.sub(r"<[^>]*>", "", text)
    # Raw string: "\W" inside a plain literal is an invalid escape sequence.
    return re.sub(r"\W+", "", text.lower())

# Function to tokenize, remove stop words, and lemmatize
def tokenize(doc):
    """Split ``doc`` into cleaned lemmas, skipping stop words.

    Args:
        doc: Text to run through the module-level ``nlp`` pipeline.

    Returns:
        A list with ``preprocessor`` applied to the lemma of every token
        that is not flagged as a stop word, in document order.
    """
    cleaned = []
    for token in nlp(doc):
        if token.is_stop:
            continue
        cleaned.append(preprocessor(token.lemma_))
    return cleaned

# Tokenize and clean every training and test document
all_words_xtrain = list(map(tokenize, x_train))
all_words_xtest = list(map(tokenize, x_test))

# Training documents come first, followed by the test documents
cleaned_words = all_words_xtrain + all_words_xtest

# Show the cleaned tokens of the first document
print(cleaned_words[0])

# Collapse the list of token lists into one flat list of tokens
flattened_words = [token for doc_tokens in cleaned_words for token in doc_tokens]

# Tally how many times each token occurs
all_words = nltk.FreqDist(flattened_words)

#### Create feature sets for training data

In [None]:
def find_features(document_words, vocabulary=None):
    """Build a bag-of-words presence feature dict for one document.

    Args:
        document_words: Iterable of the (hashable) tokens in the document.
        vocabulary: Words to emit a feature for. Defaults to the module-level
            ``word_features`` list when omitted.

    Returns:
        A dict mapping every vocabulary word to ``True`` if it occurs in
        ``document_words`` and ``False`` otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # Build the set once so each membership test is a hash lookup instead of
    # a scan of the whole document for every vocabulary word.
    present = set(document_words)
    return {word: (word in present) for word in vocabulary}
# Vocabulary: every distinct token counted in all_words
word_features = list(all_words)

In [None]:
# Pair each tokenized training document with its label
documents_train = list(zip(all_words_xtrain, y_train))

# Turn every (tokens, label) pair into a (feature dict, label) pair
training_set = [(find_features(doc_tokens), target) for doc_tokens, target in documents_train]

#### Create feature sets for testing data

In [None]:
# Pair each tokenized test document with its ground-truth label.
# zip stops at the shorter input, so any unmatched trailing items are dropped.
documents_test = list(zip(all_words_xtest, y_test))

# Turn every (tokens, label) pair into a (feature dict, label) pair
testing_set = [(find_features(doc_tokens), target) for doc_tokens, target in documents_test]

#### Prediction Grading

In [None]:
def evaluate_predictions(predictions, ground_truth):
    """Print confusion-matrix counts, precision and recall for binary labels.

    Args:
        predictions: Iterable of predicted labels (1 = positive, 0 = negative).
        ground_truth: Iterable of true labels, aligned with ``predictions``.

    Pairs in which either value is something other than 0 or 1 are not
    counted. Precision or recall is reported as 0.0 when its denominator is
    zero, instead of raising ``ZeroDivisionError``.
    """
    tp = fp = tn = fn = 0
    # Walk both label sequences in lockstep; zip stops at the shorter one.
    for pred, truth in zip(predictions, ground_truth):
        if pred == 1 and truth == 1:
            tp += 1
        elif pred == 0 and truth == 0:
            tn += 1
        elif pred == 1 and truth == 0:
            fp += 1
        elif pred == 0 and truth == 1:
            fn += 1

    # With no predicted positives (or no actual positives) the ratio is
    # undefined; report 0.0 rather than crashing after the loop.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0

    # Print the counts of TP, FP, TN, FN and the derived metrics
    print(f"True Positives (TP): {tp}")
    print(f"False Positives (FP): {fp}")
    print(f"True Negatives (TN): {tn}")
    print(f"False Negatives (FN): {fn}")
    print(f"Precision (TP/TP+FP): {precision}")
    print(f"Recall (TP/TP+FN): {recall}")

### WSD with Naive Bayes

In this section I am training the Naive Bayes Classifier on the training set.

I am saving the classifier as a pickle file so that I do not have to retrain every time I want to look at predictions for the testing data.

In order to train the classifier over again (for instance if you are changing the training set or modifying the options) you must delete the appropriate pickle file.

This process will take quite a bit of time (>5 minutes), even using maximum cores and RAM from Jupyter.

In [None]:
if not os.path.exists("naivebayes.pickle"):
    # No cached model on disk: train one and cache it for later runs
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    with open("naivebayes.pickle", "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)
else:
    # Reuse the cached model. Unpickling can execute code stored in the
    # file, so only load a pickle that this notebook wrote itself.
    with open("naivebayes.pickle", "rb") as classifier_f:
        classifier = pickle.load(classifier_f)

# Predict a label for the feature dict of every test document
test_predictions = [classifier.classify(feature_dict) for feature_dict, _label in testing_set]
accuracy = nltk.classify.accuracy(classifier, testing_set)
print("Naive Bayes Classifier accuracy percent:", accuracy * 100)
evaluate_predictions(test_predictions, y_test)

### WSD with MultinomialNB

In this section, I am training another Naive Bayes classifier, scikit-learn's Multinomial Naive Bayes, through NLTK's SklearnClassifier wrapper.
The accuracy is slightly better with this algorithm.

In [None]:
if not os.path.exists("MNB_classifier.pickle"):
    # No cached model on disk: train one and cache it for later runs
    MNB_classifier = SklearnClassifier(MultinomialNB())
    MNB_classifier.train(training_set)
    with open("MNB_classifier.pickle", "wb") as save_classifier:
        pickle.dump(MNB_classifier, save_classifier)
else:
    # Reuse the cached model. Unpickling can execute code stored in the
    # file, so only load a pickle that this notebook wrote itself.
    with open("MNB_classifier.pickle", "rb") as classifier_f:
        MNB_classifier = pickle.load(classifier_f)

# Predict a label for the feature dict of every test document
test_predictions_MNB = [MNB_classifier.classify(feature_dict) for feature_dict, _label in testing_set]
accuracy_MNB = nltk.classify.accuracy(MNB_classifier, testing_set)
print("MultinomialNB accuracy percent:", accuracy_MNB * 100)
evaluate_predictions(test_predictions_MNB, y_test)