In [1]:
# e.g. if using google colab import drive, uncomment lines below
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# import packages

import os
import sklearn
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.linear_model import LinearRegression as sk_OLS
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import torch
import torch.nn.functional as F
import math

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords

import string
import re

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Part (a): Download the data

In [3]:
#====================================================#
# YOUR CODE HERE:
#   Import train and test csv files.
#   You should use the pd.read_csv function.
#   You should set the index_col parameter to equal 'id'.
#====================================================#

train_data = pd.read_csv('train.csv', index_col = 'id')
test_data  = pd.read_csv('test.csv', index_col = 'id')

#====================================================#
# END YOUR CODE
#====================================================#


In [4]:
train_data

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


In [5]:
#====================================================#
# YOUR CODE HERE:
#   Get the index values for X_train and y_train.
#   Get the data values for X_train and y_train.
#   Get the index values for X_test.
#   Get the index values for y_test.
#====================================================#

# get train indices
X_train_id = train_data.index
y_train_id = train_data.index
# get train data
X_train    = train_data.drop(labels = 'target', axis = 1)
y_train    = train_data.target

# get test indices
X_test_id  = test_data.index
# get test data
X_test     = test_data

#====================================================#
# END YOUR CODE
#====================================================#

print(f"Train Data Shape: {X_train.shape}")
print(f"Test Data Shape: {X_test.shape}")

print(f"Number of labels = 1 in train dataset as percentage: {((y_train == 1).sum() / (X_train.shape[0])) * 100:0.2f}%")
print(f"Number of labels = 0 in train dataset as percentage: {((y_train == 0).sum() / (X_train.shape[0])) * 100:0.2f}%")

Train Data Shape: (7613, 3)
Test Data Shape: (3263, 3)
Number of labels = 1 in train dataset as percentage: 42.97%
Number of labels = 0 in train dataset as percentage: 57.03%


### Part (a), Question 1: How many training and test data points are there?

### Answer:
> + In the training set, there are 7613 observations
> + In the testing set, there are 3263 observations

### Part (a), Question 2: what percentage of the training tweets are of real disasters, and what percentage is not?

### Answer:
> + 42.97% of the training tweets contain real disasters
> + 57.03% of the training tweets do not contain real disasters

# Part (b): Split the training data.

In [6]:
#====================================================#
# YOUR CODE HERE:
#  You should use the sklearn.model_selection.train_test_split
#     parameter to perform the train/development split
#   Set the test_size to 0.30.
#   Set the random_stat parameter to 42.
#====================================================#

X_train_orig, X_develop_orig, y_train_orig, y_develop_orig = train_test_split(X_train, y_train, test_size = 0.3, random_state = 42)

#====================================================#
# END YOUR CODE
#====================================================#

# Part (c): Preprocess the data.

Reason: By converting all words to lowercase, we have a more consistent format, and we will not need to worry about uppercase letter anymore.

In [7]:
# Convert all the words to lowercase
def lowercase(text):
    return text.lower()

Reason: When we lemmatize words, differemt variations of words (e.g., running, runs, and ran) are converted to their base form. This will simplify the analysis and increase the overall accuracy of the model

In [8]:
def get_pos(word):
    # create a dict for mapping
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    # get the right tag
    tag = nltk.pos_tag([word])[0][1][0].upper()
    
    # retrieve the tag for lemmatize
    return tag_dict.get(tag, wordnet.NOUN)

# Lemmatize all the words
def lemmatization(text):
    # Create a WordNetLemmatizer object    
    lemmatizer = WordNetLemmatizer()
    
    # Splitting text into individual words or tokens
    words = word_tokenize(text)
    
    # Apply lemmatization to each token in the list of words
    lemmatized_words = [lemmatizer.lemmatize(word, pos = get_pos(word)) for word in words]
    
    return ' '.join(lemmatized_words)

Reason: Since punctuations in the sentence are not really meaningful in our analysis, eliminating all of them will make our analysis easier.

In [9]:
# Strip punctuation
def punctuation_removal(text):
    # create a table for translation
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    
    # translate the text using the table defined above
    return text.translate(table)

Reason; Same as punctuations, the stopwords in the sentence are not meaningful and cannot provide any information. Therefore, eliminating them will reduce the complexity of the text and make our analysis simpler.

In [10]:
# Strip the stop words, e.g.,“the”,“and”,“or”.
def stopword_removal(text):
    # list of stopwords in English
    stop_words = stopwords.words('english')
    
    # split the text into list
    words = word_tokenize(text.strip())
    
    # filter out words that are not stopwords
    words_filtered = [word for word in words if word not in stop_words]
    
    return ' '.join(words_filtered)

Reason: URLs often contain different combinations of alphanumerics that do not contribute any meaningful information to the NLP tasks, eliminating them will help us reduce noise and focus on the text that are useful for analysis.

In [11]:
# Strip urls.
def url_removal(text):
    return re.sub('https?://\S+|www\.\S+', '', text)

Reason: Obviously, @ symbol does not have any meaning.

In [12]:
# Strip @
def at_removal(text):
    return text.replace('@','')

Reason: There are some strange characters (e.g., "Ûªs") that do not look like any kind of language. They are some kind of Mojibake that do not have any meaning. We need to eliminate them to ensure the text quality.Also, numbers are not really useful and thus we can eliminate them

In [13]:
# Remove anything else except alphabet
def special(text):
    return re.sub('[^a-zA-Z\s]', '', text)

In [14]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function to obtain the pre-processed
#  X_train and X_develop
#  Note that we suggest you to do every sub-question in a dedicated Python
#  function to make the code more structured and less error-prone.
#  With a function, you can clearly test each part when you encounter an error.
#  You can also create your own simple input data (e.g. just one sample) to
#  test the correctness of a function.
#========================================================================#
def pre_process(data):
    ''' the argument for this function is a dataframe'''
    data.text = data.text.apply(lambda x : lowercase(x))
    data.text = data.text.apply(lambda x : lemmatization(x))
    data.text = data.text.apply(lambda x : punctuation_removal(x))
    data.text = data.text.apply(lambda x : stopword_removal(x))
    data.text = data.text.apply(lambda x : url_removal(x))
    data.text = data.text.apply(lambda x : at_removal(x))
    data.text = data.text.apply(lambda x : special(x))
    
    preproc_data = data
    #========================================================================#
    #  This function should return the pre-processed data
    #========================================================================#
    return preproc_data # Feel free to change the variable name

# get the preprocessed data
X_train_preproc   = pre_process(X_train_orig)
X_develop_preproc = pre_process(X_develop_orig)

# Part (d): Bag of words model.

#### In order to find the optimal threshold m, we need to iteratively test which value of m will yield the best model.

In [15]:
from sklearn.model_selection import cross_val_score

m_lst = list(range(1,11))
best_m = None
highest_score = -1

for i in m_lst:
    count_vect = CountVectorizer(binary = True, min_df = i)
    X_train = count_vect.fit_transform(X_train_preproc.text).toarray() # toarray() for better inspection
    X_develop = count_vect.transform(X_develop_preproc.text).toarray()
    
    model = LogisticRegression()
    scores = cross_val_score(model, X_train, y_train_orig, cv = 5)
    
    mean_score = scores.mean()
    
    if mean_score > highest_score:
        highest_score = mean_score
        best_m = i
        
print(f'The best m for min_df is {best_m}')
print(f'The highest score for this best_m is {highest_score}')

The best m for min_df is 1
The highest score for this best_m is 0.7890804992556968


#### Using cross validation, the best value for "min_df" is 1, which will produce the highest score when we conduct Logistic Rregression.

In [16]:
# There is no need to construct a function for this

# vectorize the training set
count_vect = CountVectorizer(binary = True, min_df = best_m)

# transform the text into a count matrix
X_train = count_vect.fit_transform(X_train_preproc.text).toarray() # toarray() for better inspection

# Use the same count_vect object to transform the development set
X_develop = count_vect.transform(X_develop_preproc.text).toarray()

# Part (e): Logistic regression.

#### logistic_without_regularization:
> + The F1 score for training set is 0.9961, meaning the model fits the data pretty well.
> + However, the F1 score for development set is 0.7288
> + Therefore, there could be some overfitting in this case

In [17]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function for logistic regression
#  without regularization terms.
#  You will be training logistic regression models using bag of words
#  feature vectors obtained in part (d).
#========================================================================#

def logistic_without_regularization(X_train, Y_train, X_develop, Y_develop):
    # initialize your logistic regression model
    model = LogisticRegression(penalty = 'none', multi_class = 'ovr')
    # then fit your model to the train data
    model.fit(X_train, Y_train)
    # then generate your prediction for the training set
    y_train_no_reg = model.predict(X_train)

    # then generate your prediction for the development set
    y_develop_no_reg = model.predict(X_develop)
    #========================================================================#
    #  This function should train a logistic regression model without
    #  regularization terms.
    #  Report the F1 score in your training and in your development sets.
    #========================================================================#
    return y_train_no_reg, y_develop_no_reg

# get the F1 train and develop scores
F1_train_no_reg = f1_score(y_train_orig, logistic_without_regularization(X_train, y_train_orig, X_develop, y_develop_orig)[0])

F1_develop_no_reg = f1_score(y_develop_orig, logistic_without_regularization(X_train, y_train_orig, X_develop, y_develop_orig)[1])

# print the F1 train and develop scores
print(f"F1 for training set: {F1_train_no_reg:.4f}")
print(f"F1 for development set: {F1_develop_no_reg:.4f}")

F1 for training set: 0.9961
F1 for development set: 0.7288


#### logistic_L1_regularization:
> + The F1 score for training set is 0.8501, which is also good. Compared to 0.9961 in the logistic model without regularization, we introduce some bias in order to address overfitting.
> + In logistic model with L1 regularization, the F1 score for development set is 0.7478, which is larger than 0.7288 (i.e., the F1 score of development set of logistic model without regularization.
> + Since we slightly increase the performance of the model on the development set, this logistic model with L1 regularization performs better than the one without regularization.

In [18]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function for logistic regression
#  with L1 regularization.
#  You will be training logistic regression models using bag of words
#  feature vectors obtained in part (d).
#========================================================================#
def logistic_L1_regularization(X_train, Y_train, X_develop, Y_develop):
    # initialize your logistic regression model
    model = LogisticRegression(penalty = 'l1', solver = 'liblinear', multi_class = 'ovr')
    # then fit your model to the train data
    model.fit(X_train, Y_train)
    # then generate your prediction for the training set
    y_train_L1_reg = model.predict(X_train)

    # then generate your prediction for the development set
    y_develop_L1_reg = model.predict(X_develop)
    #========================================================================#
    #  This function should train a logistic regression model with l1
    #  regularization terms.
    #  Report the F1 score in your training and in your development sets.
    #========================================================================#
    return y_train_L1_reg, y_develop_L1_reg

# get the F1 train and develop scores
F1_train_L1_reg = f1_score(y_train_orig, logistic_L1_regularization(X_train, y_train_orig, X_develop, y_develop_orig)[0])

F1_develop_L1_reg = f1_score(y_develop_orig, logistic_L1_regularization(X_train, y_train_orig, X_develop, y_develop_orig)[1])

# print the F1 train and develop scores
print(f"F1 for training set: {F1_train_L1_reg:.4f}")
print(f"F1 for development set: {F1_develop_L1_reg:.4f}")

F1 for training set: 0.8503
F1 for development set: 0.7471


#### logistic_L2_regularization:
> + The F1 score for training set is 0.9627, meaning the model fits the training data pretty well.
> + In logistic model with L2 regularization, the F1 score for development set is 0.7533, which is larger than 0.7478 (i.e., the F1 score of development set of logistic model with L1 regularization.

In [19]:
#=======================================================================+#
# YOUR CODE HERE:
#  You should complete the following function for logistic regression
#  with L2 regularization.
#  You will be training logistic regression models using bag of words
#  feature vectors obtained in part (d).
#========================================================================#
def logistic_L2_regularization(X_train, Y_train, X_develop, Y_develop):
    # initialize your logistic regression model
    model = LogisticRegression(penalty = 'l2', multi_class = 'ovr')
    # then fit your model to the train data
    model.fit(X_train, Y_train)
    # then generate your prediction for the training set
    y_train_L2_reg = model.predict(X_train)

    # then generate your prediction for the development set
    y_develop_L2_reg = model.predict(X_develop)
    #========================================================================#
    #  This function should train a logistic regression model with L2
    #  regularization terms.
    #  Report the F1 score in your training and in your development sets.
    #========================================================================#
    return y_train_L2_reg, y_develop_L2_reg

# get the F1 train and develop scores
F1_train_L2_reg = f1_score(y_train_orig, logistic_L2_regularization(X_train, y_train_orig, X_develop, y_develop_orig)[0])
F1_develop_L2_reg = f1_score(y_develop_orig, logistic_L2_regularization(X_train, y_train_orig, X_develop, y_develop_orig)[1])

# print the F1 train and develop scores
print(f"F1 for training set: {F1_train_L2_reg:.4f}")
print(f"F1 for development set: {F1_develop_L2_reg:.4f}")

F1 for training set: 0.9627
F1 for development set: 0.7533


### Which one of the three classifiers performed the best on your training and development set? Did you observe any overfitting and did regularization help reduce it? Support your answers with the classifier performance you got.

### Answer:
> + In general, the logistic regression model with L2 regularization performs best since it has the highest F1 scores for the development sets.
> + In the logistic regression model without regularization, we notice that there exists overfitting since the F1 score for training set is much larger than the F1 score for the development set.
> + By introducing some bias to the training set, both L1 and L2 regularization slightly reduce the overfitting since they have higher F1 score of the development set compared to the F1 score of development set of logistic regression model without regularization.

### Inspect the weight vector of the classifier with L1 regularization (in other words, look at the θ you got after training). You can access the weight vector of the trained model using the coef_attribute of a LogisticRegression instance. What are the most important words for deciding whether a tweet is about a real disaster or not? You might need to run some code (feel free to insert a code cell below).

In [20]:
model = LogisticRegression(penalty = 'l1', solver = 'liblinear')
model.fit(X_train, y_train_orig)

coefficients = model.coef_[0]
feature_importance = abs(coefficients)
sorted_indices = feature_importance.argsort()[::-1]

feature_name = count_vect.get_feature_names()

top_words = 10
for i,j in enumerate(range(top_words)):
    print(i+1, feature_name[sorted_indices[j]])

1 spill
2 derailment
3 typhoon
4 hiroshima
5 earthquake
6 migrant
7 debris
8 worth
9 wildfire
10 outbreak


### Answer: Most important words for deciding whether a tweet is about a real disaster or not (Top 10):
> + 1 -> spill
> + 2 -> derailment
> + 3 -> typhoon
> + 4 -> hiroshima
> + 5 -> earthquake
> + 6 -> migrant
> + 7 -> debris
> + 8 -> worth
> + 9 -> wildfire
> + 10 -> outbreak

# Part (f): Bernoulli Naive Bayes.

### The reference for this code is ChatGPT, however, I implemented it myself.

In [21]:
class BernoulliNB(object):
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y): 
        # class k
        self.K = np.unique(y)
        
        # create a dictionary to store conditional probabilities
        self.conditional_probs = {}
        self.feature_count = {}
        
        # create an array to store probs for each class
        self.K_probs = np.array([])
        
        for k in self.K:
            # filter each class
            X_k = X[y == k]
            
            # calculate probs for each class
            self.K_probs = np.append(self.K_probs, X_k.shape[0] / X.shape[0])
            
            # total number of features and feature counts for class k
            total_features = X_k.sum()
            feature_counts = X_k.sum(axis = 0)
            
            # Laplace smoothing
            self.conditional_probs[k] = (feature_counts + self.alpha) / (total_features + self.alpha * X.shape[1])
            self.feature_count[k] = total_features

        return self

    def predict(self, X):
        y_pred = []

        for i in X:
            post = []
            for k in self.K:
                # log likelihood for k class
                log_lh = np.sum(np.log(self.conditional_probs[k]) * i)

                # log posterior for k class
                log_post = log_lh + np.log(self.K_probs[k])

                post.append(log_post)

            # set the class with the highest log posterior
            y_pred.append(self.K[np.argmax(post)])

        return y_pred

# get the predictions y_train_NB and y_develop_NB
nb = BernoulliNB(alpha=1)
nb.fit(X_train, y_train_orig)
y_train_NB = nb.predict(X_train) # prediction from X_train using model
y_develop_NB = nb.predict(X_develop) # prediction from X_develop using model

# get the F1 train and develop scores
F1_train_NB = f1_score(y_train_orig, y_train_NB)
F1_develop_NB = f1_score(y_develop_orig, y_develop_NB)

# print the F1 train and develop scores
print(f"F1 for training set: {F1_train_NB:.4f}")
print(f"F1 for development set: {F1_develop_NB:.4f}")

F1 for training set: 0.9129
F1 for development set: 0.7588


# Part (g): Model comparison.

Question: Which model performed the best in predicting whether a tweet is of a real disaster or not? Include your performance metric in your response. Comment on the pros and cons of using generative vs discriminative models.

Answer: 
#### Sources: ChatGPT and Classnotes
> + In predicting whether a tweet is of a real disaster or not, the generative classifier (i.e., Bernoulli Naive Bayes) performs best since it has the highest F1 score for the development set, which is 0.7588.
> + **Generative models:**
> + Pros:
>> 1. Generative models require less sample. They perform well with fewer labeled exmaples since they can capture the underlying data distribution.
>> 2. Generative models can handle missing data better since thye model the joint distribution. They can infer missing features and generate samples with missing values.
> + Cons:
>> 1. Compared to discriminative models, generative models are generally more complex and computationally expensive.
>> 2. If training data is limited, generative models are prone to overfitting since they capture the full data distribution.
> + **Discriminative models:**
> + Pros:
>> 1. In practice, discriminative models are often more accurate for classification tasks.
>> 2. Discriminative models are simpler to train since they focus on modeling the decision boundary between classes rather than the whole data distribution.

> + Cons:
>> 1. The performance of discriminative models can be affected by out-of-distribution data because they do not model the uncertainty in the data distribution as explicitly as generative models.
>> 2. Unlike generative models, discriminative models need a larger amount of labeled data to perform well, especially when class boundaries are not well-defined.

Question: hink about the assumptions that Naive Bayes makes. How are the assumptions different from logistic regressions? Discuss whether it is valid and efficient to use Bernoulli Naive Bayes classifier for natural language texts.

Answer:
#### Sources: ChatGPT
#### Differences in assumptions:
> + Logistic regression does not assume independence amoung features. However, Naive Bayes assumes feature independence.
> + Logistic regression assumes a linear relationship between the log-odds of features and the target variable. However,Naive Bayes does not make this assumption, and instead, it models the probability distribution of features given the class.
> + Logistic regression model is sensitive to outliers. However, Naive Bayes is less sensitive to outliers since it estimates probabilities based on feature counts.

#### Use Bernoulli Naive Bayes classifier for natural language texts
> + Bernoulli Naive Bayes is suited for binary text classifications. When we try to calssify documents into two classes such as spam vs. non spam, Bernoulli Naive Bayes treats each feature as binary, indicating whether a word is present (1) or not present (0) in the document.
> + In text classification, the vocabulary can be very large, and Bernoulli Naive Bayes handles high-dimentional data efficiently.
> + For relatively large-scale text classification tasks, Bernoulli Naive Bayes is well-suited since it is computationally efficient.
> + Bernoulli Naive Bayes are less sensitive to noisy words since it only focus on whether words are present or not.

# Part (h): N-gram model.

#### Similar to what we have done in the bag of words section, we need to iteratively test which value of m will yield the best model.

In [22]:
m_lst = list(range(1,11))
best_m = None
highest_score = -1

for i in m_lst:
    count_vect = CountVectorizer(binary = True, min_df = i, ngram_range = (2,2))
    X_train_gram = count_vect.fit_transform(X_train_preproc.text).toarray() # toarray() for better inspection
    X_develop_gram = count_vect.transform(X_develop_preproc.text).toarray()
    
    # use logistic regression with L2 regularization since it performs the best
    model = LogisticRegression(penalty = 'l2', multi_class = 'ovr')
    scores = cross_val_score(model, X_train_gram, y_train_orig, cv = 5)
    
    mean_score = scores.mean()
    
    if mean_score > highest_score:
        highest_score = mean_score
        best_m = i
        
print(f'The best m for min_df is {best_m}')
print(f'The highest score for this best_m is {highest_score}')

The best m for min_df is 2
The highest score for this best_m is 0.7361597477296551


#### Using cross validation, the best value for "min_df" is 2, which will produce the highest score when we conduct Logistic Rregression.

In [23]:
# vectorize the training set
count_vect = CountVectorizer(binary = True, min_df = best_m, ngram_range = (2,2))

# transform the text into a count matrix
X_train_gram = count_vect.fit_transform(X_train_preproc.text).toarray() # toarray() for better inspection

# Use the same count_vect object to transform the development set
X_develop_gram = count_vect.transform(X_develop_preproc.text).toarray()

#### Report the total number of 2-grams in your vocabulary

In [24]:
print(f'The total number of 2-grams in the vocabulary is: {X_train_gram.shape[1]}')

The total number of 2-grams in the vocabulary is: 4055


#### In addition, take 10 2-grams fromyour vocabulary, and print them out.

In [25]:
np.random.seed(42)
grams = np.random.choice(list(count_vect.vocabulary_.keys()), size = 10, replace = False)

print('Below are 10 randomly selected 2-grams from the vocabulary: ')
for i,j in enumerate(grams):
    print(i+1, j)

Below are 10 randomly selected 2-grams from the vocabulary: 
1 militant suicide
2 like mudslide
3 centipede press
4 brian ruebs
5 youtube playlist
6 mod showcase
7 would work
8 content http
9 avalanche http
10 sit right


#### Implement logistic regression models and the Bernoulli Naive Bayes classifier

In [26]:
#=======================================================================+#
# YOUR CODE HERE:
#  Use the functions you already defined "X_train_gram" and "X_develop_gram"
#  to re-run:
#  Logistic Regression with no regularization Model
#  Logistic Regression with L1 regularization Model
#  Logistic Regression with L2 regularization Model
#========================================================================#
# Logistic Regression with no regularization Model
y_train_gram_no_reg, y_develop_gram_no_reg = logistic_without_regularization(X_train_gram, y_train_orig, X_develop_gram, y_develop_orig)

# Logistic Regression with L1 regularization Model
y_train_gram_L1_reg, y_develop_gram_L1_reg = logistic_L1_regularization(X_train_gram, y_train_orig, X_develop_gram, y_develop_orig)

#  Logistic Regression with L2 regularization Model
y_train_gram_L2_reg, y_develop_gram_L2_reg = logistic_L2_regularization(X_train_gram, y_train_orig, X_develop_gram, y_develop_orig)

# Bernoulli Naive Bayes
nb = BernoulliNB(alpha=1)
nb.fit(X_train_gram, y_train_orig)

y_train_gram_NB = nb.predict(X_train_gram)
y_develop_gram_NB = nb.predict(X_develop_gram)
#========================================================================#
# END CODE HERE
#========================================================================#

# get the F1 train and develop scores for no regularization model
F1_train_gram_no_reg = f1_score(y_train_orig, y_train_gram_no_reg)
F1_develop_gram_no_reg = f1_score(y_develop_orig, y_develop_gram_no_reg)

# get the F1 train and develop scores for L1 regularization model
F1_train_gram_L1_reg = f1_score(y_train_orig, y_train_gram_L1_reg)
F1_develop_gram_L1_reg = f1_score(y_develop_orig, y_develop_gram_L1_reg)

# get the F1 train and develop scores for L2 regularization model
F1_train_gram_L2_reg = f1_score(y_train_orig, y_train_gram_L2_reg)
F1_develop_gram_L2_reg = f1_score(y_develop_orig, y_develop_gram_L2_reg)

# get the F1 train and develop scores for Bernoulli NB model
F1_train_gram_NB = f1_score(y_train_orig, y_train_gram_NB)
F1_develop_gram_NB = f1_score(y_develop_orig, y_develop_gram_NB)

# print the F1 train and develop scores for no regularization model
print(f"F1 for training set: {F1_train_gram_no_reg:.2f}")
print(f"F1 for development set: {F1_develop_gram_no_reg:.2f}")

print('-' * 80)

# print the F1 train and develop scores for L1 regularization model
print(f"F1 for training set: {F1_train_gram_L1_reg:.2f}")
print(f"F1 for development set: {F1_develop_gram_L1_reg:.2f}")

print('-' * 80)

# print the F1 train and develop scores for L2 regularization model
print(f"F1 for training set: {F1_train_gram_L2_reg:.2f}")
print(f"F1 for development set: {F1_develop_gram_L2_reg:.2f}")

print('-' * 80)

# print the F1 train and develop scores for Bernoulli NB model
print(f"F1 for training set: {F1_train_gram_NB:.2f}")
print(f"F1 for development set: {F1_develop_gram_NB:.2f}")

F1 for training set: 0.83
F1 for development set: 0.63
--------------------------------------------------------------------------------
F1 for training set: 0.70
F1 for development set: 0.56
--------------------------------------------------------------------------------
F1 for training set: 0.76
F1 for development set: 0.62
--------------------------------------------------------------------------------
F1 for training set: 0.75
F1 for development set: 0.62


### Observations:
> + From the above results, we can see that no matter what model we select, the F1 scores are all really low when we use 2-gram model. However, the F1 scores we get from bag of words model are much better than these results.
> + This implies that, in this problem, bag of words model is a better text representation technique and can produce better results.

# Part (i): Determine performance with the test set.

In [27]:
# we need to concatenate both train and development set
# Also, we choose the bag of word model
all_data_X = np.concatenate((X_train, X_develop), axis = 0)

all_data_y = np.concatenate((y_train_orig, y_develop_orig), axis = 0)

In [28]:
# we need to save the id column of the test data for submission
index_X_test = X_test.index

In [29]:
# Before we train the model, need to preprocess the test data as well
X_test_preproc = pre_process(X_test)

# apply the bag of word model on test data
count_vect = CountVectorizer(binary = True, min_df = 1)

# transform the text into a count matrix
X_train = count_vect.fit_transform(X_train_preproc.text).toarray() # toarray() for better inspection

# Use the same count_vect object to transform the test set
X_test = count_vect.transform(X_test_preproc.text).toarray()

In [30]:
# since the Bernoulli Naive Bayes classifier performs best in this problem, we will use it
nb = BernoulliNB(alpha=1)
nb.fit(all_data_X, all_data_y)

y_prediction = nb.predict(X_test)

In [187]:
# submission
submission = pd.DataFrame()
submission['id'] = index_X_test
submission['target'] = y_prediction

submission.to_csv('submission.csv', index = False)

### Report the resulting F1-score on the test data, as reported by Kaggle

In [188]:
print(f'The score on Kaggle is: {0.78731}')

The score on Kaggle is: 0.78731


### Was this lower or higher than you expected? Discuss why it might be lower or higher than your expectation.

The F1 score (i.e.,0.78731) on the test data is higher than my expectation since the F1 score on the developement set using bag of words model and Bernoulli Naive Bayes is 0.7588. One reason why the result is higher than my expectation could be due to the randomness when splitting the data. Since we set random_state = 42 when running the sklearn function "train_test_split", the train data and development data are fixed every time I run the function. However, if I do not set the random_state, each time I will get a different train data and development data, thus will affect the model performance and produce different F1 score. In this case, I could get a higher, lower, or approximately the same F1 score compared to the one I get from Kaggle, therefore, it is reasonable that the result I get from Kaggle is higher than my expectation.