# Clickbait Detection
## Machine Learning Course Project
---
### Author: Andrea Alberti  
### Date: June 2023
---
## Data: 
The collected dataset includes 32 000 headlines, equally
divided between the ‘clickbait’ and ‘non-clickbait’ classes. 

It is split into training, validation, and test sets consisting of 24 000, 4000 and 4000 samples, respectively. 

The data are stored in text files, with one headline for each line.


## Goal:
Build classifiers for the detection of clickbait headlines. Consider two scenarios: accuracy-oriented and FPR-oriented (i.e. minimizing the false positive rate).

---

```

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import string
import CD_functions as cdf
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.metrics as skm
import pvml

---

### 1 - **DATA ANALYSIS**

The data are analyzed in order to understand how they are structured and how they can be handled.

Notations:

- The test_bait and train_bait files contain some headlines starting with #, so, when np.loadtxt is used, they are loaded with the comment character disabled (comments=None).
- None of the other files has this problem.

In [None]:
# Importing the dataset to check the dimensionality of the data


def _read_headlines(path):
    """Return the non-empty lines of the text file at `path` as a 1-D str array.

    The file is read line by line instead of through np.loadtxt with a newline
    delimiter: recent NumPy releases reject a newline as delimiter, and
    loadtxt's default comment character would cut a headline at its first '#'.
    Reading every file the same way keeps '#' in all six datasets.
    """
    with open(path) as fh:
        # Only the line terminator is removed; blank lines are skipped
        lines = (line.rstrip('\r\n') for line in fh)
        return np.array([line for line in lines if line], dtype=str)


#CLICKBAIT (CLASS 1)
test_bait = _read_headlines('data/clickbait_test.txt')
print('test_bait: ', test_bait.shape)
train_bait = _read_headlines('data/clickbait_train.txt')
print('train_bait: ', train_bait.shape)
validation_bait = _read_headlines('data/clickbait_validation.txt')
print('validation_bait: ', validation_bait.shape)

#NON-CLICKBAIT (CLASS 0)
test_nobait = _read_headlines('data/non_clickbait_test.txt')
print('test_nobait: ', test_nobait.shape)
train_nobait = _read_headlines('data/non_clickbait_train.txt')
print('train_nobait: ', train_nobait.shape)
validation_nobait = _read_headlines('data/non_clickbait_validation.txt')
print('validation_nobait: ', validation_nobait.shape)


---

### 2 - **DATA PRE-PROCESSING**

The data are pre-processed, extracting the features from the headlines, creating the BoW representation.

Create Vocabulary:
- **Tokenization**: the headlines are split into tokens, removing the punctuation and the stopwords.

Create the BoW representation:
- **CountVectorizer**: the headlines are represented as a matrix, where each row is a headline and each column is a token. The value of each cell is the number of times the token appears in the headline.

### EXECUTE JUST ONE TIME

In [None]:
# VOCABULARY WITHOUT STOPWORDS
# Builds one vocabulary per size from the training headlines, then the BoW
# matrix of every split/class for each vocabulary ("stop" file names).

# Define the vocsizes
vocsizes = [100, 200, 400, 800, 1000, 2000, 4000, 8000]


def _export_bow(headlines, vocabulary, label, tag):
    """Create the BoW of `headlines`, print its shape, save it and return it.

    `label` is passed through as the third argument of cdf.create_bow
    (1 is used for the clickbait files, 0 for the non-clickbait ones).
    `tag` is both the name printed next to the shape and the base name of the
    compressed text file written under generated_gitignore/.
    """
    bow = cdf.create_bow(headlines, vocabulary, label)
    print(tag + ': ', bow.shape)
    np.savetxt('generated_gitignore/' + tag + '.txt.gz', bow)
    return bow


# Create the vocabulary (store=True writes it to the given file)
text_list = [train_bait, train_nobait]
for size in vocsizes:
    size = str(size)
    vocabulary = cdf.create_vocabulary(text_list, int(size), 'generated_gitignore/vocabulary_stop'+size+'.txt', store=True, remove_stopwords=True)

# Generate the BoWs
for size in vocsizes:
    size = str(size)

    # Load the vocabulary; the context manager closes the file right away
    with open('generated_gitignore/vocabulary_stop'+size+'.txt', 'r') as fh:
        vocabulary = fh.read().lower().split()

    #CLICKBAIT
    bow_train_bait = _export_bow(train_bait, vocabulary, 1, 'bow_train_bait_stop' + size)
    bow_test_bait = _export_bow(test_bait, vocabulary, 1, 'bow_test_bait_stop' + size)
    bow_validation_bait = _export_bow(validation_bait, vocabulary, 1, 'bow_validation_bait_stop' + size)

    #NON-CLICKBAIT
    bow_train_nobait = _export_bow(train_nobait, vocabulary, 0, 'bow_train_nobait_stop' + size)
    bow_test_nobait = _export_bow(test_nobait, vocabulary, 0, 'bow_test_nobait_stop' + size)
    bow_validation_nobait = _export_bow(validation_nobait, vocabulary, 0, 'bow_validation_nobait_stop' + size)


In [None]:
# VOCABULARY WITH STOPWORDS
# Builds one vocabulary per size from the training headlines, then the BoW
# matrix of every split/class for each vocabulary ("NOstop" file names).

# Define the vocsizes
vocsizes = [100, 200, 400, 800, 1000, 2000, 4000, 8000]


def _export_bow(headlines, vocabulary, label, tag):
    """Create the BoW of `headlines`, print its shape, save it and return it.

    `label` is passed through as the third argument of cdf.create_bow
    (1 is used for the clickbait files, 0 for the non-clickbait ones).
    `tag` is both the name printed next to the shape and the base name of the
    compressed text file written under generated_gitignore/.
    """
    bow = cdf.create_bow(headlines, vocabulary, label)
    print(tag + ': ', bow.shape)
    np.savetxt('generated_gitignore/' + tag + '.txt.gz', bow)
    return bow


# Create the vocabulary (store=True writes it to the given file)
text_list = [train_bait, train_nobait]
for size in vocsizes:
    size = str(size)
    vocabulary = cdf.create_vocabulary(text_list, int(size), 'generated_gitignore/vocabulary_NOstop'+size+'.txt', store=True, remove_stopwords=False)

# Generate the BoWs
for size in vocsizes:
    size = str(size)

    # Load the vocabulary; the context manager closes the file right away
    with open('generated_gitignore/vocabulary_NOstop'+size+'.txt', 'r') as fh:
        vocabulary = fh.read().lower().split()

    #CLICKBAIT
    bow_train_bait = _export_bow(train_bait, vocabulary, 1, 'bow_train_bait_NOstop' + size)
    bow_test_bait = _export_bow(test_bait, vocabulary, 1, 'bow_test_bait_NOstop' + size)
    bow_validation_bait = _export_bow(validation_bait, vocabulary, 1, 'bow_validation_bait_NOstop' + size)

    #NON-CLICKBAIT
    bow_train_nobait = _export_bow(train_nobait, vocabulary, 0, 'bow_train_nobait_NOstop' + size)
    bow_test_nobait = _export_bow(test_nobait, vocabulary, 0, 'bow_test_nobait_NOstop' + size)
    bow_validation_nobait = _export_bow(validation_nobait, vocabulary, 0, 'bow_validation_nobait_NOstop' + size)

---

### 3 - **CLASSIFICATION MODELS**

- 3.1 - **Multinomial Naive Bayes**
- 3.2 - **Logistic Regression**

#### 3.1 - **Multinomial Naive Bayes**

- **Accuracy**: for different vocabulary sizes, the accuracy is computed on the validation set.
- **Confusion Matrix**: for each vocabulary size, the FPR and the accuracy are computed on the validation set.
- **ROC Curve**: for each vocabulary size, the FPR and the TPR are computed on the validation set.
- **Lowest FPR**: for each vocabulary size, the lowest FPR is computed on the validation set and plotted together with the accuracy.

#### 3.1.2 - **Stopwords Removed** (file stop)

CODE BLOCK:

>- COMPUTE THE ACCURACY ON THE VALIDATION SET FOR DIFFERENT VOCABULARY SIZES
>- COMPUTE THE ROC CURVE ON THE VALIDATION SET FOR DIFFERENT VOCABULARY SIZES
>- STORE FOR EACH VOCABULARY SIZE THE LOWEST FPR AND THE ACCURACY ON THE VALIDATION SET

In [None]:
# Multinomial Naive Bayes on the "stop" BoW files: for every vocabulary size,
# log train/validation accuracy, trace the ROC curve over a sweep of bias
# pairs and record the sweep point with the lowest FPR.
vocsizes = [100, 200, 400, 800, 1000, 2000, 4000, 8000]
fprs_sizes, tprs_sizes = [], []

# Bias sweep: row k of `biases` is the pair (b_neg[k], b_pos[k])
b_pos = np.linspace(-5, 5, 50)
b_neg = np.linspace(5, -5, 50)
biases = np.vstack((b_neg, b_pos)).T

acc_csv = 'results/accuracies_mnbc_stop.csv'
low_fpr_csv = 'results/low_fpr_mnbc_stop.csv'

# Recreate both result files so that they start with the header row only
with open(acc_csv, 'w') as f:
    f.write('vocsize,accuracy_train,accuracy_validation\n')
with open(low_fpr_csv, 'w') as f:
    f.write('vocsize,bias_neg,bias_pos,fpr,tpr,acc\n')

for size in map(str, vocsizes):
    bow_dir = 'generated_gitignore/'
    tail = '_stop' + size + '.txt.gz'

    # For each split, clickbait rows are stacked on top of non-clickbait rows
    bow_train_bait = np.loadtxt(bow_dir + 'bow_train_bait' + tail)
    bow_train_nobait = np.loadtxt(bow_dir + 'bow_train_nobait' + tail)
    bow_train = np.vstack((bow_train_bait, bow_train_nobait))

    bow_validation_bait = np.loadtxt(bow_dir + 'bow_validation_bait' + tail)
    bow_validation_nobait = np.loadtxt(bow_dir + 'bow_validation_nobait' + tail)
    bow_validation = np.vstack((bow_validation_bait, bow_validation_nobait))

    # The last column is used as the label, all the others as features
    X_train, y_train = bow_train[:, :-1], bow_train[:, -1]
    X_val, y_val = bow_validation[:, :-1], bow_validation[:, -1]

    # Fit the classifier and predict both sets with the trained biases
    w, b = pvml.multinomial_naive_bayes_train(X_train, y_train)
    predictions_val, scores_val = pvml.multinomial_naive_bayes_inference(X_val, w, b)
    predictions_tra, scores_tra = pvml.multinomial_naive_bayes_inference(X_train, w, b)

    # One accuracy row per vocabulary size
    acc_tra = cdf.accuracy(predictions_tra, y_train)
    acc_val = cdf.accuracy(predictions_val, y_val)
    with open(acc_csv, 'a') as f:
        f.write(','.join((size, str(acc_tra), str(acc_val))) + '\n')

    # ROC points on the validation set, one per bias pair
    fprs, tprs = cdf.roc_curve_biases(pvml.multinomial_naive_bayes_inference, bow_validation, w, biases, size, 'ROC curve MNBC for different biases (stopwords)', plot=True)
    fprs_sizes.append(fprs)
    tprs_sizes.append(tprs)

    # Re-run inference with the bias pair that gave the smallest FPR
    col = np.argmin(fprs)
    pred, s = pvml.multinomial_naive_bayes_inference(X_val, w, biases[col, :])
    acc_low = cdf.accuracy(pred, y_val)
    with open(low_fpr_csv, 'a') as f:
        f.write(f'{size},{biases[col, 0]},{biases[col, 1]},{fprs[col]},{tprs[col]},{acc_low}\n')


# Saved layout: the first two rows are the biases (negatives, then positives);
# each following row is a vocabulary size and each column a bias pair
fprs_arr = np.vstack((biases.T, np.stack(fprs_sizes, axis=0)))
np.savetxt('results/fprs_mnbc_stop.txt.gz', fprs_arr)

tprs_arr = np.vstack((biases.T, np.stack(tprs_sizes, axis=0)))
np.savetxt('results/tprs_mnbc_stop.txt.gz', tprs_arr)

CODE BLOCK:

>- SHOW THE ACCURACIES FOR DIFFERENT VOCABULARY SIZES

In [None]:
# Line plot of train and validation accuracy against the vocabulary size
accuracies = pd.read_csv('results/accuracies_mnbc_stop.csv')
accuracies.plot(
    x='vocsize',
    y=['accuracy_train', 'accuracy_validation'],
    kind='line',
    ylabel='Accuracy (%)',
    xlabel='Vocabulary size',
    title='Multinomial Naive Bayes Classifier (stopwords)',
)
plt.tight_layout()

CODE BLOCK:

>- SHOW THE RESULTS ABOUT THE LOWEST FPR STORED ABOVE

In [None]:
# Display the lowest-FPR results stored for the MNBC ("stop" files)
low_fpr_csv = 'results/low_fpr_mnbc_stop.csv'
plot_title = 'MNBC FPR and Accuracy vs. Vocsize (stopwords)'
cdf.fpr_accs_vocsizes(low_fpr_csv, plot_title)

#### 3.1.3 - **Stopwords Kept** (file NOstop)

CODE BLOCK:

>- COMPUTE THE ACCURACY ON THE VALIDATION SET FOR DIFFERENT VOCABULARY SIZES
>- COMPUTE THE ROC CURVE ON THE VALIDATION SET FOR DIFFERENT VOCABULARY SIZES
>- STORE FOR EACH VOCABULARY SIZE THE LOWEST FPR AND THE ACCURACY ON THE VALIDATION SET

In [None]:
# Multinomial Naive Bayes on the "NOstop" BoW files: for every vocabulary
# size, log train/validation accuracy, trace the ROC curve over a sweep of
# bias pairs and record the sweep point with the lowest FPR.

# Define the vocsizes
vocsizes = [100, 200, 400, 800, 1000, 2000, 4000, 8000]
fprs_sizes = []
tprs_sizes = []

# Define the range for the biases
b_pos = np.linspace(-5, 5, 50) #IF THE ROC CURVE IS NOT COMPLETE, JUST EXTEND THE RANGE
b_neg = np.linspace(5, -5, 50) #IF THE ROC CURVE IS NOT COMPLETE, JUST EXTEND THE RANGE
biases = np.vstack((b_neg, b_pos)).T

# Create file where to store accs
with open('results/accuracies_mnbc_NOstop.csv', 'w') as f: #overwrite the file
    f.write('vocsize,accuracy_train,accuracy_validation\n')

# Create file where to store data for the lowest fpr.
# The file must be recreated with its header on every run: the loop below only
# appends, so without this reset the CSV has no header row and collects
# duplicated rows each time the cell is executed again.
with open('results/low_fpr_mnbc_NOstop.csv', 'w') as f:
    f.write('vocsize,bias_neg,bias_pos,fpr,tpr,acc\n')

for size in vocsizes:
    size = str(size)

    # Load the BoW for training (vstack on the two classes)
    bow_train_bait = np.loadtxt('generated_gitignore/bow_train_bait_NOstop'+size+'.txt.gz')
    bow_train_nobait = np.loadtxt('generated_gitignore/bow_train_nobait_NOstop'+size+'.txt.gz')
    bow_train = np.vstack((bow_train_bait, bow_train_nobait))

    bow_validation_bait = np.loadtxt('generated_gitignore/bow_validation_bait_NOstop'+size+'.txt.gz')
    bow_validation_nobait = np.loadtxt('generated_gitignore/bow_validation_nobait_NOstop'+size+'.txt.gz')
    bow_validation = np.vstack((bow_validation_bait, bow_validation_nobait))

    # Train the multinomial Naive Bayes classifier
    # (the last column is used as the label, all the others as features)
    w, b = pvml.multinomial_naive_bayes_train(bow_train[:, :-1], bow_train[:, -1])

    # Predict the labels of the validation and training sets with the trained biases
    predictions_val, scores_val = pvml.multinomial_naive_bayes_inference(bow_validation[:, :-1], w, b)
    predictions_tra, scores_tra = pvml.multinomial_naive_bayes_inference(bow_train[:, :-1], w, b)

    # Accuracy
    with open('results/accuracies_mnbc_NOstop.csv', 'a') as f:
        f.write(size+','+str(cdf.accuracy(predictions_tra, bow_train[:, -1]))+','+str(cdf.accuracy(predictions_val, bow_validation[:, -1]))+'\n')

    # Compute the ROC curve
    fprs, tprs = cdf.roc_curve_biases(pvml.multinomial_naive_bayes_inference, bow_validation, w, biases, size, 'ROC curve MNBC for different biases', plot=True)
    fprs_sizes.append(fprs)
    tprs_sizes.append(tprs)

    # Find the biases associated with the lowest fpr
    col = np.argmin(fprs)
    pred, s = pvml.multinomial_naive_bayes_inference(bow_validation[:, :-1], w, biases[col, :])

    with open('results/low_fpr_mnbc_NOstop.csv', 'a') as f:
        f.write(f'{size},{biases[col, 0]},{biases[col, 1]},{fprs[col]},{tprs[col]},{cdf.accuracy(pred, bow_validation[:, -1])}\n')


# Save the FPRs (each row is a different vocabulary size, each column is a different bias. The first two rows are the biases, first negatives then positives)
fprs_arr = np.stack(fprs_sizes, axis=0)
fprs_arr = np.vstack((biases.T, fprs_arr))
np.savetxt('results/fprs_mnbc_NOstop.txt.gz', fprs_arr)

# Save the TPRs (each row is a different vocabulary size, each column is a different bias. The first two rows are the biases, first negatives then positives)
tprs_arr = np.stack(tprs_sizes, axis=0)
tprs_arr = np.vstack((biases.T, tprs_arr))
np.savetxt('results/tprs_mnbc_NOstop.txt.gz', tprs_arr)

CODE BLOCK:

>- SHOW THE ACCURACIES FOR DIFFERENT VOCABULARY SIZES

In [None]:
# Line plot of train and validation accuracy against the vocabulary size
accuracies = pd.read_csv('results/accuracies_mnbc_NOstop.csv')
accuracies.plot(
    x='vocsize',
    y=['accuracy_train', 'accuracy_validation'],
    kind='line',
    ylabel='Accuracy (%)',
    xlabel='Vocabulary size',
    title='Multinomial Naive Bayes Classifier',
)
plt.tight_layout()

CODE BLOCK:

>- SHOW THE RESULTS ABOUT THE LOWEST FPR STORED ABOVE

In [None]:
# Display the lowest-FPR results stored for the MNBC ("NOstop" files)
low_fpr_csv = 'results/low_fpr_mnbc_NOstop.csv'
plot_title = 'MNBC FPR and Accuracy vs. Vocsize'
cdf.fpr_accs_vocsizes(low_fpr_csv, plot_title)

#### 3.2 - **Logistic Regression**

CODE BLOCK:

>- SELECTION OF THE BEST LEARNING RATE AMONG A LIST OF VALUES
>- COMPUTE TRAIN ACCURACY, VALIDATION ACCURACY AND LOSS FOR EACH LEARNING RATE OF EACH DIFFERENT VOCABULARY SIZE (the vocabulary considered is just that without stopwords)

### EXECUTE JUST ONE TIME

In [None]:
# Learning-rate search for Logistic Regression on the "stop" BoW files:
# every (vocabulary size, learning rate) pair is trained for 1000 steps and
# the histories returned by cdf.logreg_training are collected and saved.
vocsizes = [100, 200, 400, 800, 1000, 2000, 4000, 8000]
learning_rates = [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1]
fprs_sizes = []
tprs_sizes = []

train_accs_lrs, test_accs_lrs, losses_lrs, ITC_lrs = [], [], [], []

# Disabled: header of the optional per-run accuracy log
# with open('results/accuracies_lr.csv', 'w') as f: #overwrite the file
#     f.write('vocsize,accuracy_train,accuracy_validation,learning_rate\n')

for size in map(str, vocsizes):
    bow_dir = 'generated_gitignore/'
    tail = '_stop' + size + '.txt.gz'

    # For each split, clickbait rows are stacked on top of non-clickbait rows
    bow_train_bait = np.loadtxt(bow_dir + 'bow_train_bait' + tail)
    bow_train_nobait = np.loadtxt(bow_dir + 'bow_train_nobait' + tail)
    bow_train = np.vstack((bow_train_bait, bow_train_nobait))

    bow_validation_bait = np.loadtxt(bow_dir + 'bow_validation_bait' + tail)
    bow_validation_nobait = np.loadtxt(bow_dir + 'bow_validation_nobait' + tail)
    bow_validation = np.vstack((bow_validation_bait, bow_validation_nobait))

    # The last column is used as the label, all the others as features
    X_train, y_train = bow_train[:, :-1], bow_train[:, -1]
    X_val, y_val = bow_validation[:, :-1], bow_validation[:, -1]

    for lr_ in learning_rates:
        # Train with the validation set passed as the "test" set
        w, b, train_accuracies, test_accuracies, losses, ITC = cdf.logreg_training(X_train, y_train, steps = 1000, lr = lr_, lambda_=0, X_test = X_val, Y_test = y_val)
        train_accs_lrs.append(train_accuracies)
        test_accs_lrs.append(test_accuracies)
        losses_lrs.append(losses)
        ITC_lrs.append(ITC)

        # Probabilities on both sets (only needed by the disabled log below)
        prob_val = cdf.logreg_inference(X_val, w, b)
        prob_tra = cdf.logreg_inference(X_train, w, b)

        # Disabled: append one accuracy row per (size, learning rate)
        # with open('results/accuracies_lr.csv', 'a') as f:
        #     f.write(f'{size},{cdf.accuracy((prob_tra > 0.5).astype(int), bow_train[:, -1])},{cdf.accuracy((prob_val > 0.5).astype(int), bow_validation[:, -1])},{lr_}\n')
        print('size: ', size, 'lr: ', lr_)

    print('size completed: ', size)

# Saved layout: one row per (vocsize, learning rate) run, in loop order, so
# the first 7 rows belong to the first vocsize, the next 7 to the second, ...
# (56 rows in total); columns are the logged iterations (multiples of 100).
np.savetxt('results/train_accs_lrs_stop.txt.gz', np.stack(train_accs_lrs, axis=0))
np.savetxt('results/test_accs_lrs_stop.txt.gz', np.stack(test_accs_lrs, axis=0))
np.savetxt('results/losses_lrs_stop.txt.gz', np.stack(losses_lrs, axis=0))
np.savetxt('results/ITC_lrs_stop.txt.gz', ITC_lrs)


CODE BLOCK:

>- SHOW THE RESULTS ABOUT THE BEST LEARNING RATE FOR EACH VOCABULARY SIZE (the vocabulary considered is just that without stopwords)

In [None]:
# One subplot per vocabulary size, one curve per learning rate.
# Both lists are defined here so the cell does not depend on the
# "execute just one time" cell having been run in the current session.
vocsizes = [100, 200, 400, 800, 1000, 2000, 4000, 8000]
learning_rates = [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1]
n_lrs = len(learning_rates)

# Loaded once (it does not change between subplots). Rows are grouped by
# vocabulary size: n_lrs consecutive rows per size, one per learning rate.
data = np.loadtxt('results/losses_lrs_stop.txt.gz') #change the name to change plot. Choose among (train_accs_lrs_stop, test_accs_lrs_stop, losses_lrs_stop, ITC_lrs_stop).

fig, axs = plt.subplots(2, 4, figsize=(20, 10))
axs = axs.flatten()

for j, size in enumerate(vocsizes):
    ax = axs[j]
    ax.title.set_text('vocsize: '+str(size))
    for k, lr in enumerate(learning_rates):
        row = data[j*n_lrs + k]
        # Values were logged every 100 iterations, hence the x scaling
        ax.plot(np.arange(len(row))*100, row, label='lr: '+str(lr))

fig.text(0.5, -0.02, 'Iterations', ha='center', va='center', fontsize=16)
fig.text(-0.01, 0.5, 'Loss', ha='center', va='center', rotation='vertical', fontsize=16)
fig.tight_layout()
# A single shared legend, taken from the first subplot
lines, labels = axs[0].get_legend_handles_labels()
fig.legend(lines, labels, loc='upper right', ncols=n_lrs, bbox_to_anchor=(0.72, -0.035))
fig.text(0.5, 1.01, 'Loss vs. Iterations for different learning rates', ha='center', va='center', fontsize=16)

CODE BLOCK:

>- COMPUTE THE **PARAMETERS** FOR ALL THE VOCSIZES IN THE LIST AND STORE THEM

### EXECUTE JUST ONE TIME

In [None]:
# Train Logistic Regression for every vocabulary size with the selected
# learning rate; the arguments shared by both runs are grouped once.
vocsizes = [100, 200, 400, 800, 1000, 2000, 4000, 8000]
best_lr = 0.001
train_args = (vocsizes, best_lr, 2000, 'models_trained')

# Vocabulary without stopwords
cdf.train_logreg_vocsizes(*train_args, remove_stopwords=True)

# Vocabulary with stopwords
cdf.train_logreg_vocsizes(*train_args, remove_stopwords=False)

#### 3.2.2 - **Stopwords Removed** (file stop)

CODE BLOCK:

>- COMPUTE THE ACCURACY ON THE VALIDATION SET FOR DIFFERENT VOCABULARY SIZES
>- COMPUTE THE ROC CURVE ON THE VALIDATION SET FOR DIFFERENT VOCABULARY SIZES
>- STORE FOR EACH VOCABULARY SIZE THE FPR CLOSEST TO A GIVEN THRESHOLD AND THE ACCURACY ON THE VALIDATION SET

In [None]:
# Logistic Regression on the "stop" BoW files: for every vocabulary size, load
# the stored parameters, log train/validation accuracy, trace the ROC curve
# over a sweep of biases and record one low-FPR operating point.
vocsizes = [100, 200, 400, 800, 1000, 2000, 4000, 8000]
fprs_sizes, tprs_sizes = [], []
fpr_threshold = -0.1

# Bias sweep for the ROC curve
biases = np.linspace(-8,8,1000) #IF THE ROC CURVE IS NOT COMPLETE, JUST EXTEND THE RANGE

acc_csv = 'results/accuracies_lr_stop.csv'
low_fpr_csv = 'results/low_fpr_lr_stop.csv'

# Recreate both result files so that they start with the header row only
with open(acc_csv, 'w') as f:
    f.write('vocsize,accuracy_train,accuracy_validation\n')
with open(low_fpr_csv, 'w') as f:
    f.write('vocsize,bias,fpr,tpr,acc\n')

for size in map(str, vocsizes):
    bow_dir = 'generated_gitignore/'
    tail = '_stop' + size + '.txt.gz'

    # For each split, clickbait rows are stacked on top of non-clickbait rows
    bow_train_bait = np.loadtxt(bow_dir + 'bow_train_bait' + tail)
    bow_train_nobait = np.loadtxt(bow_dir + 'bow_train_nobait' + tail)
    bow_train = np.vstack((bow_train_bait, bow_train_nobait))

    bow_validation_bait = np.loadtxt(bow_dir + 'bow_validation_bait' + tail)
    bow_validation_nobait = np.loadtxt(bow_dir + 'bow_validation_nobait' + tail)
    bow_validation = np.vstack((bow_validation_bait, bow_validation_nobait))

    # The last column is used as the label, all the others as features
    X_train, y_train = bow_train[:, :-1], bow_train[:, -1]
    X_val, y_val = bow_validation[:, :-1], bow_validation[:, -1]

    # Stored weights and bias for this vocabulary size
    params = np.load('models_trained/param_logreg_stop'+size+'.npz')
    w, b = params['w'], params['b']

    # Probabilities thresholded at 0.5 give the predicted labels
    probs_val = cdf.logreg_inference(X_val, w, b)
    predictions_val = (probs_val > 0.5).astype(int)
    probs_tra = cdf.logreg_inference(X_train, w, b)
    predictions_tra = (probs_tra > 0.5).astype(int)

    # One accuracy row per vocabulary size
    acc_tra = cdf.accuracy(predictions_tra, y_train)
    acc_val = cdf.accuracy(predictions_val, y_val)
    with open(acc_csv, 'a') as f:
        f.write(','.join((size, str(acc_tra), str(acc_val))) + '\n')

    # ROC points on the validation set, one per bias
    fprs, tprs = cdf.roc_curve_biases(cdf.logreg_inference, bow_validation, w, biases, size, 'ROC curve LR for different biases (stopwords)', plot=True)
    fprs_sizes.append(fprs)
    tprs_sizes.append(tprs)

    # First sweep position whose FPR exceeds the threshold.
    # NOTE(review): with a negative threshold every non-negative FPR
    # qualifies, so this presumably always picks position 0 - confirm intent.
    fprs_tmp = np.array(fprs)
    col = np.where(fprs_tmp>fpr_threshold)[0][0]
    prob = cdf.logreg_inference(X_val, w, biases[col])
    pred = (prob > 0.5).astype(int)
    acc_low = cdf.accuracy(pred, y_val)
    with open(low_fpr_csv, 'a') as f:
        f.write(f'{size},{biases[col]},{fprs[col]},{tprs[col]},{acc_low}\n')


# Saved layout: the first row contains the biases; each following row is a
# vocabulary size and each column a bias
fprs_arr = np.vstack((biases, np.stack(fprs_sizes, axis=0)))
np.savetxt('results/fprs_lr_stop.txt.gz', fprs_arr)

tprs_arr = np.vstack((biases, np.stack(tprs_sizes, axis=0)))
np.savetxt('results/tprs_lr_stop.txt.gz', tprs_arr)

CODE BLOCK:

>- SHOW THE ACCURACIES FOR DIFFERENT VOCABULARY SIZES

In [None]:
# Line plot of train and validation accuracy against the vocabulary size
accuracies = pd.read_csv('results/accuracies_lr_stop.csv')
accuracies.plot(
    x='vocsize',
    y=['accuracy_train', 'accuracy_validation'],
    kind='line',
    ylabel='Accuracy (%)',
    xlabel='Vocabulary size',
    title='Logistic Regression (stopwords)',
)
plt.tight_layout()

CODE BLOCK:

>- SHOW THE RESULTS ABOUT THE LOWEST FPR STORED ABOVE

In [None]:
# Display the low-FPR results stored for Logistic Regression ("stop" files)
low_fpr_csv = 'results/low_fpr_lr_stop.csv'
plot_title = 'LR FPR and Accuracy vs. Vocsize (stopwords)'
cdf.fpr_accs_vocsizes(low_fpr_csv, plot_title)

#### 3.2.3 - **Stopwords Kept** (file NOstop)

CODE BLOCK:

>- COMPUTE THE ACCURACY ON THE VALIDATION SET FOR DIFFERENT VOCABULARY SIZES
>- COMPUTE THE ROC CURVE ON THE VALIDATION SET FOR DIFFERENT VOCABULARY SIZES
>- STORE FOR EACH VOCABULARY SIZE THE FPR CLOSEST TO A GIVEN THRESHOLD AND THE ACCURACY ON THE VALIDATION SET

In [None]:
# Logistic Regression on the "NOstop" BoW files: for every vocabulary size,
# load the stored parameters, log train/validation accuracy, trace the ROC
# curve over a sweep of biases and record one low-FPR operating point.

# Define the vocsizes
vocsizes = [100, 200, 400, 800, 1000, 2000, 4000, 8000]
fprs_sizes = []
tprs_sizes = []
fpr_threshold = -0.1

# Define the range for the biases
biases = np.linspace(-8,8,1000) #IF THE ROC CURVE IS NOT COMPLETE, JUST EXTEND THE RANGE

# Create file where to store accs
with open('results/accuracies_lr_NOstop.csv', 'w') as f: #overwrite the file
    f.write('vocsize,accuracy_train,accuracy_validation\n')

# Create file where to store data for the lowest fpr.
# The file must be recreated with its header on every run: the loop below only
# appends, so without this reset the CSV has no header row and collects
# duplicated rows each time the cell is executed again.
with open('results/low_fpr_lr_NOstop.csv', 'w') as f:
    f.write('vocsize,bias,fpr,tpr,acc\n')

for size in vocsizes:
    size = str(size)

    # Load the BoW for training (vstack on the two classes)
    bow_train_bait = np.loadtxt('generated_gitignore/bow_train_bait_NOstop'+size+'.txt.gz')
    bow_train_nobait = np.loadtxt('generated_gitignore/bow_train_nobait_NOstop'+size+'.txt.gz')
    bow_train = np.vstack((bow_train_bait, bow_train_nobait))

    bow_validation_bait = np.loadtxt('generated_gitignore/bow_validation_bait_NOstop'+size+'.txt.gz')
    bow_validation_nobait = np.loadtxt('generated_gitignore/bow_validation_nobait_NOstop'+size+'.txt.gz')
    bow_validation = np.vstack((bow_validation_bait, bow_validation_nobait))

    # Load the Logistic Regression parameters
    params = np.load('models_trained/param_logreg_NOstop'+size+'.npz')
    w = params['w']
    b = params['b']

    # Predict both sets: probabilities thresholded at 0.5 give the labels
    # (the last BoW column is used as the label, all the others as features)
    probs_val = cdf.logreg_inference(bow_validation[:, :-1], w, b)
    predictions_val = (probs_val > 0.5).astype(int)
    probs_tra = cdf.logreg_inference(bow_train[:, :-1], w, b)
    predictions_tra = (probs_tra > 0.5).astype(int)

    # Accuracy
    with open('results/accuracies_lr_NOstop.csv', 'a') as f:
        f.write(size+','+str(cdf.accuracy(predictions_tra, bow_train[:, -1]))+','+str(cdf.accuracy(predictions_val, bow_validation[:, -1]))+'\n')

    # Compute the ROC curve
    fprs, tprs = cdf.roc_curve_biases(cdf.logreg_inference, bow_validation, w, biases, size, 'ROC curve LR for different biases', plot=True)
    fprs_sizes.append(fprs)
    tprs_sizes.append(tprs)

    # First sweep position whose FPR exceeds the threshold.
    # NOTE(review): with a negative threshold every non-negative FPR
    # qualifies, so this presumably always picks position 0 - confirm intent.
    fprs_tmp = np.array(fprs)
    col = np.where(fprs_tmp>fpr_threshold)[0][0]
    prob = cdf.logreg_inference(bow_validation[:, :-1], w, biases[col])
    pred = (prob > 0.5).astype(int)

    with open('results/low_fpr_lr_NOstop.csv', 'a') as f:
        f.write(f'{size},{biases[col]},{fprs[col]},{tprs[col]},{cdf.accuracy(pred, bow_validation[:, -1])}\n')


# Save the FPRs (each row is a different vocabulary size, each column is a different bias; the first row contains the biases)
fprs_arr = np.stack(fprs_sizes, axis=0)
fprs_arr = np.vstack((biases, fprs_arr))
np.savetxt('results/fprs_lr_NOstop.txt.gz', fprs_arr)

# Save the TPRs (each row is a different vocabulary size, each column is a different bias; the first row contains the biases)
tprs_arr = np.stack(tprs_sizes, axis=0)
tprs_arr = np.vstack((biases, tprs_arr))
np.savetxt('results/tprs_lr_NOstop.txt.gz', tprs_arr)

CODE BLOCK:

>- SHOW THE ACCURACIES FOR DIFFERENT VOCABULARY SIZES

In [None]:
# Line plot of train and validation accuracy against the vocabulary size
accuracies = pd.read_csv('results/accuracies_lr_NOstop.csv')
accuracies.plot(
    x='vocsize',
    y=['accuracy_train', 'accuracy_validation'],
    kind='line',
    ylabel='Accuracy (%)',
    xlabel='Vocabulary size',
    title='Logistic Regression',
)
plt.tight_layout()

CODE BLOCK:

>- SHOW THE RESULTS ABOUT THE LOWEST FPR STORED ABOVE

In [None]:
# Display the low-FPR results stored for Logistic Regression ("NOstop" files)
low_fpr_csv = 'results/low_fpr_lr_NOstop.csv'
plot_title = 'LR FPR and Accuracy vs. Vocsize'
cdf.fpr_accs_vocsizes(low_fpr_csv, plot_title)

---

### 4 - **ANALYSIS OF THE BEST MODEL**

- 4.1 - **Highest Accuracy:** MNBC without stopwords removed and vocabulary size 8000
- 4.2 - **Lowest FPR:** MNBC without stopwords removed and vocabulary size 2000 biases = [5,-5]

In [None]:
# Compare the classifiers with respect to their accuracy on validation set

df1 = pd.read_csv('results/accuracies_mnbc_stop.csv')
df2 = pd.read_csv('results/accuracies_mnbc_NOstop.csv')
df3 = pd.read_csv('results/accuracies_lr_stop.csv')
df4 = pd.read_csv('results/accuracies_lr_NOstop.csv')

# Tag every column with its classifier/vocabulary variant. Only the first
# table keeps its vocsize column (as 'vocsize_mnbc_stop'); the others
# contribute their two accuracy columns, joined on the row index.
df = df1.add_suffix('_mnbc_stop')
for tag, table in (('_mnbc_NOstop', df2), ('_lr_stop', df3), ('_lr_NOstop', df4)):
    df = df.join(table.drop(columns=['vocsize']).add_suffix(tag))

validation_columns = [
    'accuracy_validation_mnbc_stop',
    'accuracy_validation_mnbc_NOstop',
    'accuracy_validation_lr_stop',
    'accuracy_validation_lr_NOstop',
]
df.plot(kind='bar', x='vocsize_mnbc_stop', y=validation_columns, ylabel='Accuracy (%)', xlabel='Vocabulary size', title='Comparison of the classifiers')
plt.legend(bbox_to_anchor=(1.08, -0.115), ncols = 2)
plt.xticks(rotation=0)


# Compare the classifiers with respect to their trade-off between accuracy and FPR (False Positive Rate) on validation set
# The best is the MNBC with the stopwords kept and the results for the vocsizes are shown below

# Display the results related to the lowest fpr
cdf.fpr_accs_vocsizes('results/low_fpr_mnbc_NOstop.csv', 'MNBC FPR and Accuracy vs. Vocsize')

#### 4.1 - **Highest Accuracy model**

In [None]:
# Highest-accuracy configuration (MNBC, "NOstop" files, vocabulary size 8000):
# load the vocabulary and the BoW matrices, then train the model once.

# Vocabulary words, one array entry per word
voc_stopwords = np.loadtxt('generated_gitignore/vocabulary_NOstop8000.txt', dtype=str)

# BoW file name pattern: filled with the split and the class
bow_file = 'generated_gitignore/bow_{}_{}_NOstop8000.txt.gz'

# Test features: clickbait rows stacked on top of non-clickbait rows
bow1, bow2 = (np.loadtxt(bow_file.format('test', kind)) for kind in ('bait', 'nobait'))
bow_test = np.vstack((bow1, bow2))

# Training features, same layout
bow1, bow2 = (np.loadtxt(bow_file.format('train', kind)) for kind in ('bait', 'nobait'))
bow_train = np.vstack((bow1, bow2))

# Train: the last column is used as the label, all the others as features
w, b = pvml.multinomial_naive_bayes_train(bow_train[:, :-1], bow_train[:, -1])


CODE BLOCK:

>- IDENTIFY THE MOST IMPACTFUL WORDS

> The most impactful words for a class are the words whose probability gap between the two classes is the largest.

In [None]:
# Per-word gap between the two weight columns, ranked from largest to smallest
delta = w[:,0] - w[:,1]
indices = np.argsort(delta)[::-1]

# The 10 largest gaps are listed under NEGATIVE CLASS, the 10 smallest
# (starting from the smallest) under POSITIVE CLASS
sections = (
    ("NEGATIVE CLASS", indices[:10]),
    ("POSITIVE CLASS", indices[::-1][:10]),
)

# Write the results to a file for easier insertion in latex
with open('results/impactful_words_mnbc_NOstop8000.txt', 'w') as f:
    for position, (heading, selected) in enumerate(sections):
        if position:
            print('', file = f)  # blank line between the two sections
        print(heading, file = f)
        for i in selected:
            print(voc_stopwords[i], delta[i], file = f)


CODE BLOCK:

>- IDENTIFY THE WORST ERRORS

>The worst errors are the misclassified headlines whose score gap between the two classes is the largest.

In [None]:
# Rank the misclassified test headlines by score gap and compare the average
# headline length of misclassified, correctly classified and all headlines.


def _read_lines(path):
    """Return the lines of the text file at `path`, closing the file afterwards."""
    with open(path) as fh:
        return fh.read().strip().split('\n')


# list of the headlines (clickbait first, then non-clickbait, like bow_test)
hlines = _read_lines('data/clickbait_test.txt')
hlines_nobait = _read_lines('data/non_clickbait_test.txt')
hlines.extend(hlines_nobait)

# Use the classifier to predict the labels of the test set
predictions_test, scores_test = pvml.multinomial_naive_bayes_inference(bow_test[:, :-1], w, b)

# Score gap between the two score columns for every headline
delta = scores_test[:,1] - scores_test[:,0]
c = pd.DataFrame({'headlines': hlines, 'delta': delta})
c_full = c.copy()

# Split into correct / wrong predictions. Independent frames are created
# (copy / sort_values) so that the 'length' column added below is not
# written to a slice of the full frame.
is_correct = predictions_test == bow_test[:, -1]
c_corr = c[is_correct].copy()
c = c[~is_correct].sort_values(by='delta', ascending=False)

# Largest gaps first: the head of the ranking is plotted as false positives,
# the tail as false negatives
false_positive = c.head(10)
false_negative = c.tail(10)
false_positive.plot(x = 'headlines', y = 'delta', kind = 'bar', title = 'False positive headlines (MNBC 8000)', ylabel = 'Score', xlabel = 'headline content', legend = False)
false_negative.plot(x = 'headlines', y = 'delta', kind = 'bar', title = 'False negative headlines (MNBC 8000)', ylabel = 'Score', xlabel = 'headline content', legend = False)

# Compute avg length (in words) of the headlines misclassified and compare it with the avg length of the headlines
c['length'] = c['headlines'].apply(lambda x: len(x.split()))
print('Avg length of misclassified: ', c['length'].mean())
c_full['length'] = c_full['headlines'].apply(lambda x: len(x.split()))
print('Avg length of all headlines: ', c_full['length'].mean())
c_corr['length'] = c_corr['headlines'].apply(lambda x: len(x.split()))
print('Avg length of correctly classified: ', c_corr['length'].mean())

In [None]:
# Bar chart of the average headline length for the three groups
group_names = ['misclassified', 'all', 'correctly']
group_means = [frame['length'].mean() for frame in (c, c_full, c_corr)]

plt.bar(group_names, group_means, align='center', width=0.5)
plt.title('Avg length of headlines')
plt.ylabel('Avg length')
plt.tight_layout()
plt.show()

CODE BLOCK:

>- FINAL PRINT OF THE ACCURACIES ON THE TEST SET

In [None]:
# Compute accuracies of test and training
predictions_test, scores_test = pvml.multinomial_naive_bayes_inference(bow_test[:, :-1], w, b)
predictions_train, scores_train = pvml.multinomial_naive_bayes_inference(bow_train[:, :-1], w, b)

print('Test accuracy: ', cdf.accuracy(predictions_test, bow_test[:, -1]))
print('Train accuracy: ', cdf.accuracy(predictions_train, bow_train[:, -1]))

#### 4.2 - **Lowest FPR model**

In [None]:
# Lowest-FPR configuration (MNBC, "NOstop" files, vocabulary size 2000):
# load the vocabulary and the BoW matrices, train once, then override the
# trained biases with the pair chosen for the lowest FPR.

# Vocabulary words, one array entry per word
voc_stopwords = np.loadtxt('generated_gitignore/vocabulary_NOstop2000.txt', dtype=str)

# BoW file name pattern: filled with the split and the class
bow_file = 'generated_gitignore/bow_{}_{}_NOstop2000.txt.gz'

# Test features: clickbait rows stacked on top of non-clickbait rows
bow1, bow2 = (np.loadtxt(bow_file.format('test', kind)) for kind in ('bait', 'nobait'))
bow_test = np.vstack((bow1, bow2))

# Training features, same layout
bow1, bow2 = (np.loadtxt(bow_file.format('train', kind)) for kind in ('bait', 'nobait'))
bow_train = np.vstack((bow1, bow2))

# Train: the last column is used as the label, all the others as features
w, b = pvml.multinomial_naive_bayes_train(bow_train[:, :-1], bow_train[:, -1])
b = [5, -5] #set the optimal biases for the lowest fpr

CODE BLOCK:

>- IDENTIFY THE MOST IMPACTFUL WORDS

> The most impactful words for a class are computed as the words whose score gap between the two classes is the largest.

In [None]:
# Per-word gap between the class-0 and class-1 weights, ranked from the largest
# (most class-0 oriented) to the smallest (most class-1 oriented)
delta = w[:,0] - w[:,1]
indices = delta.argsort()[::-1]

# Ten words at each end of the ranking; the positive-class list starts from
# the most negative gap
ranked_sections = (
    ("NEGATIVE CLASS", indices[:10]),
    ("POSITIVE CLASS", indices[::-1][:10]),
)

# Write the two lists to a text file (easier to paste into latex), separated
# by one empty line
with open('results/impactful_words_mnbc_NOstop2000.txt', 'w') as f:
    for position, (header, word_ids) in enumerate(ranked_sections):
        if position > 0:
            print('', file = f)
        print(header, file = f)
        for i in word_ids:
            print(voc_stopwords[i], delta[i], file = f)

CODE BLOCK:

>- IDENTIFY THE WORST ERRORS

>The worst errors are the misclassified headlines whose score gap between the two classes is the largest.

In [None]:
# list of the test headlines: clickbait first, then non-clickbait
# (context managers make sure the files are closed after reading)
with open('data/clickbait_test.txt') as f:
    hlines = f.read().strip().split('\n')
with open('data/non_clickbait_test.txt') as f:
    hlines_nobait = f.read().strip().split('\n')
hlines.extend(hlines_nobait)

# Use the classifier to predict the labels of the test set
predictions_test, scores_test = pvml.multinomial_naive_bayes_inference(bow_test[:, :-1], w, b)

# Signed score gap for every test headline: class-1 score minus class-0 score
delta = scores_test[:,1] - scores_test[:,0]
c = pd.DataFrame({'headlines': hlines, 'delta': delta})

# Rebuild the "all" and "correctly classified" frames for THIS model: without
# these two lines the statistics below would silently reuse the frames left
# over from the previous model's analysis (or fail if that cell was not run).
# Explicit copies avoid pandas' SettingWithCopyWarning when columns are added.
c_full = c.copy()
c_corr = c[predictions_test == bow_test[:, -1]].copy()
c = c[predictions_test != bow_test[:, -1]].copy()

# Misclassified headlines sorted by decreasing gap: the head holds the errors
# scoring highest for class 1, the tail the errors scoring highest for class 0.
# NOTE(review): head/tail are taken from the same frame, so if fewer than 10
# errors of one kind exist, the plot also contains errors of the other kind.
c.sort_values(by='delta', ascending=False, inplace=True)
false_positive = c.head(10)
false_negative = c.tail(10)
false_positive.plot(x = 'headlines', y = 'delta', kind = 'bar', title = '*False positive* headlines (MNBC 2000)', ylabel = 'Score', xlabel = 'headline content', legend = False)
false_negative.plot(x = 'headlines', y = 'delta', kind = 'bar', title = 'False negative headlines (MNBC 2000)', ylabel = 'Score', xlabel = 'headline content', legend = False)

# Compute avg length (number of whitespace-separated words) of the misclassified
# headlines and compare it with the avg length of all / correctly classified ones
c['length'] = c['headlines'].apply(lambda x: len(x.split()))
print('Avg length of misclassified: ', c['length'].mean())
c_full['length'] = c_full['headlines'].apply(lambda x: len(x.split()))
print('Avg length of all headlines: ', c_full['length'].mean())
c_corr['length'] = c_corr['headlines'].apply(lambda x: len(x.split()))
print('Avg length of correctly classified: ', c_corr['length'].mean())

In [None]:
# Bar chart comparing the mean of the 'length' column across the three groups
group_names = ['misclassified', 'all', 'correctly']
group_means = [frame['length'].mean() for frame in (c, c_full, c_corr)]
plt.bar(group_names, group_means, align='center', width=0.5)
plt.ylabel('Avg length')
plt.title('Avg length of headlines')
plt.tight_layout()
plt.show()

CODE BLOCK:

>- FINAL PRINT OF THE ACCURACIES (TEST AND TRAINING SETS) AND OF THE FPR AND TPR ON THE TEST SET

In [None]:
# Run the classifier with the current parameters (w, b) on the test split first,
# then on the training split; the last column of each BoW matrix is used as label
(predictions_test, scores_test), (predictions_train, scores_train) = (
    pvml.multinomial_naive_bayes_inference(bow[:, :-1], w, b)
    for bow in (bow_test, bow_train)
)

# Report the accuracy reached on each split
acc_on_test = cdf.accuracy(predictions_test, bow_test[:, -1])
print('Test accuracy: ', acc_on_test)
acc_on_train = cdf.accuracy(predictions_train, bow_train[:, -1])
print('Train accuracy: ', acc_on_train)

# Confusion matrix on the test split (true labels first, predictions second),
# flattened into its four counts
confmat = skm.confusion_matrix(bow_test[:, -1], predictions_test)
tn, fp, fn, tp = confmat.ravel()

# False-positive and true-positive rates, expressed as percentages
fpr_percent = fp/(fp+tn)*100
tpr_percent = tp/(tp+fn)*100
print('The FPR (%) is: ', fpr_percent)
print('The TPR (%) is: ', tpr_percent)

---

### **DETAILS**: Deep Training of the Logistic Regression model

Train the model that is potentially the best in terms of accuracy, and analyze its accuracy and FPR on the test set: Logistic Regression (LR) without stopwords and with a vocabulary size of 4000.

In [None]:
# Vocabulary size, kept as a string because it is only used to build file names
size = str(4000)

# Training BoW: one file per class, stacked row-wise (bait first, nobait after)
bow1, bow2 = (
    np.loadtxt(f'generated_gitignore/bow_train_{group}_NOstop{size}.txt.gz')
    for group in ('bait', 'nobait')
)
bow_train = np.vstack((bow1, bow2))

# Test BoW, assembled the same way
bow1, bow2 = (
    np.loadtxt(f'generated_gitignore/bow_test_{group}_NOstop{size}.txt.gz')
    for group in ('bait', 'nobait')
)
bow_test = np.vstack((bow1, bow2))

In [None]:
# Number of training steps: about 10x larger than the number of iterations
# needed to see a graphically rough convergence
steps_ = 20000

# Train the Logistic Regression; the last column of each BoW matrix is the label.
# NOTE(review): the two positional constants (0.001, 0.2) are passed straight to
# cdf.logreg_training -- their meaning must be checked in CD_functions.
w, b, train_accuracies, test_accuracies, losses, iter_to_conv = cdf.logreg_training(
    bow_train[:, :-1],
    bow_train[:, -1],
    steps_,
    0.001,
    0.2,
    tol=5e-7,
    X_test=bow_test[:, :-1],
    Y_test=bow_test[:, -1],
)

# Store the learned parameters under the keys 'w' and 'b'
np.savez(f'models_trained/param_logreg_NOstop{size}_deeptrain.npz', w=w, b=b)

In [None]:
# Stack the three curves row-wise (row 0: train accuracies, row 1: test
# accuracies, row 2: losses) and store them in a single compressed text file
accs = [train_accuracies, test_accuracies, losses]
np.savetxt(f'results/train_test_losses_NOstop{size}_deeptrain.txt.gz', np.stack(accs, axis=0))

In [None]:
# Reload the stored curves: row 0 train accuracies, row 1 test accuracies, row 2 losses
results = np.loadtxt('results/train_test_losses_NOstop4000_deeptrain.txt.gz')
train_accuracies, test_accuracies, losses = results[0,:], results[1,:], results[2,:]

# First figure: train and test accuracy, shown as percentages
plt.plot(train_accuracies*100, label='train')
plt.plot(test_accuracies*100, label='test')
plt.xlabel('iterations (x100)')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy vs. iterations LR 4000')
plt.legend()
plt.show()

# Second figure: loss values
plt.plot(losses, label='loss')
plt.xlabel('iterations (x100)')
plt.ylabel('loss')
plt.title('Loss vs. iterations LR 4000')
plt.show()

#print('The loss threshold has been reached at iteration: ',iter_to_conv)