In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the semicolon-separated corpus; later cells read its 'titles' and 'Label' columns.
df = pd.read_csv(
    "../data/corpora_dataset.csv",
    sep=";",
    encoding='utf8',
)

In [3]:
# Preprocessing switches, all off by default.
#   remove_punctuations -> selects whether the Tokenizer keeps its default filters
#   to_lower            -> forwarded to the Tokenizer as `lower`
#   remove_stop_words   -> not read by any cell visible in this notebook section
remove_stop_words = remove_punctuations = to_lower = False

# Upper bound handed to the Tokenizer as `num_words`.
vocab_size = 10_000

In [4]:
# Passed to pad_sequences as `truncating` and `padding` respectively;
# 'post' makes both act on the end of each sequence.
padding_type = trunc_type = 'post'

In [5]:
# Raw inputs: title text and its class label, taken straight from the loaded frame.
sentences = df['titles'].values
labels = df['Label'].values

# Seed the split so the train/test partition (and the vocabulary, sequences and
# leak-check results derived from it) is identical on every Restart & Run All.
# 42 matches the seed already used by the later leak-check split.
training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)

# Build the tokenizer once. When punctuation is to be kept, an empty filter
# string overrides the Tokenizer's default character filtering; otherwise the
# default filters stay in effect.
tokenizer_options = {"num_words": vocab_size, "oov_token": "<OOV>", "lower": to_lower}
if not remove_punctuations:
    tokenizer_options["filters"] = ''
tokenizer = Tokenizer(**tokenizer_options)

# The vocabulary is fitted on the training split only, never on the test titles.
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

In [6]:
# Fixed length of every encoded title; shorter sequences are padded and longer
# ones truncated. NOTE(review): 29 is presumably the longest title in tokens —
# confirm against the data before changing it.
max_length = 29

# One shared set of padding options so train and test can never drift apart.
padding_options = {"maxlen": max_length, "padding": padding_type, "truncating": trunc_type}

# Map each title to its list of vocabulary indices, then pad to a fixed width.
# pad_sequences already returns a NumPy array, so no extra np.array() copy is needed.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, **padding_options)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, **padding_options)

# Guarantee ndarray labels without copying when they already are ndarrays.
training_labels = np.asarray(training_labels)
testing_labels = np.asarray(testing_labels)

In [7]:
# Stack both splits into one feature matrix and label each row by its origin:
# 0.0 for rows that came from the training split, 1.0 for rows from the test split.
split_sizes = [len(training_padded), len(testing_padded)]
X_combined = np.concatenate([training_padded, testing_padded], axis=0)
y_combined = np.repeat([0.0, 1.0], split_sizes)

In [8]:
# Sanity check: features and origin labels must have the same number of rows.
(X_combined.shape[0], y_combined.shape[0])

(15510, 15510)

In [9]:
# Seeded 80/20 split of the origin-labelled data for the train-vs-test classifier.
leak_split = train_test_split(
    X_combined,
    y_combined,
    test_size=0.2,
    random_state=42,
)
X_train_leak, X_test_leak, y_train_leak, y_test_leak = leak_split

In [10]:
# Fit a logistic regression that tries to tell training rows from test rows,
# then score it on the held-out part of the combined data.
model = LogisticRegression(max_iter=1000).fit(X_train_leak, y_train_leak)
y_pred = model.predict(X_test_leak)

# Per-class precision/recall/F1 of the origin predictions.
report = classification_report(y_true=y_test_leak, y_pred=y_pred)

In [11]:
# Show the classification report computed from y_test_leak and y_pred.
# Class 0.0 = row originated from the training split, 1.0 = from the test split.
print(report)

              precision    recall  f1-score   support

         0.0       0.80      1.00      0.89      2491
         1.0       1.00      0.00      0.00       611

    accuracy                           0.80      3102
   macro avg       0.90      0.50      0.45      3102
weighted avg       0.84      0.80      0.72      3102



In [14]:
# List every prediction that is not class 0, one per line.
nonzero_predictions = y_pred[y_pred != 0]
for predicted_label in nonzero_predictions:
    print(predicted_label)

1.0
