In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the semicolon-separated, UTF-8 encoded corpus into a DataFrame.
df = pd.read_csv(
    "../data/corpora_dataset.csv",
    sep=";",
    encoding="utf8",
)

In [None]:
# --- Text preprocessing switches (all disabled) ---
remove_stop_words = remove_punctuations = to_lower = False

# Upper bound on the number of words the tokenizer keeps.
vocab_size = 10000

# Sequences are both truncated and padded at the end ('post').
trunc_type = padding_type = 'post'

In [None]:
# Pull the text column and the label column out of the DataFrame as arrays.
sentences = df['titles'].values
labels = df['Label'].values

# 80/20 split. The seed is fixed so that the partition -- and everything
# derived from it (vocabulary, padded matrices, downstream metrics) -- is the
# same on every Restart & Run All. It matches the seed of the later split.
training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)

# Build the tokenizer arguments once. The punctuation switch only decides
# whether the Tokenizer's default `filters` are used (punctuation stripped) or
# replaced by an empty string (nothing stripped).
tokenizer_options = {"num_words": vocab_size, "oov_token": "<OOV>", "lower": to_lower}
if not remove_punctuations:
    tokenizer_options["filters"] = ''
tokenizer = Tokenizer(**tokenizer_options)

# The vocabulary is learned from the training partition only.
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

In [None]:
# Padding/truncation settings shared by both partitions: every row ends up
# exactly 29 token ids long.
pad_options = dict(maxlen=29, padding=padding_type, truncating=trunc_type)

# Map each text to its list of token ids using the fitted tokenizer.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Fixed-width matrices, materialised as NumPy arrays.
training_padded = np.array(pad_sequences(training_sequences, **pad_options))
testing_padded = np.array(pad_sequences(testing_sequences, **pad_options))

# Labels as NumPy arrays as well.
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

In [None]:
# Stack both partitions row-wise and tag every row with its origin:
# 0.0 for rows from the training partition, 1.0 for rows from the testing one.
n_train_rows, n_test_rows = len(training_padded), len(testing_padded)
X_combined = np.concatenate([training_padded, testing_padded])
y_combined = np.repeat([0.0, 1.0], [n_train_rows, n_test_rows])

In [None]:
# Row counts of the stacked features and of the origin labels.
X_combined.shape[0], y_combined.shape[0]

In [None]:
# Hold out 20% of the combined rows for scoring; the fixed seed keeps this
# partition repeatable.
leak_partitions = train_test_split(
    X_combined,
    y_combined,
    test_size=0.2,
    random_state=42,
)
X_train_leak, X_test_leak, y_train_leak, y_test_leak = leak_partitions

In [None]:
# Fit a logistic regression that tries to tell training-partition rows
# (label 0) from testing-partition rows (label 1), then score the held-out rows.
model = LogisticRegression(max_iter=1000).fit(X_train_leak, y_train_leak)
y_pred = model.predict(X_test_leak)

# Text summary of the per-class metrics on the held-out rows.
report = classification_report(y_true=y_test_leak, y_pred=y_pred)

In [None]:
# print() is used because `report` is a multi-line string; a bare expression
# would show its repr with escaped newlines.
print(report)

In [None]:
# Print every prediction that is not class 0, one per line.
for predicted_label in y_pred[y_pred != 0]:
    print(predicted_label)