# PREPROCESSING

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import string
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# Load the text file
with open("europarl_en.txt", "r") as f:
    text = f.read()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
# Tokenize the text
tokens = word_tokenize(text)

# Lowercase the tokens
tokens = [token.lower() for token in tokens]

tokens


['resumption',
 'of',
 'the',
 'session',
 'i',
 'declare',
 'resumed',
 'the',
 'session',
 'of',
 'the',
 'european',
 'parliament',
 'adjourned',
 'on',
 'friday',
 '17',
 'december',
 '1999',
 ',',
 'and',
 'i',
 'would',
 'like',
 'once',
 'again',
 'to',
 'wish',
 'you',
 'a',
 'happy',
 'new',
 'year',
 'in',
 'the',
 'hope',
 'that',
 'you',
 'enjoyed',
 'a',
 'pleasant',
 'festive',
 'period',
 '.',
 'although',
 ',',
 'as',
 'you',
 'will',
 'have',
 'seen',
 ',',
 'the',
 'dreaded',
 "'",
 'millennium',
 'bug',
 "'",
 'failed',
 'to',
 'materialise',
 ',',
 'still',
 'the',
 'people',
 'in',
 'a',
 'number',
 'of',
 'countries',
 'suffered',
 'a',
 'series',
 'of',
 'natural',
 'disasters',
 'that',
 'truly',
 'were',
 'dreadful',
 '.',
 'you',
 'have',
 'requested',
 'a',
 'debate',
 'on',
 'this',
 'subject',
 'in',
 'the',
 'course',
 'of',
 'the',
 'next',
 'few',
 'days',
 ',',
 'during',
 'this',
 'part-session',
 '.',
 'in',
 'the',
 'meantime',
 ',',
 'i',
 'should',

In [3]:
# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stem and lemmatize the tokens
stemmed_tokens = [stemmer.stem(token) for token in tokens]
stemmed_tokens

['resumpt',
 'of',
 'the',
 'session',
 'i',
 'declar',
 'resum',
 'the',
 'session',
 'of',
 'the',
 'european',
 'parliament',
 'adjourn',
 'on',
 'friday',
 '17',
 'decemb',
 '1999',
 ',',
 'and',
 'i',
 'would',
 'like',
 'onc',
 'again',
 'to',
 'wish',
 'you',
 'a',
 'happi',
 'new',
 'year',
 'in',
 'the',
 'hope',
 'that',
 'you',
 'enjoy',
 'a',
 'pleasant',
 'festiv',
 'period',
 '.',
 'although',
 ',',
 'as',
 'you',
 'will',
 'have',
 'seen',
 ',',
 'the',
 'dread',
 "'",
 'millennium',
 'bug',
 "'",
 'fail',
 'to',
 'materialis',
 ',',
 'still',
 'the',
 'peopl',
 'in',
 'a',
 'number',
 'of',
 'countri',
 'suffer',
 'a',
 'seri',
 'of',
 'natur',
 'disast',
 'that',
 'truli',
 'were',
 'dread',
 '.',
 'you',
 'have',
 'request',
 'a',
 'debat',
 'on',
 'thi',
 'subject',
 'in',
 'the',
 'cours',
 'of',
 'the',
 'next',
 'few',
 'day',
 ',',
 'dure',
 'thi',
 'part-sess',
 '.',
 'in',
 'the',
 'meantim',
 ',',
 'i',
 'should',
 'like',
 'to',
 'observ',
 'a',
 'minut',
 "'

In [4]:
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
lemmatized_tokens


['resumption',
 'of',
 'the',
 'session',
 'i',
 'declare',
 'resumed',
 'the',
 'session',
 'of',
 'the',
 'european',
 'parliament',
 'adjourned',
 'on',
 'friday',
 '17',
 'december',
 '1999',
 ',',
 'and',
 'i',
 'would',
 'like',
 'once',
 'again',
 'to',
 'wish',
 'you',
 'a',
 'happy',
 'new',
 'year',
 'in',
 'the',
 'hope',
 'that',
 'you',
 'enjoyed',
 'a',
 'pleasant',
 'festive',
 'period',
 '.',
 'although',
 ',',
 'a',
 'you',
 'will',
 'have',
 'seen',
 ',',
 'the',
 'dreaded',
 "'",
 'millennium',
 'bug',
 "'",
 'failed',
 'to',
 'materialise',
 ',',
 'still',
 'the',
 'people',
 'in',
 'a',
 'number',
 'of',
 'country',
 'suffered',
 'a',
 'series',
 'of',
 'natural',
 'disaster',
 'that',
 'truly',
 'were',
 'dreadful',
 '.',
 'you',
 'have',
 'requested',
 'a',
 'debate',
 'on',
 'this',
 'subject',
 'in',
 'the',
 'course',
 'of',
 'the',
 'next',
 'few',
 'day',
 ',',
 'during',
 'this',
 'part-session',
 '.',
 'in',
 'the',
 'meantime',
 ',',
 'i',
 'should',
 'li

In [5]:
# Remove stop words
stop_words = set(stopwords.words("english"))
filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]

filtered_tokens

['resumption',
 'session',
 'declare',
 'resumed',
 'session',
 'european',
 'parliament',
 'adjourned',
 'friday',
 '17',
 'december',
 '1999',
 ',',
 'would',
 'like',
 'wish',
 'happy',
 'new',
 'year',
 'hope',
 'enjoyed',
 'pleasant',
 'festive',
 'period',
 '.',
 'although',
 ',',
 'seen',
 ',',
 'dreaded',
 "'",
 'millennium',
 'bug',
 "'",
 'failed',
 'materialise',
 ',',
 'still',
 'people',
 'number',
 'country',
 'suffered',
 'series',
 'natural',
 'disaster',
 'truly',
 'dreadful',
 '.',
 'requested',
 'debate',
 'subject',
 'course',
 'next',
 'day',
 ',',
 'part-session',
 '.',
 'meantime',
 ',',
 'like',
 'observe',
 'minute',
 "'",
 'silence',
 ',',
 'number',
 'member',
 'requested',
 ',',
 'behalf',
 'victim',
 'concerned',
 ',',
 'particularly',
 'terrible',
 'storm',
 ',',
 'various',
 'country',
 'european',
 'union',
 '.',
 'please',
 'rise',
 ',',
 ',',
 'minute',
 "'",
 'silence',
 '.',
 '(',
 'house',
 'rose',
 'observed',
 'minute',
 "'",
 'silence',
 ')',
 'm

In [6]:
# Remove special characters and punctuation
table = str.maketrans("", "", string.punctuation)
cleaned_tokens = [token.translate(table) for token in filtered_tokens]

# Perform POS tagging
tagged_tokens = nltk.pos_tag(cleaned_tokens)
tagged_tokens

[('resumption', 'NN'),
 ('session', 'NN'),
 ('declare', 'NN'),
 ('resumed', 'VBD'),
 ('session', 'NN'),
 ('european', 'JJ'),
 ('parliament', 'NN'),
 ('adjourned', 'VBD'),
 ('friday', 'RB'),
 ('17', 'CD'),
 ('december', 'NN'),
 ('1999', 'CD'),
 ('', 'NN'),
 ('would', 'MD'),
 ('like', 'VB'),
 ('wish', 'VB'),
 ('happy', 'JJ'),
 ('new', 'JJ'),
 ('year', 'NN'),
 ('hope', 'VBP'),
 ('enjoyed', 'VBN'),
 ('pleasant', 'JJ'),
 ('festive', 'JJ'),
 ('period', 'NN'),
 ('', 'NNP'),
 ('although', 'IN'),
 ('', 'NNP'),
 ('seen', 'VBN'),
 ('', 'NNP'),
 ('dreaded', 'VBD'),
 ('', 'NNP'),
 ('millennium', 'NN'),
 ('bug', 'NN'),
 ('', 'NNP'),
 ('failed', 'VBD'),
 ('materialise', 'NN'),
 ('', 'NN'),
 ('still', 'RB'),
 ('people', 'NNS'),
 ('number', 'NN'),
 ('country', 'NN'),
 ('suffered', 'VBD'),
 ('series', 'NN'),
 ('natural', 'JJ'),
 ('disaster', 'NN'),
 ('truly', 'RB'),
 ('dreadful', 'JJ'),
 ('', 'NN'),
 ('requested', 'VBN'),
 ('debate', 'NN'),
 ('subject', 'JJ'),
 ('course', 'NN'),
 ('next', 'JJ'),
 ('day'

# CREATING DATASET

In [1]:
import re

filename = 'europarl_en.txt'  # Update with your input file name

# Read the text file
with open(filename, 'r') as file:
    text = file.read()

# Split the text into sentences using ";" and ":" as separators
sentences = re.split(r'(?<=[;:!.?])\s', text)

# Clean the sentences by removing leading/trailing spaces and empty sentences
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]



In [9]:
import re
import pandas as pd

filename = 'europarl_en.txt'  # Update with your input file name

# Read the text file
with open(filename, 'r') as file:
    text = file.read()

# Split the text into sentences using various punctuation marks as separators
sentences = re.split(r'(?<=[;:!.?])\s', text)

# Clean the sentences by removing leading/trailing spaces and empty sentences
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

# Create the labels array with the same length as the sentences array
labels = [1] * len(sentences)

# Create the non-sentences by splitting the text without considering the punctuation marks
non_sentences = re.split(r'[;:!.?]\s', text)
non_sentences = [non_sentence.strip() for non_sentence in non_sentences if non_sentence.strip()]

# Extend the sentences and labels arrays with the non-sentences and labels 0
sentences.extend(non_sentences)
labels.extend([0] * len(non_sentences))

# Create a DataFrame with sentences and labels
data = {'Sentence': sentences, 'Label': labels}
df = pd.DataFrame(data)

# Shuffle the rows of the DataFrame
df = df.sample(frac=1, random_state=42)

# Convert the DataFrame to a CSV file
df.to_csv('sentences_labels.csv', index=True)

# Print the shuffled DataFrame
print(df)

                                               Sentence  Label
1632  But here I would like to call on my experience...      1
3892  If we can inject a spirit of entrepreneurial a...      0
2115  This is a classic example of how the partnersh...      1
2404  What conclusions has the Commission already dr...      1
3260  The White Paper also reflects the concerns of ...      1
...                                                 ...    ...
3772  Mr President , Commissioner , ah , now I see C...      0
5191  I believe that this is a report for which coop...      0
5226  Can I just make it clear that we have no commi...      0
5390  What Parliament needs to do as a matter of urg...      0
860   The Commission document expected to be ready i...      1

[6811 rows x 2 columns]


# CNN

In [17]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import string

# Load the dataset from CSV
data = pd.read_csv('sentences_labels.csv')

# Preprocess the sentences
def preprocess_text(sentence):
    # Remove leading/trailing spaces
    sentence = sentence.strip()
    # Replace specific punctuation marks with tokens
    sentence = sentence.replace('.', ' <period>')
    sentence = sentence.replace('!', ' <exclamation>')
    sentence = sentence.replace('?', ' <question>')
    sentence = sentence.replace(';', ' <semicolon>')
    sentence = sentence.replace(':', ' <colon>')
    # Remove other punctuation marks
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    # Lowercase the sentence
    sentence = sentence.lower()
    return sentence

data['Sentence'] = data['Sentence'].apply(preprocess_text)

# Split the dataset into sentences and labels
sentences = data['Sentence'].tolist()
labels = data['Label'].tolist()

# Create a tokenizer to convert words to integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Pad the sequences
maxlen = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=maxlen)
y = np.array(labels)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the CNN model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss}, Test accuracy: {accuracy}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.018355771899223328, Test accuracy: 0.9963316321372986


In [18]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Predict the probabilities for each class
y_pred_prob = model.predict(X_test)

# Convert probabilities to class labels based on a threshold
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[638   5]
 [  0 720]]


In [19]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Calculate precision, recall, F1-score, and support
report = classification_report(y_test, y_pred, labels=[0, 1])

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

# Print the classification report
print("Classification Report:")
print(report)


Confusion Matrix:
[[638   5]
 [  0 720]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       643
           1       0.99      1.00      1.00       720

    accuracy                           1.00      1363
   macro avg       1.00      1.00      1.00      1363
weighted avg       1.00      1.00      1.00      1363



In [20]:
import nltk
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download('punkt')

# Preprocess custom input
input_text = "This is a custom input! It contains Mr.Ann multiple sentences ; For e. g Not all sentences end with a period Mr. Ann "
#with open('alice.txt', 'r') as file:
 #   input_text = file.read()
input_sentences = nltk.sent_tokenize(input_text)
input_sequences = []
for sentence in input_sentences:
    words = nltk.word_tokenize(sentence)
    s = []
    for word in words:
        if word.lower() in tokenizer.word_index:
            s.append(tokenizer.word_index[word.lower()] + 1)
    input_sequences.append(s)
input_sequences = pad_sequences(input_sequences, maxlen=maxlen)

# Predict sentence endings using the trained model
predictions = model.predict(input_sequences)
predicted_labels = (predictions > 0.5).astype(np.int)

# Print the predicted labels for each sentence
for sentence, label in zip(input_sentences, predicted_labels):
    if label == 1:
        print(f"{sentence.strip()}.")
    else:
        print(f"{sentence.strip()}")

This is a custom input!
It contains Mr.Ann multiple sentences ; For e. g Not all sentences end with a period Mr. Ann


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  predicted_labels = (predictions > 0.5).astype(np.int)


# BI LSTM 

In [21]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

# Load the dataset from CSV
data = pd.read_csv('sentences_labels.csv')

# Preprocess the sentences
def preprocess_text(sentence):
    # Remove leading/trailing spaces
    sentence = sentence.strip()
    # Replace specific punctuation marks with tokens
    sentence = sentence.replace('.', ' <period>')
    sentence = sentence.replace('!', ' <exclamation>')
    sentence = sentence.replace('?', ' <question>')
    sentence = sentence.replace(';', ' <semicolon>')
    sentence = sentence.replace(':', ' <colon>')
    # Remove other punctuation marks
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    # Lowercase the sentence
    sentence = sentence.lower()
    return sentence

data['Sentence'] = data['Sentence'].apply(preprocess_text)

# Split the dataset into sentences and labels
sentences = data['Sentence'].tolist()
labels = data['Label'].tolist()

# Create a tokenizer to convert words to integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Pad the sequences
maxlen = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=maxlen)
y = np.array(labels)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the BiLSTM model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(128)),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss}, Test accuracy: {accuracy}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.018908241763710976, Test accuracy: 0.9963316321372986


In [22]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Predict the probabilities for each class
y_pred_prob = model.predict(X_test)

# Convert probabilities to class labels based on a threshold
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[638   5]
 [  0 720]]


In [23]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report


# Calculate precision, recall, F1-score, and support
report = classification_report(y_test, y_pred, labels=[0, 1])


# Print the classification report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       643
           1       0.99      1.00      1.00       720

    accuracy                           1.00      1363
   macro avg       1.00      1.00      1.00      1363
weighted avg       1.00      1.00      1.00      1363



In [24]:
import nltk
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download('punkt')

# Preprocess custom input
input_text = "This is a custom input! It contains Mr.Ann multiple sentences ; For e. g Not all sentences end with a period Mr. Ann "
#with open('alice.txt', 'r') as file:
 #   input_text = file.read()
input_sentences = nltk.sent_tokenize(input_text)
input_sequences = []
for sentence in input_sentences:
    words = nltk.word_tokenize(sentence)
    s = []
    for word in words:
        if word.lower() in tokenizer.word_index:
            s.append(tokenizer.word_index[word.lower()] + 1)
    input_sequences.append(s)
input_sequences = pad_sequences(input_sequences, maxlen=maxlen)

# Predict sentence endings using the trained model
predictions = model.predict(input_sequences)
predicted_labels = (predictions > 0.5).astype(np.int)

# Print the predicted labels for each sentence
for sentence, label in zip(input_sentences, predicted_labels):
    if label == 1:
        print(f"{sentence.strip()}.")
    else:
        print(f"{sentence.strip()}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


This is a custom input!
It contains Mr.Ann multiple sentences ; For e. g Not all sentences end with a period Mr. Ann


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  predicted_labels = (predictions > 0.5).astype(np.int)


# lstm

In [26]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Load the dataset from CSV
data = pd.read_csv('sentences_labels.csv')

# Preprocess the sentences
def preprocess_text(sentence):
    # Remove leading/trailing spaces
    sentence = sentence.strip()
    # Replace specific punctuation marks with tokens
    sentence = sentence.replace('.', ' <period>')
    sentence = sentence.replace('!', ' <exclamation>')
    sentence = sentence.replace('?', ' <question>')
    sentence = sentence.replace(';', ' <semicolon>')
    sentence = sentence.replace(':', ' <colon>')
    # Remove other punctuation marks
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    # Lowercase the sentence
    sentence = sentence.lower()
    return sentence

data['Sentence'] = data['Sentence'].apply(preprocess_text)

# Split the dataset into sentences and labels
sentences = data['Sentence'].tolist()
labels = data['Label'].tolist()

# Create a tokenizer to convert words to integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Pad the sequences
maxlen = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=maxlen)
y = np.array(labels)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the LSTM model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    LSTM(128),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss}, Test accuracy: {accuracy}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.014384440146386623, Test accuracy: 0.9985326528549194


In [27]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Predict the probabilities for each class
y_pred_prob = model.predict(X_test)

# Convert probabilities to class labels based on a threshold
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[641   2]
 [  0 720]]


In [28]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report


# Calculate precision, recall, F1-score, and support
report = classification_report(y_test, y_pred, labels=[0, 1])


# Print the classification report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       643
           1       1.00      1.00      1.00       720

    accuracy                           1.00      1363
   macro avg       1.00      1.00      1.00      1363
weighted avg       1.00      1.00      1.00      1363



In [29]:
import nltk
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download('punkt')

# Preprocess custom input
input_text = "This is a custom input! It contains Mr.Ann multiple sentences ; For e. g Not all sentences end with a period Mr. Ann "
#with open('alice.txt', 'r') as file:
 #   input_text = file.read()
input_sentences = nltk.sent_tokenize(input_text)
input_sequences = []
for sentence in input_sentences:
    words = nltk.word_tokenize(sentence)
    s = []
    for word in words:
        if word.lower() in tokenizer.word_index:
            s.append(tokenizer.word_index[word.lower()] + 1)
    input_sequences.append(s)
input_sequences = pad_sequences(input_sequences, maxlen=maxlen)

# Predict sentence endings using the trained model
predictions = model.predict(input_sequences)
predicted_labels = (predictions > 0.5).astype(np.int)

# Print the predicted labels for each sentence
for sentence, label in zip(input_sentences, predicted_labels):
    if label == 1:
        print(f"{sentence.strip()}.")
    else:
        print(f"{sentence.strip()}")

This is a custom input!
It contains Mr.Ann multiple sentences ; For e. g Not all sentences end with a period Mr. Ann


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  predicted_labels = (predictions > 0.5).astype(np.int)
