In [38]:
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, Bidirectional
from nltk.tokenize import sent_tokenize
from tensorflow.keras.regularizers import l1, l2

# Read the labelled sentences and discard any row that is missing either the
# sentence text or its label.
data = pd.read_csv("augmented_data2_og.csv", encoding='ISO-8859-1')
data = data.dropna(subset=['text', 'label'])

# Raw document whose sentences are classified further down the notebook.
with open("doc1.txt", 'r') as doc_file:
    test_data = doc_file.read()

# spaCy pipeline used for named-entity recognition.
nlp = spacy.load("en_core_web_sm")



In [39]:
# Function to extract countries using spaCy
def extract_countries(text):
    """Return the text of every entity in ``text`` that spaCy labels ``GPE``.

    Runs the module-level ``nlp`` pipeline on ``text`` and keeps the entities
    whose ``label_`` is ``'GPE'``, in the order spaCy reports them.

    NOTE(review): spaCy's GPE label is documented as covering cities and
    states as well as countries, so the result may contain non-country
    names -- confirm this is acceptable for the use case.
    """
    gpe_mentions = []
    for entity in nlp(text).ents:
        if entity.label_ == 'GPE':
            gpe_mentions.append(entity.text)
    return gpe_mentions

# Preprocess text data
texts = data['text'].values
labels = data['label'].values

# Fit the vocabulary on the texts, then map each text to a list of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Bring every sequence to length 100, the same length the models' Embedding
# layers declare through input_length.
X = pad_sequences(sequences, maxlen=100)
y = np.array(labels)

# Split data: 80% train / 20% test, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [40]:
# Define RNN model
def create_rnn_model():
    """Build and compile the SimpleRNN-based binary classifier.

    Layer stack, in order: 128-dimensional Embedding over sequences of
    length 100, a bidirectional SimpleRNN(64) that returns the full sequence,
    a SimpleRNN(64), then Dense(128, relu) -> Dropout(0.5) -> Dense(64, relu)
    -> Dropout(0.5) -> Dense(1, sigmoid).

    The vocabulary size is read from the module-level ``tokenizer``.

    Returns:
        The model compiled with the Adam optimizer, binary cross-entropy loss
        and an accuracy metric.
    """
    # +1 on top of the number of words known to the tokenizer.
    vocab_size = len(tokenizer.word_index) + 1

    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length=100))
    model.add(Bidirectional(SimpleRNN(64, return_sequences=True)))
    model.add(SimpleRNN(64))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model



In [41]:
# Train and evaluate RNN model
rnn_model = create_rnn_model()
rnn_model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_data=(X_test, y_test),
)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
rnn_probabilities = rnn_model.predict(X_test)
y_pred = (rnn_probabilities > 0.5).astype("int32")
rnn_accuracy = accuracy_score(y_test, y_pred)
print("RNN Test set accuracy:", rnn_accuracy)



Epoch 1/2
Epoch 2/2
RNN Test set accuracy: 1.0


In [42]:
# Classify and extract countries
def classify_and_extract(model, sentences):
    """Return the sentences the model flags as class 1 that also mention a GPE.

    All sentences are encoded with the module-level ``tokenizer``, padded to
    length 100 and scored in a single ``model.predict`` call (instead of one
    call per sentence). A sentence is kept when its predicted score is above
    0.5 and ``extract_countries`` finds at least one entity in it.

    Args:
        model: Fitted Keras model whose ``predict`` returns one score per row.
        sentences: Iterable of sentence strings.

    Returns:
        A list of ``(sentence, countries)`` tuples in input order, where
        ``countries`` is the non-empty list from ``extract_countries``.
        Empty input yields an empty list without calling the model.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to score; avoid calling predict on an empty batch.
        return []

    padded = pad_sequences(tokenizer.texts_to_sequences(sentences), maxlen=100)
    # verbose=0 keeps the progress bar out of the notebook output.
    flagged = (model.predict(padded, verbose=0) > 0.5).astype("int32")[:, 0]

    results = []
    for sentence, is_flagged in zip(sentences, flagged):
        if is_flagged == 1:
            countries = extract_countries(sentence)
            if countries:
                results.append((sentence, countries))
    return results

# Tokenize test data into sentences
sentences = sent_tokenize(test_data)

# Print classification results for RNN model
print("RNN Model Exclusion Sentences with Countries:")
rnn_matches = classify_and_extract(rnn_model, sentences)
for matched_sentence, matched_countries in rnn_matches:
    print("Sentence:", matched_sentence)
    print("Countries:", matched_countries)
    print()


RNN Model Exclusion Sentences with Countries:
Sentence: The primary focus was on verifying the origin and involvement of vessels engaged in our trade routes, specifically ensuring that no vessels from Iran were involved in any capacity.
Countries: ['Iran']

Sentence: All Documents provided to evidence no Iran vessel involved were thoroughly examined.
Countries: ['Iran']

Sentence: Documentation confirming no Iran vessel involved in trading was collected and reviewed.
Countries: ['Iran']

Sentence: All documents consistently indicated that the vessels used were not registered in Iran nor did they originate from Iranian ports.
Countries: ['Iran']

Sentence: Proof of no Indonesia vessel involvement through submitted documents was validated by the audit team, ensuring transparency and accuracy in our reporting.
Countries: ['Indonesia']

Sentence: The comprehensive review and audit of our trading activities and associated documentation confirm that there has been no involvement of  South ko

In [43]:
import pickle
from tensorflow.keras.models import save_model

# Write the trained RNN to disk.
rnn_model_path = 'rnn_model.h5'
save_model(rnn_model, rnn_model_path)

# Pickle the fitted tokenizer as well: it holds the word index that was used
# to encode the training texts.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)


In [44]:


# Define LSTM model
def create_lstm_model():
    """Build and compile the regularised LSTM binary classifier.

    Layer stack, in order: 128-dimensional Embedding over sequences of
    length 100, LSTM(64) returning the full sequence, LSTM(64), then
    Dense(128, relu) -> Dropout(0.5) -> Dense(64, relu) -> Dropout(0.5)
    -> Dense(1, sigmoid). The LSTM layers and the output layer use an
    L2(0.01) kernel regulariser; the two hidden Dense layers use L1(0.01).

    The vocabulary size is read from the module-level ``tokenizer``.

    Returns:
        The model compiled with the Adam optimizer, binary cross-entropy loss
        and an accuracy metric.
    """
    # The notebook's import cell pulls in SimpleRNN but not LSTM, so on a fresh
    # kernel the name would be undefined. Import it here so this block works
    # on its own.
    from tensorflow.keras.layers import LSTM

    model = Sequential([
        Embedding(len(tokenizer.word_index) + 1, 128, input_length=100),
        LSTM(64, return_sequences=True, kernel_regularizer=l2(0.01)),
        LSTM(64, kernel_regularizer=l2(0.01)),
        Dense(128, activation='relu', kernel_regularizer=l1(0.01)),
        Dropout(0.5),
        Dense(64, activation='relu', kernel_regularizer=l1(0.01)),
        Dropout(0.5),
        Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01))
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train and evaluate LSTM model
lstm_model = create_lstm_model()
lstm_model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_data=(X_test, y_test),
)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
lstm_probabilities = lstm_model.predict(X_test, verbose=0)
y_pred = (lstm_probabilities > 0.5).astype("int32")
lstm_accuracy = accuracy_score(y_test, y_pred)
print("LSTM Test set accuracy:", lstm_accuracy)

# Save the LSTM model
lstm_model_path = 'lstm_model.h5'
save_model(lstm_model, lstm_model_path)

# Save the tokenizer (same file name the RNN cell writes to)
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)


Epoch 1/2
Epoch 2/2
LSTM Test set accuracy: 1.0


In [None]:
import numpy as np
import pandas as pd
import spacy
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize
from datasets import Dataset, load_metric

# Read the labelled sentences and discard any row that is missing either the
# sentence text or its label.
data = pd.read_csv("augmented_data2_og.csv", encoding='ISO-8859-1')
data = data.dropna(subset=['text', 'label'])

# Raw document whose sentences are classified at the end of this cell.
with open("doc2", 'r') as doc_file:
    test_data = doc_file.read()

# spaCy pipeline used for named-entity recognition.
nlp = spacy.load("en_core_web_sm")

# Function to extract countries using spaCy
def extract_countries(text):
    """Collect the text of each ``GPE`` entity spaCy finds in ``text``.

    The module-level ``nlp`` pipeline is applied to ``text``; entities whose
    ``label_`` equals ``'GPE'`` are returned as a list of strings, in the
    order spaCy reports them.
    """
    found = []
    for entity in nlp(text).ents:
        if entity.label_ != 'GPE':
            continue
        found.append(entity.text)
    return found

# Preprocess text data
texts, labels = data['text'].values, data['label'].values

# Tokenize data using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=100)

# The model below has a single output logit trained with a sigmoid /
# binary-cross-entropy loss. That loss needs floating-point targets with the
# same shape as the logits, so each label is stored as a one-element float row.
dataset = Dataset.from_dict({
    'input_ids': encodings['input_ids'],
    'attention_mask': encodings['attention_mask'],
    'labels': [[float(label)] for label in labels]
})

# Split dataset into training and testing sets
train_dataset, test_dataset = dataset.train_test_split(test_size=0.2, seed=42).values()

# Load BERT model
# With num_labels=1 and no problem_type, the model defaults to a regression
# (mean-squared-error) loss. That loss rejects integer labels and does not
# match the sigmoid > 0.5 decision rule used in compute_metrics and at
# inference. problem_type="multi_label_classification" selects the
# sigmoid/BCE loss, which does.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=1,
    problem_type="multi_label_classification",
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    logging_first_step=True,  # To log the first step
    log_level='error'  # Suppress unnecessary logging
)

# Define metric for evaluation
# NOTE(review): load_metric is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package -- confirm the installed version
# still provides it.
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")

# Compute metrics
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a Trainer evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (raw model logits) and
            ``label_ids`` (reference labels), as passed by ``Trainer``.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    # Labels are stored as one-element float rows for the loss; the metrics
    # take a flat array of integer class ids.
    labels = np.asarray(pred.label_ids).flatten().astype(int)
    # Logit -> probability -> hard 0/1 prediction at the 0.5 threshold.
    logits = torch.tensor(pred.predictions)
    preds = (torch.sigmoid(logits) > 0.5).int().numpy().flatten()
    acc = accuracy_metric.compute(predictions=preds, references=labels)['accuracy']
    prec = precision_metric.compute(predictions=preds, references=labels)['precision']
    rec = recall_metric.compute(predictions=preds, references=labels)['recall']
    f1 = f1_metric.compute(predictions=preds, references=labels)['f1']
    return {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1}

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model and print every metric whose key carries the "eval_"
# prefix, with that prefix stripped.
results = trainer.evaluate()
print("Evaluation Metrics:")
eval_prefix = "eval_"
for metric_key, metric_value in results.items():
    if not metric_key.startswith(eval_prefix):
        continue
    metric_name = metric_key[len(eval_prefix):]
    print(f"{metric_name.capitalize()}: {metric_value:.4f}")

import pickle

# Destination file for the pickled model object
model_path_pickle = "bert_model.pkl"

# Serialise the in-memory model object with pickle
with open(model_path_pickle, 'wb') as model_file:
    pickle.dump(model, model_file)

print("Model saved successfully at:", model_path_pickle)

# Function to classify and extract countries
def classify_and_extract(model, sentences):
    """Return the sentences the model scores above 0.5 that also mention a GPE.

    The sentences are encoded in one batch with the module-level ``tokenizer``
    (truncated/padded, at most 100 tokens), run through ``model`` without
    gradient tracking, and the sigmoid of each logit is compared with 0.5.

    Args:
        model: Sequence-classification model exposing ``device``, ``eval``,
            ``train`` and returning an object with ``logits``.
        sentences: Iterable of sentence strings.

    Returns:
        A set of ``(sentence, countries)`` tuples, where ``countries`` is the
        non-empty tuple of entities from ``extract_countries``. Empty input
        yields an empty set without calling the tokenizer or the model.
    """
    sentences = list(sentences)
    if not sentences:
        return set()

    results = set()
    encodings = tokenizer(sentences, truncation=True, padding=True, max_length=100, return_tensors="pt")
    # The model may have been moved to an accelerator during training; the
    # inputs have to be on the same device as its weights.
    encodings = encodings.to(model.device)

    # Score in eval mode (dropout off), then restore the mode we found.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**encodings)
    finally:
        model.train(was_training)

    # .cpu() is needed before .numpy() when the logits are not on the CPU.
    predictions = torch.sigmoid(outputs.logits).cpu().numpy().flatten()
    for sentence, prediction in zip(sentences, predictions):
        if prediction > 0.5:
            countries = extract_countries(sentence)
            if countries:
                results.add((sentence, tuple(countries)))
    return results

# Tokenize test data into sentences
sentences = sent_tokenize(test_data)


print("BERT Model Exclusion Sentences with Countries:")
bert_matches = classify_and_extract(model, sentences)
for matched_sentence, matched_countries in bert_matches:
    print("Sentence:", matched_sentence)
    print("Countries:", matched_countries)
    print()

In [None]:
import pickle

# Pickle the BERT tokenizer
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Save only the model's weights (its state dict)
bert_weights = model.state_dict()
torch.save(bert_weights, 'bert_model.pt')


In [46]:
import pickle
import pandas as pd
import spacy
from nltk.tokenize import sent_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split

# Labelled sentences used to fit the classifiers
data = pd.read_csv("augmented_data2_og.csv", encoding='ISO-8859-1')

# Document whose sentences get classified after training
file_path = "doc1.txt"
with open(file_path, 'r') as doc_file:
    test_data = doc_file.read()

# spaCy pipeline used for named-entity recognition
nlp = spacy.load("en_core_web_sm")

# Function to extract countries from text using spaCy
def extract_countries(text):
    """Return the text of every ``GPE`` entity spaCy finds in ``text``.

    Uses the module-level ``nlp`` pipeline; the result is a list of strings
    in the order the entities appear in ``doc.ents``.
    """
    return [entity.text for entity in nlp(text).ents if entity.label_ == 'GPE']

# Discard rows that are missing either the sentence text or its label.
data = data.dropna(subset=['text', 'label'], how='any')

# Bag-of-words features: fit the vocabulary on the text column and transform
# it in the same call.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['text'])
y = data['label']

# Split data: 80% train / 20% test, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Initialize classifiers
classifiers = {}
classifiers['Random Forest'] = RandomForestClassifier()
classifiers['Naive Bayes'] = MultinomialNB()

# Function to evaluate and print results for each classifier
def evaluate_classifier(name, classifier, X_train, y_train, X_test, y_test):
    """Fit ``classifier``, report its scores, and print flagged test sentences.

    Steps:
      1. Fit ``classifier`` on ``X_train`` / ``y_train``.
      2. Print 5-fold cross-validation scores computed on the training split.
      3. Print accuracy on ``X_test`` / ``y_test``.
      4. Split the module-level ``test_data`` into sentences, vectorise them
         with the module-level ``vectorizer``, and print every sentence
         predicted as class 1 that contains at least one entity returned by
         ``extract_countries``.

    Args:
        name: Label printed in the report header.
        classifier: Estimator with ``fit`` and ``predict``; fitted in place.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        None. All results are printed.
    """
    print(f"Classifier: {name}")
    classifier.fit(X_train, y_train)

    # Cross-validate on the training split passed in, not on the notebook-level
    # X / y: those include the held-out test rows and would leak them into CV.
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", cv_scores.mean())
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Test set accuracy:", accuracy)

    # Tokenize the test data into sentences
    sentences = sent_tokenize(test_data)

    # Sentences labeled as exclusion, paired with the extracted countries
    exclusion_with_countries = []

    # Vectorise and classify all sentences in one batch; skip entirely when
    # there are none, since predict has nothing to score.
    if sentences:
        features = vectorizer.transform(sentences)
        classifications = classifier.predict(features)
        for sentence, classification in zip(sentences, classifications):
            if classification == 1:  # Exclusion labels
                countries = extract_countries(sentence)
                if countries:  # Only include sentences with country names
                    exclusion_with_countries.append((sentence, countries))

    # Print sentences labeled as exclusion along with extracted countries
    for sentence, countries in exclusion_with_countries:
        print("Sentence:", sentence)
        print("Countries:", countries)
        print()

    print("\n" + "="*80 + "\n")

# Random forests are randomised; fix the seed (same value as the train/test
# split) so the reported scores and the pickled model are reproducible.
rf_classifier = RandomForestClassifier(random_state=42)
nb_classifier = MultinomialNB()

# Evaluate each classifier, then pickle the fitted estimator.
for clf_name, clf, clf_path in (
    ('Random Forest', rf_classifier, 'random_forest_model.pkl'),
    ('Naive Bayes', nb_classifier, 'naive_bayes_model.pkl'),
):
    evaluate_classifier(clf_name, clf, X_train, y_train, X_test, y_test)
    with open(clf_path, 'wb') as model_file:
        pickle.dump(clf, model_file)

# Save the vectorizer: it holds the vocabulary the classifiers were fitted on.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)


Classifier: Random Forest
Cross-validation scores: [1. 1. 1. 1. 1.]
Mean cross-validation score: 1.0
Test set accuracy: 1.0
Sentence: The primary focus was on verifying the origin and involvement of vessels engaged in our trade routes, specifically ensuring that no vessels from Iran were involved in any capacity.
Countries: ['Iran']

Sentence: All Documents provided to evidence no Iran vessel involved were thoroughly examined.
Countries: ['Iran']

Sentence: Documentation confirming no Iran vessel involved in trading was collected and reviewed.
Countries: ['Iran']

Sentence: All documents consistently indicated that the vessels used were not registered in Iran nor did they originate from Iranian ports.
Countries: ['Iran']

Sentence: Proof of no Indonesia vessel involvement through submitted documents was validated by the audit team, ensuring transparency and accuracy in our reporting.
Countries: ['Indonesia']

Sentence: The comprehensive review and audit of our trading activities and as