<a href="https://colab.research.google.com/github/BraydenAC/510-HIPA-AI/blob/Updated-Model/HIPA_AI_Baseline_Formatted.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score

class Model:
    """Text classifier: frozen BERT sentence embeddings + logistic regression.

    Each text is tokenized and passed through a pre-trained
    ``bert-base-uncased`` encoder in inference mode. The hidden state at
    token position 0 (BERT's ``[CLS]`` slot) is used as the feature vector
    for a scikit-learn ``LogisticRegression``. Only the logistic regression
    is trained; the encoder weights are never updated.
    """

    def __init__(self, batch_size=32, max_length=128):
        """Load the pre-trained encoder and create the untrained classifier.

        Args:
            batch_size: Number of texts encoded per forward pass. Bounds the
                peak memory used while embedding a large corpus.
            max_length: Token limit handed to the tokenizer; longer texts
                are truncated to this length.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError('batch_size must be at least 1')

        # Initialize pre-trained BERT
        self.model_name = 'bert-base-uncased'
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModel.from_pretrained(self.model_name)
        # eval() disables dropout so repeated encodings are deterministic.
        self.model.eval()
        self.clf = LogisticRegression(max_iter=1000)
        self.batch_size = batch_size
        self.max_length = max_length

    def _embed(self, X):
        """Return one position-0 embedding row per input text.

        Args:
            X: A single string or an iterable of strings.

        Returns:
            A 2-D NumPy array with one row per text.

        Raises:
            ValueError: If ``X`` contains no texts.
        """
        # A bare string is one text, not an iterable of characters.
        texts = [X] if isinstance(X, str) else list(X)
        if not texts:
            raise ValueError('X must contain at least one text')

        chunks = []
        with torch.no_grad():
            # Encode in fixed-size batches so memory use does not grow with
            # the size of the dataset.
            for start in range(0, len(texts), self.batch_size):
                batch = texts[start:start + self.batch_size]
                inputs = self.tokenizer(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors='pt',
                )
                outputs = self.model(**inputs)
                # clone() detaches the slice from the full hidden-state
                # tensor so that larger tensor can be freed per batch.
                chunks.append(outputs.last_hidden_state[:, 0, :].clone())
        return torch.cat(chunks, dim=0).numpy()

    def fit(self, X, y):
        """Embed the training texts and fit the logistic regression on them.

        Args:
            X: Training texts (a string or an iterable of strings).
            y: Labels aligned with ``X``.

        Raises:
            ValueError: If ``X`` contains no texts.
        """
        embeddings = self._embed(X)

        # Train the logistic regression model on the transformed data
        self.clf.fit(embeddings, y)

    def predict(self, X):
        """Embed new texts and return the classifier's predicted labels.

        Args:
            X: Texts to classify (a string or an iterable of strings).

        Returns:
            Whatever ``self.clf.predict`` returns for the embedded texts.

        Raises:
            ValueError: If ``X`` contains no texts.
        """
        embeddings = self._embed(X)

        # Predict labels
        return self.clf.predict(embeddings)

# ---- Data -------------------------------------------------------------
# Read the annotations CSV; the file is decoded as ISO-8859-1.
df = pd.read_csv(
    '/content/Compiled Annotations CSV.csv',
    encoding='ISO-8859-1',
)

# Quick sanity check: report (rows, columns) of the loaded frame.
print(f'Dataset shape: {df.shape}')

# Input texts come from the 'Features' column, targets from 'Label'.
X, y = df['Features'].tolist(), df['Label'].tolist()

# ---- Train / test split -----------------------------------------------
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# ---- Training ---------------------------------------------------------
model_instance = Model()
model_instance.fit(X_train, y_train)

# ---- Evaluation -------------------------------------------------------
# Label the held-out texts and show the raw predictions.
predictions = model_instance.predict(X_test)
print(f'Predictions: {predictions}')

# Score against the held-out labels; F1 treats 'Yes' as the positive class.
f1 = f1_score(y_test, predictions, pos_label='Yes')
accuracy = accuracy_score(y_test, predictions)

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')

Dataset shape: (100, 2)




Predictions: ['Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes'
 'Yes' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No'
 'No' 'No' 'No' 'Yes']
Accuracy: 0.63
F1 Score: 0.59
