In [1]:
import pandas as pd

# Relative path of the workbook that holds the prompts and their labels
file_path = 'Gating_Mechanism_Data.xlsx'

# Read the workbook into a DataFrame
data = pd.read_excel(io=file_path)

# Preview the top five rows to check which columns were loaded
data.head(n=5)


Unnamed: 0,Prompt,Label
0,What is the source of the influenza virus enve...,Medical
1,How does the yellow fever virus acquire its en...,Medical
2,Why is the envelope of the varicella zoster vi...,Medical
3,"What type of virus is parvovirus B19, which ca...",Medical
4,From where does the Ebola virus derive its env...,Medical


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Normalise casing: overwrite the 'Prompt' column with its lower-cased text
data['Prompt'] = data['Prompt'].str.lower()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Target labels for both partitions
y_train, y_test = train_data['Label'], test_data['Label']

# Learn the TF-IDF vocabulary and weights on the training prompts only, then
# reuse the fitted vectorizer (transform, no refit) on the held-out prompts
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(raw_documents=train_data['Prompt'])
X_test = tfidf_vectorizer.transform(raw_documents=test_data['Prompt'])

# Fit a default-configured logistic regression (fit() returns the estimator itself)
model = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out prompts
y_pred = model.predict(X_test)

# Score the predictions: plain accuracy, plus precision / recall / F1
# averaged across classes with average='weighted'
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score_fn(y_test, y_pred, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Show the four metrics as the cell's output
accuracy, precision, recall, f1


(0.9968817462221156, 0.9969008495675974, 0.9968817462221156, 0.996881601585327)

In [3]:
# Relative path of the workbook used to test the classifier on medical questions
test_medical_file_path = 'Test_USMLE.xlsx'

# Read the workbook into its own DataFrame
test_medical_data = pd.read_excel(io=test_medical_file_path)

# Preview the first five rows (nothing is extracted from the frame here)
test_medical_data.head(n=5)


Unnamed: 0,Question,Answer
0,The 'fight or flight' response triggers the re...,The 'fight or flight' response triggers the re...
1,A 29-year-old woman is brought to the physicia...,A - Schizophrenia Explanation Why\nSchizophren...
2,A 45-year-old man is admitted to the hospital ...,"C-\n""The result is back but I need to discuss ..."
3,A 67-year-old man comes to the physician becau...,Patients with advanced silicosis are at increa...
4,A 33-year-old woman comes to the physician bec...,Dextromethorphan is an NMDA glutamate receptor...


In [4]:
# Overwrite the 'Question' column with its lower-cased text
test_medical_data['Question'] = test_medical_data['Question'].str.lower()

# Vectorise the questions with `tfidf_vectorizer` (transform only, no refit);
# this expects the vectorizer and `model` to have been fitted by an earlier cell
X_medical_test = tfidf_vectorizer.transform(raw_documents=test_medical_data['Question'])

# Classify every question and show the labels as a plain Python list
medical_test_predictions = model.predict(X=X_medical_test)
medical_test_predictions.tolist()


['Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical',
 'Medical']

In [5]:
# Relative path of the workbook used to test the classifier on coding prompts
test_code_file_path = 'Code Test.xlsx'

# Read the workbook into its own DataFrame
test_code_data = pd.read_excel(io=test_code_file_path)

# Preview the first five rows (nothing is extracted from the frame here)
test_code_data.head(n=5)


Unnamed: 0,User,AI
0,"Power of Four\nGiven an integer n, return true...","To check if an integer `n` is a power of four,..."
1,Longest Palindromic Substring\nGiven a string ...,To find the longest palindromic substring in a...
2,"Given an integer n, your task is to count how ...","To solve this problem, we can use dynamic prog..."
3,Poor Pigs\nThere are buckets buckets of liquid...,"To solve this problem, we need to think about ..."
4,Find Mode in Binary Search Tree\nGiven the roo...,To find the mode(s) in a BST without using ext...


In [7]:
# Replace missing 'User' entries with empty strings so the vectorizer only
# receives str values. The filled column is assigned back to the DataFrame
# rather than calling fillna(..., inplace=True) on the `test_code_data['User']`
# selection: that chained in-place form emits a FutureWarning on recent
# pandas 2.x and no longer updates the DataFrame once Copy-on-Write is the
# default (pandas 3.0), which would let NaN reach the vectorizer.
test_code_data['User'] = test_code_data['User'].fillna("")

# Vectorise the prompts with `tfidf_vectorizer` (transform only, no refit);
# this expects the vectorizer and `model` to have been fitted by an earlier cell
X_code_test = tfidf_vectorizer.transform(test_code_data['User'])

# Predict a label for every prompt and show the labels as a plain Python list
code_test_predictions = model.predict(X_code_test)
code_test_predictions.tolist()


['Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Medical',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code',
 'Code']