In [2]:
import pandas as pd
from sqlalchemy import create_engine
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Connect to the PostgreSQL database
db_url = "postgresql://niphemi.oyewole:W7bHIgaN1ejh@ep-delicate-river-a5cq94ee-pooler.us-east-2.aws.neon.tech/Vetassist"
engine = create_engine(db_url)

# Step 1: Get the total number of rows in the table
total_rows_query = "SELECT COUNT(*) FROM reddit_usernames_comments;"
total_rows = engine.execute(total_rows_query).scalar()

# Step 2: Calculate the offset to get the last 2000 rows
offset = max(0, total_rows - 2000)

# Step 3: Retrieve the last 2000 rows using OFFSET and LIMIT
query = f"""
SELECT username, comments
FROM reddit_usernames_comments
OFFSET {offset} LIMIT 2000;
"""
df = pd.read_sql(query, engine)

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Function to preprocess text using LLM
def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply preprocessing to comment_text column
df['clean_comment'] = df['comments'].apply(preprocess_text)

# Function to classify comments into labels after preprocessing it
def classify_comment(text):
    if 'doctor' in text or 'consultants to doctors' in text:
        return 'Medical Doctor'
    elif 'veterinarian' in text or 'consultants to vet' in text:
        return 'Veterinarian'
    else:
        return 'Other'

# Apply classification function to create label column
df['label'] = df['comments'].apply(classify_comment)


# Vectorize the cleaned text
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['clean_comment'])

# Example labels (replace with your actual labels)
y = df['label']  # 'label' should be your column with labeled categories

# Split data into training and evaluation sets
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SVM classifier
clf = SVC(kernel='linear')

# Train the classifier
clf.fit(X_train, y_train)

# Predict on the evaluation set
y_pred = clf.predict(X_eval)

# Evaluate performance
accuracy = accuracy_score(y_eval, y_pred)
report = classification_report(y_eval, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

# New comment to classify
new_comment = "My friend visited me where I am working as a medical doctor"

# Preprocess and vectorize the new comment
new_comment_cleaned = preprocess_text(new_comment)
new_comment_vectorized = vectorizer.transform([new_comment_cleaned])

# Predict the label for the new comment
new_comment_pred = clf.predict(new_comment_vectorized)

# Print the prediction
print(f"The predicted label for the new comment is: {new_comment_pred[0]}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chukwuemeka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chukwuemeka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.96
Classification Report:
                precision    recall  f1-score   support

Medical Doctor       1.00      0.69      0.82        26
         Other       0.96      0.99      0.98       365
  Veterinarian       0.60      0.33      0.43         9

      accuracy                           0.96       400
     macro avg       0.85      0.67      0.74       400
  weighted avg       0.96      0.96      0.96       400

The predicted label for the new comment is: Medical Doctor
