In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import re
import string
import emoji
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
df = pd.read_csv('train_data.csv')

In [3]:
df.head

<bound method NDFrame.head of                                               comments           Label
0    Currently dealing with them, left 13 months in...  Medical Doctor
1    Do you mind sharing how many years out you are...  Medical Doctor
2    I'm a physician, my spouse is a vet so I have ...  Medical Doctor
3    My thoughts exactly - human physician here. Ev...  Medical Doctor
4    Quit. Leave. Don’t show up. They will figure o...  Medical Doctor
..                                                 ...             ...
117  What's the solution to Mysterium VPN wrecking ...           Other
118  Cool but What about the original decentralised...           Other
119  Fellow european here. I would guess it greatly...           Other
120  Anyone who's made it to 4th year can be a vete...           Other
121  i ended up calling the vet clinic that partner...           Other

[122 rows x 2 columns]>

In [4]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


# Preprocessing function
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove user @ references and '#' from text
    text = re.sub(r'\@\w+|\#','', text)
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    # Remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove mutiple spaces with single space
    text = re.sub(r'\s+', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = ' '.join(text.split())
    # Remove leading/trailing whitespace
    text = text.strip()  
    # Remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)

    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\femibewaji\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Apply preprocessing to the comments
df['comments'] = df['comments'].apply(preprocess_text)

In [6]:
# Check for and drop rows with missing target values and comments
df = df.dropna(subset=['comments', 'Label'])

# Verify no NaNs in the dataset
print("Number of NaNs in the dataset after dropping: ", df.isnull().sum().sum())

# Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(df['comments'], df['Label'], test_size=0.2, random_state=42)

# Verify no NaNs in the target variable after split
print("Number of NaNs in y_train: ", y_train.isnull().sum())
print("Number of NaNs in y_test: ", y_test.isnull().sum())

# Check for NaN values in y_train and y_test and fill them with 'other'
y_train = y_train.fillna("other")
y_test = y_test.fillna("other")

# Ensure there are no NaNs in the labels
print("Number of NaNs in y_train after filling: ", y_train.isnull().sum())
print("Number of NaNs in y_test after filling: ", y_test.isnull().sum())

# Convert labels to integers
label_map = {'Medical Doctor': 0, 'Veterinarian': 1, 'Other': 2}
y_train = y_train.map(label_map)
y_test = y_test.map(label_map)

# Identify unexpected labels
unexpected_labels_train = y_train[y_train.isna()]
unexpected_labels_test = y_test[y_test.isna()]

print("Unexpected labels in y_train:", unexpected_labels_train)
print("Unexpected labels in y_test:", unexpected_labels_test)

# Fill NaNs resulting from unexpected labels with the value for 'other'
y_train = y_train.fillna(label_map['Other'])
y_test = y_test.fillna(label_map['Other'])

# Verify no NaNs in the mapped labels
print("Number of NaNs in y_train after mapping: ", y_train.isnull().sum())
print("Number of NaNs in y_test after mapping: ", y_test.isnull().sum())

# Convert to integer arrays
y_train = y_train.astype(int).values
y_test = y_test.astype(int).values


Number of NaNs in the dataset after dropping:  0
Number of NaNs in y_train:  0
Number of NaNs in y_test:  0
Number of NaNs in y_train after filling:  0
Number of NaNs in y_test after filling:  0
Unexpected labels in y_train: Series([], Name: Label, dtype: int64)
Unexpected labels in y_test: Series([], Name: Label, dtype: int64)
Number of NaNs in y_train after mapping:  0
Number of NaNs in y_test after mapping:  0


In [7]:
# Logistic Regression Model
# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [8]:
# Handle class imbalance in training data
smote = SMOTE(random_state=42)
X_train_tfidf_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train)

In [9]:
# Train the Logistic Regression model
log_reg = LogisticRegression(max_iter=200, class_weight='balanced',penalty='l2')

# Check class distribution again before fitting
unique_classes, counts = np.unique(y_train, return_counts=True)
print("Class distribution in y_train before fitting logistic regression:")
for cls, count in zip(unique_classes, counts):
    print(f"Class {cls}: {count}")

    
log_reg.fit(X_train_tfidf, y_train)

Class distribution in y_train before fitting logistic regression:
Class 0: 26
Class 1: 36
Class 2: 35


In [11]:
# Model 2: BERT
# Initialize BERT tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Tokenize the data
def encode_texts(texts, tokenizer, max_length=128):
    return tokenizer(
        texts.tolist(),
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )


In [13]:
X_train_enc = encode_texts(X_train, bert_tokenizer)
X_test_enc = encode_texts(X_test, bert_tokenizer)

In [14]:
# Train the BERT model
bert_model.compile(
    optimizer=Adam(learning_rate=3e-5),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=[SparseCategoricalAccuracy()]
)

In [15]:
bert_model.fit(
    X_train_enc.data,
    y_train,
    epochs=3,
    batch_size=16,
    validation_split=0.1
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x153491754d0>

In [17]:
# Function to classify new text input
def classify_text(comment, log_reg_model, bert_model, tfidf_vectorizer, bert_tokenizer):
    # Preprocess and vectorize the comment for Logistic Regression
    comment_processed = preprocess_text(comment)
    comment_tfidf = tfidf_vectorizer.transform([comment_processed])
    
    # Predict using Logistic Regression
    pred_lr = log_reg_model.predict(comment_tfidf)
    
    # Tokenize for BERT prediction
    comment_tokenized = bert_tokenizer(comment_processed, return_tensors='tf', padding=True, truncation=True, max_length=128)
    pred_bert_logits = bert_model.predict(comment_tokenized)[0]
    pred_bert = np.argmax(pred_bert_logits, axis=1)
    
    # Map integer labels back to categories
    inv_label_map = {0: 'medical doctor', 1: 'veterinarian', 2: 'other'}
    
    return {
        "Logistic Regression Prediction": inv_label_map[pred_lr[0]],
        "BERT Prediction": inv_label_map[pred_bert[0]]
    }


In [18]:
# Test the function with a new comment
new_comment = "It's was very hard for me too at the beginning, I was stressing out, but with time and training it will be easier ! I was doing 1h30 at the beginning and now I m doing 30 min! Just have to be patient and train :)|Hello, thank you for this encouragement and sharing your experience! Im actually working in small GP, but I will look for a new place with someone who will be able to train me instead to tell me I'm not good enough. Thank you :)"
print(classify_text(new_comment, log_reg, bert_model, vectorizer, bert_tokenizer))

{'Logistic Regression Prediction': 'veterinarian', 'BERT Prediction': 'veterinarian'}
