**1.Data Loading**

In [2]:
# Install simpletransformers package
!pip install simpletransformers



Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.44.0-py3-none-any.whl.metadata (8.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->sim

In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
# Load the dataset (replace with your dataset path)
data = pd.read_csv('Processed_dataset.csv')

In [6]:
# Exploratory Data Analysis (EDA)
print(data.info())  # Overview of data structure
print(data['Output'].value_counts())  # Class distribution

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Input                   1000 non-null   object 
 1   Output                  1000 non-null   object 
 2   cleaned_text            1000 non-null   object 
 3   char_count              1000 non-null   int64  
 4   word_count              1000 non-null   int64  
 5   avg_word_length         1000 non-null   float64
 6   stopword_count          1000 non-null   int64  
 7   unique_word_count       1000 non-null   int64  
 8   sentiment_polarity      1000 non-null   float64
 9   sentiment_subjectivity  1000 non-null   float64
 10  Encoded_Output          1000 non-null   int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 86.1+ KB
None
Output
Rejected    587
Approved    413
Name: count, dtype: int64


In [7]:
# Split dataset into train and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

In [8]:
 #Preparing the data in the correct format for SimpleTransformers
train_df = pd.DataFrame({
    'cleaned_text': train_data['cleaned_text'],
    'Output': train_data['Output']
})

val_df = pd.DataFrame({
    'cleaned_text': val_data['cleaned_text'],
    'Output': val_data['Output']
})

**2.TEXT PREPROCESSING**

In [9]:
import re

In [10]:
# Define a function to clean text data
def clean_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = text.strip()

    return text

In [11]:
# Apply the cleaning function to the dataset
train_df['cleaned_text'] = train_df['cleaned_text'].apply(clean_text)
val_df['cleaned_text'] = val_df['cleaned_text'].apply(clean_text)

print(train_df.head())

                                          cleaned_text    Output
29   applicant earns annually credit score outstand...  Rejected
535  applicant earns annually credit score outstand...  Approved
695  applicant earns annually credit score outstand...  Rejected
557  applicant earns annually credit score outstand...  Rejected
836  applicant earns annually credit score outstand...  Approved


**3.Text Embedding using BERT and RoBERTa**

In [12]:
from simpletransformers.classification import ClassificationModel

In [13]:
# Create a BERT model for text classification
bert_model = ClassificationModel('bert', 'bert-base-uncased', num_labels=2, use_cuda=False)  # Set use_cuda=True if using a GPU

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
# Create a RoBERTa model for text classification
roberta_model = ClassificationModel('roberta', 'roberta-base', num_labels=2, use_cuda=False)  # Set use_cuda=True if using a GPU

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

**4. Model Training with BERT and RoBERTa`**

In [15]:
train_df = pd.DataFrame({
    'text': train_data['Input'],
    'labels': train_data['Encoded_Output'].astype(int)
})

In [16]:
# Train BERT model
bert_model.train_model(train_df)

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/100 [00:00<?, ?it/s]

(100, 0.6185267148911953)

In [17]:
# Train RoBERTa model
roberta_model.args.overwrite_output_dir = True

In [18]:
roberta_model.train_model(train_df)

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/100 [00:00<?, ?it/s]

(100, 0.5472492251172661)

In [19]:
from simpletransformers.classification import ClassificationArgs

# Set up model arguments with custom hyperparameters
model_args = ClassificationArgs(
    num_train_epochs=3,       # Start with 3 epochs
    train_batch_size=8,       # Use a batch size of 8
    eval_batch_size=8,        # Same for evaluation
    learning_rate=3e-5,       # Learning rate
    max_seq_length=128,       # Max sequence length
    weight_decay=0.01,        # Weight decay
    warmup_steps=0,           # Optional: adjust based on total steps
    logging_steps=50,         # Log training progress every 50 steps
    save_steps=200,           # Save the model every 200 steps
)


In [20]:
model_args.overwrite_output_dir = True

In [None]:
# Train the BERT model with custom hyperparameters
bert_model = ClassificationModel('bert', 'bert-base-uncased', num_labels=2, args=model_args, use_cuda=False)
bert_model.train_model(train_df)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/100 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/100 [00:00<?, ?it/s]

In [1]:
# Train the RoBERTa model with custom hyperparameters
roberta_model = ClassificationModel('roberta', 'roberta-base', num_labels=2, args=model_args, use_cuda=False)
roberta_model.train_model(train_df)

NameError: name 'ClassificationModel' is not defined


**5. Evaluation on Validation Set**

In [None]:
# Evaluate BERT on validation data
result_bert, model_outputs_bert, wrong_predictions_bert = bert_model.eval_model(train_df)

print("BERT Evaluation Results:")
print(result_bert)



  0%|          | 0/1 [00:00<?, ?it/s]

ValueError: too many dimensions 'str'

In [None]:
# Evaluate RoBERTa on validation data
result_roberta, model_outputs_roberta, wrong_predictions_roberta = roberta_model.eval_model(train_df)

print("RoBERTa Evaluation Results:")
print(result_roberta)

NameError: name 'roberta_model' is not defined

**6. Saving the Best Model**

In [None]:
bert_model.save_model('bert_best_model')

NameError: name 'bert_model' is not defined

In [None]:
roberta_model.save_model('roberta_best_model')

**7. Prediction on Real-World Input**

Prediction Using BERT Model

In [None]:
# Real-world input text
real_world_text = ["This is a great product!", "I didn't like the service."]


# Predict the class
predictions_bert, _ = bert_model.predict(real_world_text)

print(f"BERT Predictions: {predictions_bert}")

NameError: name 'bert_model' is not defined

In [None]:
# Real-world input text
real_world_text = ["This is a great product!", "I didn't like the service."]

# Predict the class
predictions_roberta, _ = roberta_model.predict(real_world_text)

print(f"RoBERTa Predictions: {predictions_roberta}")

NameError: name 'roberta_model' is not defined