**preprocessing started**

In [None]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
pip install openpyxl



In [None]:
def replace_non_numeric_with_zero(value):
    try:
        numeric_value = float(value)
        return numeric_value
    except (ValueError, TypeError):
        return 0

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove special characters, numbers, and extra spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

# Read data from an Excel file
data = pd.read_excel("Quiz_responces_latest.xlsx")

# Preprocess the responses and associate marks
preprocessed_data = []

for i in range(0, len(data.columns), 2):
    response_column = data.columns[i]
    mark_column = data.columns[i + 1]

    for index, row in data.iterrows():
        response = row[response_column]
        mark = row[mark_column]

        if isinstance(response, str):  # Check if the response is a string
            preprocessed_response = preprocess_text(response)
            preprocessed_data.append((preprocessed_response, mark))

# Create a new DataFrame with preprocessed data
preprocessed_df = pd.DataFrame(preprocessed_data, columns=['processedResponse', 'Marks'])

preprocessed_df['Marks']=preprocessed_df['Marks'].apply(replace_non_numeric_with_zero)

# Save the preprocessed data to a new Excel file
preprocessed_df.to_excel("preprocessed_data.xlsx", index=False)

In [None]:
data1 = pd.read_excel("preprocessed_data.xlsx")

In [None]:
data1.isna().sum()

processedResponse    13
Marks                 0
dtype: int64

**Preprocessing done now data splitting started**

In [None]:
pip install pandas scikit-learn




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read your preprocessed data from the Excel file
preprocessed_df = pd.read_excel("preprocessed_data.xlsx")

print(preprocessed_df.shape)
# Split the data into features (X) and target variable (y)
X = preprocessed_df['processedResponse']  # Features (input data)
y = preprocessed_df['Marks']  # Target variable (labels)

# Split the data into a training set (80%) and a test set (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=100)

# Now you have your training and test sets in X_train, X_test, y_train, and y_test.
# You can use these sets for machine learning model training and testing.


(2672, 2)


In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2137,)
(535,)
(2137,)
(535,)


In [None]:
# Create a DataFrame from the lists
data = {'Ans': X_train, 'Marks':y_train}
df_train = pd.DataFrame(data)

In [None]:
data = {'Ans': X_test, 'Marks':y_test}
df_test = pd.DataFrame(data)

In [None]:
df_train.to_excel('training_data.xlsx', index=False)
df_test.to_excel('testing_data.xlsx', index=False)

In [None]:
df_train

Unnamed: 0,Ans,Marks
42,algorithm analysis necessary understand workin...,2.5
2447,two different graph root node different sub no...,2.5
1053,insertion selection sort different working pri...,2.0
1658,recursive function one make formula one case r...,1.0
474,data item connected linear manner example array,2.5
...,...,...
350,,0.0
1930,feature adt abstract data structure,1.0
79,algorithm analalysis helpfull case want greedy...,2.5
1859,prefix abcdnpostfix abcd,2.5


In [None]:
df_test

Unnamed: 0,Ans,Marks
1010,linear searching searching using one iterator ...,2.0
302,omega theta big,2.5
1421,avl tree binary tree height subtrees balanced ...,1.0
1426,avl tree binary search tree balance node either,1.0
2606,graph contain cycle,2.5
...,...,...
1725,interpolation search modified binary search in...,2.5
57,need algorithm analysis choose optimal method ...,2.5
2560,non deterministic polynomial complexity equati...,2.5
1191,quick sort algorithm take element array called...,2.5


**Model Training Started**

In [None]:
pip install transformers



In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import mean_absolute_error

In [None]:
print(torch.cuda.is_available())

True


In [None]:
# Load your preprocessed dataset from an Excel file
df_for_training = pd.read_excel("training_data.xlsx")

# Extract answers and corresponding labels

answers = df_for_training['Ans'].astype(str).tolist()
labels = df_for_training['Marks'].astype(float).tolist()


In [None]:
type(answers)

list

In [None]:
type(labels)

list

In [None]:
# Load a pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)  # Use 1 label for regression
model.to('cuda')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Tokenize answers and convert to tensors
inputs = tokenizer(answers, padding=True, return_tensors='pt', max_length=128, truncation=True, return_attention_mask=True)

input_ids = inputs['input_ids']
attention_masks = inputs['attention_mask']
labels = torch.tensor(labels, dtype=torch.float32)  # Ensure labels are in the correct data type

In [None]:
# Create DataLoader objects
batch_size = 8
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=batch_size,pin_memory=True)

In [None]:
# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader))




In [None]:
# Training loop
num_epochs = 8

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to('cuda'), attention_mask.to('cuda'), labels.to('cuda')
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()
        scheduler.step()

    average_loss = total_loss / len(dataloader)

    print(f"Epoch {epoch+1} - Training Loss: {average_loss}")

Epoch 1 - Training Loss: 0.5166730555191413
Epoch 2 - Training Loss: 0.3531507983136533
Epoch 3 - Training Loss: 0.35078120116255623
Epoch 4 - Training Loss: 0.3524112616415455
Epoch 5 - Training Loss: 0.34677802672637487
Epoch 6 - Training Loss: 0.3560106208923259
Epoch 7 - Training Loss: 0.3456609064240509
Epoch 8 - Training Loss: 0.35189166865937077


In [None]:
model.to('cpu')

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

**Model Testing**  

In [None]:
test_df = pd.read_excel('testing_data.xlsx')

In [None]:

test_df['Marks']

0      2.0
1      2.5
2      1.0
3      1.0
4      2.5
      ... 
530    2.5
531    2.5
532    2.5
533    2.5
534    1.0
Name: Marks, Length: 535, dtype: float64

In [None]:
test_answers = test_df['Ans'].astype(str).tolist()

In [None]:
test_inputs = tokenizer(test_answers, padding=True, return_tensors='pt', max_length=128, truncation=True, return_attention_mask=True)

test_input_ids = test_inputs['input_ids']
test_attention_masks = test_inputs['attention_mask']

In [None]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
batch_size = 8
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
model.eval()
predicted_scores = []

for batch in test_dataloader:
    input_ids, attention_mask = batch
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    predicted_scores.extend(outputs.logits)

# Extract predicted scores
predicted_scores = [score.item() for score in predicted_scores]


In [None]:
diff = []
for value1, value2 in zip(predicted_scores, test_df['Marks']):
  diff.append(abs(value1-value2))

In [None]:
diff_dict = {'Predicted':predicted_scores,'Actual':test_df['Marks'],'Diff':diff}

In [None]:
df_result = pd.DataFrame(diff_dict)

In [None]:
df_result

Unnamed: 0,Predicted,Actual,Diff
0,2.173944,2.0,0.173944
1,1.757790,2.5,0.742210
2,0.923333,1.0,0.076667
3,0.744498,1.0,0.255502
4,2.018064,2.5,0.481936
...,...,...,...
530,2.189530,2.5,0.310470
531,2.319713,2.5,0.180287
532,2.030257,2.5,0.469743
533,2.210973,2.5,0.289027


**sorting the data frame according to diff value so as to see which class is giving bad results**

In [None]:
diff_list = [{'Predicted': p, 'Actual': a, 'Diff': d} for p, a, d in zip(predicted_scores, test_df['Marks'], diff)]


In [None]:
sorted_diff_list = sorted(diff_list, key=lambda x: x['Diff'],reverse=True)

In [None]:
sorted_diff_dict = {
    'Predicted': [item['Predicted'] for item in sorted_diff_list],
    'Actual': [item['Actual'] for item in sorted_diff_list],
    'Diff': [item['Diff'] for item in sorted_diff_list]
}


In [None]:
diff_in_sorted = pd.DataFrame(sorted_diff_dict)

In [None]:
diff_in_sorted

Unnamed: 0,Predicted,Actual,Diff
0,2.392380,0.0,2.392380
1,2.323740,0.0,2.323740
2,2.087089,0.0,2.087089
3,2.060614,0.0,2.060614
4,1.952469,0.0,1.952469
...,...,...,...
530,2.495388,2.5,0.004612
531,2.496935,2.5,0.003065
532,2.497411,2.5,0.002589
533,2.497454,2.5,0.002546


**evaluationg model performance using all metrics**

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [None]:
actual_marks = test_df['Marks'].astype(float).tolist()

In [None]:
# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(actual_marks, predicted_scores)

In [None]:
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(actual_marks, predicted_scores)

In [None]:
# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

In [None]:
# Calculate R-squared (R2) score
r2 = r2_score(actual_marks, predicted_scores)

In [None]:
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2) Score: {r2}")

Mean Absolute Error (MAE): 0.41111087136179486
Mean Squared Error (MSE): 0.34387721497576773
Root Mean Squared Error (RMSE): 0.5864104492382172
R-squared (R2) Score: 0.32030291034093783


In [None]:
# Save the trained model
# model.save_pretrained("short_answer_grading_model")

#User Testing

In [None]:
test_answer2="lorem ipsum sodor item not godd , my friend wife is beautifful and she is bootiful, with a booty"

In [None]:
test_inputs = tokenizer(test_answer2, padding=True, return_tensors='pt', max_length=128, truncation=True, return_attention_mask=True)

test_input_ids = test_inputs['input_ids']
test_attention_masks = test_inputs['attention_mask']

In [None]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
batch_size = 8
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
model.eval()
predicted_scores = []

for batch in test_dataloader:
    input_ids, attention_mask = batch
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    predicted_scores.extend(outputs.logits)

# Extract predicted scores
predicted_scores = [score.item() for score in predicted_scores]

In [None]:
predicted_scores

[1.5804128646850586]