<a href="https://colab.research.google.com/github/24prady02/Prady-repository/blob/main/Student_Error_Cause.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, GPT2LMHeadModel, GPT2Tokenizer
from sklearn.model_selection import train_test_split
import pandas as pd



In [None]:
from google.colab import files

In [None]:
# Fetch the NLTK resources that preprocess_text relies on: the tokenizer models
# behind word_tokenize, the stop-word lists read by stopwords.words('english'),
# and the WordNet data used by WordNetLemmatizer.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the NLTK version installed in the runtime.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Ask for the training CSV through Colab's upload widget. `uploaded` is a dict
# keyed by the stored file name.
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was dismissed): fail with a clear
    # message instead of the bare StopIteration that next() would raise.
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")
# Only the first uploaded file is used by the rest of the notebook.
filename = next(iter(uploaded))


Saving generated_dataset.csv to generated_dataset (1).csv


In [None]:
def preprocess_text(text):
    """Normalise one free-text answer into a space-separated string of lemmas.

    The text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, tokenised with NLTK, filtered against the
    English stop-word list, lemmatised with WordNet and re-joined with single
    spaces.

    Args:
        text: The raw answer. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        str: The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # Guard clause: non-string input short-circuits to an empty result.
    if not isinstance(text, str):
        return ''

    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    words = word_tokenize(without_punctuation)
    english_stop_words = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()
    # Filter and lemmatise in a single pass over the tokens.
    lemmas = [wordnet.lemmatize(word) for word in words if word not in english_stop_words]
    return ' '.join(lemmas)

In [None]:
# Load the uploaded dataset as a header-less, four-column CSV. The student
# identifier is not used afterwards, so it is dropped straight away (chained
# drop instead of an in-place mutation).
# NOTE(review): header=None treats the first row as data — confirm the CSV
# really has no header row.
error_data = pd.read_csv(
    filename,
    header=None,
    names=['Student ID', 'Answer', 'Error Type', 'Reason for the Error Type'],
).drop(columns='Student ID')

# Plain Python lists consumed by the preprocessing and training cells.
incorrect_answers = error_data['Answer'].tolist()
error_categories = error_data['Error Type'].tolist()
error_reasons = error_data['Reason for the Error Type'].tolist()

# Preview the frame. It has to be the cell's last expression to be rendered;
# in the middle of the cell it was silently discarded.
error_data.head()

In [None]:
# Clean every answer with preprocess_text, preserving the original order.
preprocessed_texts = list(map(preprocess_text, incorrect_answers))

In [None]:
def train_error_category_model(preprocessed_texts, error_categories):
    """Fine-tune ``bert-base-uncased`` to classify answers into error categories.

    The category values are integer-encoded with ``LabelEncoder``, the data is
    split 80/20 (``random_state=42``), the model is trained for three epochs
    with AdamW (lr=1e-5, batch size 16, shuffled) and then scored on the
    held-out split.

    Args:
        preprocessed_texts: List of cleaned answer strings.
        error_categories: List of category labels aligned with
            ``preprocessed_texts``.

    Returns:
        tuple: ``(model, accuracy)`` where ``model`` is the fine-tuned
        classifier, left in eval mode, and ``accuracy`` is the fraction of
        validation examples predicted correctly. The class order chosen by the
        encoder is stored on ``model.config.id2label`` / ``label2id`` so that
        predicted indices can be mapped back to category names.
    """
    # Use the GPU when one is available, otherwise fall back to the CPU
    # instead of crashing on unconditional .cuda() calls.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Use LabelEncoder to convert error categories to numerical labels
    label_encoder = LabelEncoder()
    labels_encoded = label_encoder.fit_transform(error_categories)
    num_classes = len(label_encoder.classes_)

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        preprocessed_texts, labels_encoded, test_size=0.2, random_state=42)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def _build_dataset(texts, labels):
        # Tokenise one split and wrap it as (input_ids, attention_mask, label).
        # Tensors stay on the CPU; each batch is moved to `device` when used,
        # so the whole dataset never has to fit in GPU memory.
        encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        return torch.utils.data.TensorDataset(
            torch.tensor(encodings['input_ids']),
            torch.tensor(encodings['attention_mask']),
            torch.tensor(labels, dtype=torch.long),
        )

    train_loader = torch.utils.data.DataLoader(
        _build_dataset(train_texts, train_labels), batch_size=16, shuffle=True)
    val_loader = torch.utils.data.DataLoader(
        _build_dataset(val_texts, val_labels), batch_size=16, shuffle=False)

    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=num_classes).to(device)

    # Keep the encoder's class order with the model; otherwise callers would
    # have to guess which index corresponds to which category.
    model.config.id2label = {
        index: str(name) for index, name in enumerate(label_encoder.classes_)}
    model.config.label2id = {
        name: index for index, name in model.config.id2label.items()}

    # NOTE(review): transformers' AdamW is deprecated upstream in favour of
    # torch.optim.AdamW; it is kept here so the optimizer behaviour is unchanged.
    optimizer = AdamW(model.parameters(), lr=1e-5)

    model.train()
    for epoch in range(3):
        for input_ids, attention_mask, labels in train_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            # Passing labels makes the model compute the classification loss itself.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            outputs.loss.backward()
            optimizer.step()

    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predicted = outputs.logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total
    return model, accuracy

In [None]:
# Train the classifier on the cleaned answers and report its validation accuracy.
training_result = train_error_category_model(preprocessed_texts, error_categories)
error_category_model, error_category_accuracy = training_result
print("Error Category Model Accuracy:", error_category_accuracy)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Error Category Model Accuracy: 0.3452728029987505


In [None]:
def extract_emotion_cause_pairs(new_texts, causes, model, tokenizer,
                                error_labels=None, batch_size=32):
    """Predict an error label and a cause for every text in ``new_texts``.

    Each text is tokenised, classified by ``model``, and the arg-max class
    index is used to look up both a label in ``error_labels`` and an entry in
    ``causes``.

    Args:
        new_texts: List of input strings.
        causes: Sequence indexed by predicted class index.
            NOTE(review): the lookup is ``causes[class_index]``, so this
            presumably needs one cause per class rather than one per dataset
            row — confirm against the caller.
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        error_labels: Optional class names indexed by class index. Defaults to
            ``['Lack of Understanding', 'Misconception', 'Lack of Clarity']``.
            NOTE(review): ``LabelEncoder`` assigns indices in sorted order, so
            verify that this order matches the one used during training.
        batch_size: Number of texts per forward pass; bounds peak memory.

    Returns:
        tuple: ``(predicted_errors, predicted_causes)``, two lists aligned
        with ``new_texts``. Both are empty when ``new_texts`` is empty.
    """
    if error_labels is None:
        error_labels = ['Lack of Understanding', 'Misconception', 'Lack of Clarity']

    if len(new_texts) == 0:
        return [], []

    # Send inputs to wherever the model's weights live; guessing from CUDA
    # availability breaks when a CPU model is used on a GPU machine.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    predicted_indices = []
    model.eval()
    with torch.no_grad():
        # Classify in chunks so a large input list does not have to fit in a
        # single forward pass.
        for start in range(0, len(new_texts), batch_size):
            batch_texts = new_texts[start:start + batch_size]
            encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=512)
            input_ids = torch.tensor(encodings['input_ids']).to(device)
            attention_mask = torch.tensor(encodings['attention_mask']).to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predicted_indices.extend(outputs.logits.argmax(dim=1).tolist())

    predicted_errors = [error_labels[index] for index in predicted_indices]
    predicted_causes = [causes[index] for index in predicted_indices]
    return predicted_errors, predicted_causes

In [None]:
def generate_response(emotion, cause, model, tokenizer):
    """Generate a text continuation for an emotion/cause prompt.

    Builds the prompt ``"Emotion: ...\\nCause: ...\\nResponse:"``, encodes it,
    calls ``model.generate`` with ``max_length=100`` and decodes the first
    returned sequence with special tokens stripped.

    Args:
        emotion: Value interpolated after ``Emotion:`` in the prompt.
        cause: Value interpolated after ``Cause:`` in the prompt.
        model: Generative model exposing ``generate``.
        tokenizer: Tokenizer matching ``model`` (``encode`` / ``decode``).

    Returns:
        str: The decoded first sequence returned by ``model.generate``.
        NOTE(review): for GPT-2-style models that sequence presumably starts
        with the prompt itself, and ``max_length`` counts the prompt tokens
        too — confirm whether callers expect only the continuation.
    """
    input_text = f"Emotion: {emotion}\nCause: {cause}\nResponse:"

    # Put the prompt on the device that actually holds the model's weights;
    # guessing from CUDA availability fails for a CPU model on a GPU machine.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    output = model.generate(input_ids, max_length=100, num_return_sequences=1)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Load GPT-2 and its tokenizer for response generation.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Register a padding token, then grow the model's embedding matrix to match.
# Without the resize the new token id lies outside the embedding table, and
# any padded batch fed to the model would fail with an index error.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Upload the CSV used for inference; only the first uploaded file is read.
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was dismissed): fail with a clear
    # message instead of the bare StopIteration that next() would raise.
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")
filename = next(iter(uploaded))
data = pd.read_csv(filename, header=None)

# Columns are addressed by position because the file is read without a header.
# NOTE(review): if this file has the same layout as the training CSV
# (Student ID, Answer, Error Type, Reason), column 1 holds the answers and
# column 2 the error types, so `error` and `texts` look swapped — confirm.
error = data[1].tolist()
causes = data[3].tolist()
texts = data[2].tolist()