## ML

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.pipeline import make_pipeline

# Load the dataset (replace 'your_file.csv' with the actual file path)
df = pd.read_csv('data.csv')

# Split data into features (X) and target (y)
X = df['Incident Description']
y_score = df['Severity Score']  # For regression task (predicting severity score)
y_level = df['Severity Level']  # For classification task (predicting severity level)

# Split data into train and test sets
X_train, X_test, y_train_score, y_test_score, y_train_level, y_test_level = train_test_split(
    X, y_score, y_level, test_size=0.2, random_state=42)

# Initialize a TfidfVectorizer to convert text data into numeric features
vectorizer = TfidfVectorizer(max_features=1000)

# Convert text data into numeric features using TF-IDF
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# **Model 1: Regression Task (Predict Severity Score)**
# Using RandomForestRegressor for predicting the severity score
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train_vec, y_train_score)

# Predict severity scores
y_pred_score = regressor.predict(X_test_vec)

# Evaluate the model performance
mse = mean_squared_error(y_test_score, y_pred_score)
print(f'Mean Squared Error for Severity Score (Regression): {mse}')

# **Model 2: Classification Task (Predict Severity Level)**
# Encode severity levels into numerical values
label_encoder = LabelEncoder()
y_train_level_encoded = label_encoder.fit_transform(y_train_level)
y_test_level_encoded = label_encoder.transform(y_test_level)

# Using RandomForestClassifier for predicting the severity level
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train_vec, y_train_level_encoded)

# Predict severity levels
y_pred_level = classifier.predict(X_test_vec)

# Evaluate the classification performance
accuracy = accuracy_score(y_test_level_encoded, y_pred_level)
print(f'Accuracy for Severity Level (Classification): {accuracy}')


Mean Squared Error for Severity Score (Regression): 144.44219523809525
Accuracy for Severity Level (Classification): 0.7619047619047619


In [2]:
import pickle

# Save the trained models (both regressor and classifier)
with open('severity_score_model.pkl', 'wb') as f:
    pickle.dump(regressor, f)  # Save the regressor

with open('severity_level_model.pkl', 'wb') as f:
    pickle.dump(classifier, f)  # Save the classifier

# Optionally, save the vectorizer as well
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


## DL

### Severity Score

In [2]:
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('data.csv')

# Split data
X = df['Incident Description']
y_score = df['Severity Score']  # Regression task
y_level = df['Severity Level']  # Classification task

# Train-test split
X_train, X_test, y_train_score, y_test_score, y_train_level, y_test_level = train_test_split(
    X, y_score, y_level, test_size=0.2, random_state=42)

# Load a pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text data
max_length = 128
X_train_tokens = tokenizer(list(X_train), padding=True, truncation=True, max_length=max_length, return_tensors="pt")
X_test_tokens = tokenizer(list(X_test), padding=True, truncation=True, max_length=max_length, return_tensors="pt")

# Convert labels to tensors
y_train_score = torch.tensor(y_train_score.values, dtype=torch.float32)
y_test_score = torch.tensor(y_test_score.values, dtype=torch.float32)

# Convert classification labels to numeric values
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_level_encoded = torch.tensor(label_encoder.fit_transform(y_train_level), dtype=torch.long)
y_test_level_encoded = torch.tensor(label_encoder.transform(y_test_level), dtype=torch.long)

# Custom Dataset
class IncidentDataset(Dataset):
    def _init_(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def _len_(self):
        return len(self.labels)

    def _getitem_(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.labels[idx]

# Create DataLoader
train_dataset = IncidentDataset(X_train_tokens, y_train_score)
test_dataset = IncidentDataset(X_test_tokens, y_test_score)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define BERT-based Regression Model
class BERTRegressor(nn.Module):
    def _init_(self):
        super(BERTRegressor, self)._init_()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.regressor = nn.Linear(768, 1)  # 768 is the output size of BERT

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.regressor(outputs.pooler_output).squeeze()

# Instantiate model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTRegressor().to(device)
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training loop
def train(model, train_loader, criterion, optimizer, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs, labels = batch
            inputs = {key: val.to(device) for key, val in inputs.items()}
            labels = labels.to(device)
            outputs = model(**inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

# Train the model
train(model, train_loader, criterion, optimizer)

# Evaluation
model.eval()
preds = []
with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        inputs = {key: val.to(device) for key, val in inputs.items()}
        outputs = model(**inputs)
        preds.extend(outputs.cpu().numpy())

# Compute Mean Squared Error
mse = mean_squared_error(y_test_score.numpy(), preds)
print(f'Mean Squared Error (BERT Regression): {mse}')

TypeError: IncidentDataset() takes no arguments

In [None]:
torch.save(model.state_dict(), 'bert_regression_model.pth')
print("Model saved successfully!")

### Classification

In [None]:
class BERTClassifier(nn.Module):
    def _init_(self, num_classes):
        super(BERTClassifier, self)._init_()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(outputs.pooler_output)

num_classes = len(label_encoder.classes_)
classifier = BERTClassifier(num_classes).to(device)

# Use CrossEntropyLoss for classification
criterion_cls = nn.CrossEntropyLoss()
optimizer_cls = optim.AdamW(classifier.parameters(), lr=2e-5)

# Convert labels to tensor
train_dataset_cls = IncidentDataset(X_train_tokens, y_train_level_encoded)
test_dataset_cls = IncidentDataset(X_test_tokens, y_test_level_encoded)

train_loader_cls = DataLoader(train_dataset_cls, batch_size=16, shuffle=True)
test_loader_cls = DataLoader(test_dataset_cls, batch_size=16, shuffle=False)

# Training loop for classification
def train_classifier(model, train_loader, criterion, optimizer, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs, labels = batch
            inputs = {key: val.to(device) for key, val in inputs.items()}
            labels = labels.to(device)
            outputs = model(**inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

train_classifier(classifier, train_loader_cls, criterion_cls, optimizer_cls)

# Predict & Evaluate
classifier.eval()
all_preds = []
with torch.no_grad():
    for batch in test_loader_cls:
        inputs, labels = batch
        inputs = {key: val.to(device) for key, val in inputs.items()}
        outputs = classifier(**inputs)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())

accuracy = accuracy_score(y_test_level_encoded.numpy(), all_preds)
print(f'Accuracy (BERT Classification): {accuracy}')

In [None]:
torch.save(classifier.state_dict(), 'bert_classification_model.pth')
print("Model saved successfully!")