In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import hamming_loss, f1_score, precision_score
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Set random seed for reproducibility.
# Both NumPy's and PyTorch's global generators are seeded because the script
# uses scikit-learn (NumPy RNG) as well as torch models and data loaders.
np.random.seed(42)
torch.manual_seed(42)

# Load the dataset from CSV (path is resolved against the working directory).
df = pd.read_csv('defect_prediction_dataset.csv')

# Display basic information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print('Dataset Info:')
df.info()
print('\nFirst few rows:')
print(df.head())

# Check for missing values (per-column count of null cells).
print('\nMissing Values:')
print(df.isnull().sum())

# Define feature and label columns.
# `text_column` is the free-text input; `label_columns` are the columns used
# together as the multi-label target.
text_column = 'report'
label_columns = ['type_blocker', 'type_regression', 'type_bug', 'type_documentation',
                 'type_enhancement', 'type_task', 'type_dependency_upgrade']

# 1. Preprocessing
# Labels
y = df[label_columns].values

# TfidfVectorizer rejects NaN documents, so missing reports are treated as
# empty strings instead of aborting the whole run.
texts = df[text_column].fillna('').astype(str)

# Analyze label distribution
label_counts = df[label_columns].sum()
print('\nLabel Distribution:')
print(label_counts)
imbalance_ratios = label_counts / len(df)
print('\nImbalance Ratios:')
print(imbalance_ratios)

# Split the data (70% train / 15% validation / 15% test).
# Row indices are split *before* any transformer is fitted so that the TF-IDF
# vocabulary/IDF weights and the scaler statistics are learned from training
# rows only; fitting them on the full dataset would leak validation and test
# information into training.
row_ids = np.arange(len(df))
train_idx, temp_idx = train_test_split(row_ids, test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

# Feature extraction using TF-IDF (fitted on the training reports only, then
# applied to every row).
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(texts.iloc[train_idx])
X = vectorizer.transform(texts).toarray()

# Scaling the features (statistics taken from the training rows only).
scaler = StandardScaler()
scaler.fit(X[train_idx])
X = scaler.transform(X)

X_train, X_temp = X[train_idx], X[temp_idx]
y_train, y_temp = y[train_idx], y[temp_idx]
X_val, X_test = X[val_idx], X[test_idx]
y_val, y_test = y[val_idx], y[test_idx]
print(f'\nTrain set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}')

# 2. Model Building
# Two linear baselines, each wrapped one-vs-rest so that every label gets its
# own binary classifier. Both are tuned over the same regularisation strengths
# with 3-fold cross-validation, scored by micro-averaged F1.
_C_CANDIDATES = (0.01, 0.1, 1, 10)

# Logistic Regression
log_reg = OneVsRestClassifier(LogisticRegression(max_iter=1000))
param_grid_lr = {'estimator__C': list(_C_CANDIDATES)}
grid_lr = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid_lr,
    scoring='f1_micro',
    cv=3,
)
grid_lr.fit(X_train, y_train)
best_lr = grid_lr.best_estimator_  # refitted on all of X_train with the best C
print(f'Best Logistic Regression Params: {grid_lr.best_params_}')

# SVM
svm = OneVsRestClassifier(LinearSVC(max_iter=10000))
param_grid_svm = {'estimator__C': list(_C_CANDIDATES)}
grid_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_grid_svm,
    scoring='f1_micro',
    cv=3,
)
grid_svm.fit(X_train, y_train)
best_svm = grid_svm.best_estimator_  # refitted on all of X_train with the best C
print(f'Best SVM Params: {grid_svm.best_params_}')

# Perceptron with online learning
class OnlinePerceptron(nn.Module):
    """Single linear layer producing one raw score (logit) per label.

    No activation is applied in ``forward``; callers that need probabilities
    must apply a sigmoid themselves.
    """

    def __init__(self, input_dim, num_labels):
        """Create the layer mapping ``input_dim`` features to ``num_labels`` scores."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=num_labels)

    def forward(self, x):
        """Return the affine projection of ``x``."""
        logits = self.linear(x)
        return logits

input_dim = X_train.shape[1]
num_labels = y_train.shape[1]
perceptron = OnlinePerceptron(input_dim, num_labels)
# The perceptron emits raw logits, so the loss applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(perceptron.parameters(), lr=0.01)

# Online training for Perceptron: one SGD step per training sample, visiting
# the samples in their stored order on every epoch.
# The arrays are converted to tensors once, up front, rather than once per
# sample, and dedicated names are used so the module-level label matrix `y`
# is not overwritten by a single-sample tensor.
X_train_online = torch.as_tensor(X_train, dtype=torch.float32)
y_train_online = torch.as_tensor(y_train, dtype=torch.float32)

perceptron.train()
for epoch in range(5):
    epoch_loss = 0.0
    for sample_x, sample_y in zip(X_train_online, y_train_online):
        optimizer.zero_grad()
        # unsqueeze(0) adds the batch dimension expected by the model/loss.
        outputs = perceptron(sample_x.unsqueeze(0))
        loss = criterion(outputs, sample_y.unsqueeze(0))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Report the mean loss over the epoch; the loss of the last sample alone
    # says little about how training is progressing.
    mean_loss = epoch_loss / max(len(X_train_online), 1)
    print(f'Perceptron Epoch {epoch+1}, Loss: {mean_loss}')

# Deep Neural Network (DNN)
class DNN(nn.Module):
    """Feed-forward network with two ReLU hidden layers and a sigmoid output.

    ``forward`` returns values in [0, 1] (one per label), i.e. the sigmoid is
    already applied inside the model.
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, num_labels):
        """Build the three linear layers and the two activation modules."""
        super().__init__()
        # Layer widths: input_dim -> hidden_dim1 -> hidden_dim2 -> num_labels
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim1)
        self.fc2 = nn.Linear(in_features=hidden_dim1, out_features=hidden_dim2)
        self.fc3 = nn.Linear(in_features=hidden_dim2, out_features=num_labels)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through both hidden layers and return per-label sigmoid outputs."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        scores = self.fc3(hidden)
        return self.sigmoid(scores)

hidden_dim1, hidden_dim2 = 128, 64
dnn = DNN(input_dim, hidden_dim1, hidden_dim2, num_labels)
# The network already ends in a sigmoid, hence plain BCELoss on its outputs.
criterion_dnn = nn.BCELoss()
optimizer_dnn = optim.Adam(dnn.parameters(), lr=0.001)

# DataLoader for DNN: features and labels are both stored as float32 tensors.
train_dataset = TensorDataset(
    *(torch.tensor(array, dtype=torch.float32) for array in (X_train, y_train))
)
val_dataset = TensorDataset(
    *(torch.tensor(array, dtype=torch.float32) for array in (X_val, y_val))
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Training DNN: ten epochs of mini-batch updates, each followed by a pass over
# the validation set with gradients disabled.
for epoch in range(10):
    dnn.train()
    for features, targets in train_loader:
        optimizer_dnn.zero_grad()
        loss = criterion_dnn(dnn(features), targets)
        loss.backward()
        optimizer_dnn.step()

    dnn.eval()
    with torch.no_grad():
        # Sum of the per-batch mean losses; divided by the batch count below.
        val_loss = sum(
            criterion_dnn(dnn(features), targets).item()
            for features, targets in val_loader
        )
    print(f'DNN Epoch {epoch+1}, Validation Loss: {val_loss / len(val_loader)}')

# 4. Evaluation
def evaluate_model(model, X_test, y_test, is_torch_model=False, from_logits=None):
    """Score a fitted multi-label model on a held-out set.

    Args:
        model: Either an estimator exposing ``predict`` or, when
            ``is_torch_model`` is true, a torch module called on a float32
            tensor built from ``X_test``.
        X_test: Feature matrix to predict on.
        y_test: Ground-truth label indicator matrix.
        is_torch_model: Select the torch inference path instead of
            ``model.predict``.
        from_logits: Only used for torch models. ``True`` means the model
            returns raw logits, ``False`` means it returns probabilities.
            With the default ``None`` the outputs are treated as logits if any
            value lies outside [0, 1].

    Returns:
        dict with the keys ``'Hamming Loss'``, ``'Micro-F1'``, ``'Macro-F1'``
        and ``'Precision@k'``. The last entry is the sample-averaged precision
        computed by ``precision_score(average='samples')``.
    """
    if is_torch_model:
        model.eval()
        with torch.no_grad():
            outputs = model(torch.as_tensor(X_test, dtype=torch.float32))
        if from_logits is None:
            # Probabilities can never leave [0, 1]; anything outside that
            # range must be an un-squashed logit.
            from_logits = bool(((outputs < 0) | (outputs > 1)).any())
        if from_logits:
            # Thresholding raw logits at 0.5 would correspond to a probability
            # cut-off of about 0.62, so map them to probabilities first.
            outputs = torch.sigmoid(outputs)
        y_pred = (outputs > 0.5).numpy().astype(int)
    else:
        y_pred = model.predict(X_test)

    # zero_division=0 keeps the value sklearn would report anyway for labels or
    # samples without positive predictions, but without the warning.
    hamming = hamming_loss(y_test, y_pred)
    micro_f1 = f1_score(y_test, y_pred, average='micro', zero_division=0)
    macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    precision_at_k = precision_score(y_test, y_pred, average='samples', zero_division=0)

    return {
        'Hamming Loss': hamming,
        'Micro-F1': micro_f1,
        'Macro-F1': macro_f1,
        'Precision@k': precision_at_k
    }

# Evaluate all models on the held-out test split. The two torch models go
# through the tensor inference path of evaluate_model.
results = {
    'Logistic Regression': evaluate_model(best_lr, X_test, y_test),
    'SVM': evaluate_model(best_svm, X_test, y_test),
    'Perceptron': evaluate_model(perceptron, X_test, y_test, is_torch_model=True),
    'DNN': evaluate_model(dnn, X_test, y_test, is_torch_model=True),
}

# Display results: transpose so that each model is a row and each metric a column.
results_df = pd.DataFrame(results).transpose()
print('\nModel Performance:')
print(results_df)

# Plot results as grouped bars (one group per model, one bar per metric).
ax = results_df.plot(kind='bar', figsize=(10, 6))
ax.set_title('Model Comparison')
ax.set_ylabel('Score')
plt.xticks(rotation=0)
plt.show()