In [1]:
import pandas as pd
import numpy as np
import random 
from sklearn.model_selection import train_test_split

In [2]:
# Read the dataset from CSV; the path is resolved relative to the notebook's
# working directory.
data = pd.read_csv(filepath_or_buffer="../../datasetProcessing/datasets/final.csv")

In [3]:
# Distinct values of the 'label' column, in order of first appearance.
unique_classes = data['label'].unique()

# Pick two different classes with a dedicated, seeded generator so that
# "Restart & Run All" always selects the same pair. A private Random instance
# is used so the global `random` state is left untouched.
random_classes = random.Random(42).sample(list(unique_classes), 2)

# Keep only the rows of the two chosen classes. The explicit .copy() makes the
# subset an independent frame, so relabeling below never writes into (or warns
# about writing into) a slice of the original frame.
extracted_classes = data[data['label'].isin(random_classes)].copy()

# Recode the problem as binary: first sampled class -> 1, second -> 0.
label_mapping = {random_classes[0]: 1, random_classes[1]: 0}
# Plain column assignment replaces the column (and its dtype) instead of
# trying to write the new values into the old column in place.
extracted_classes['label'] = extracted_classes['label'].map(label_mapping)

# NOTE(review): rebinding `data` makes this cell non-idempotent -- running it a
# second time without reloading the CSV would sample from the already-recoded
# labels {0, 1}. The name is kept because the following cells read `data`.
data = extracted_classes
print(data["label"].value_counts())

label
1    9634
0    9634
Name: count, dtype: int64


In [4]:
# Select the target by name, consistent with how the relabeling cell refers to
# it, rather than by position: this stays correct even if 'label' is not the
# last column, and guarantees the target never leaks into the features.
X = data.drop(columns="label")
y = data["label"]

# Hold out 10% of the rows for testing (fixed seed for reproducibility) and
# convert each of the four pieces to a NumPy array.
X_tr, X_te, y_tr, y_te = map(np.array, train_test_split(X, y, test_size=0.1, random_state=42))
print(X_tr.shape, y_tr.shape)
print(X_te.shape, y_te.shape)

(17341, 116) (17341,)
(1927, 116) (1927,)


In [5]:
import numpy as np

class LogisticRegression:
    """Binary logistic regression fitted with full-batch gradient descent.

    The intercept is handled by prepending a column of ones to the feature
    matrix, so the learned ``weights`` attribute has shape
    ``(n_features + 1, 1)`` with the bias in row 0. Labels are expected to be
    encoded as 0/1.
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logarithms so
    # that saturated predictions cannot produce log(0) = -inf (or 0 * -inf = nan).
    _EPS = 1e-15

    def sigmoid(self, z):
        """Return the logistic function 1 / (1 + exp(-z)) element-wise.

        The value is computed from exp(-|z|), which can never overflow, so
        arbitrarily large positive or negative inputs are safe.

        Args:
            z: Scalar or array-like of real values.

        Returns:
            NumPy array with the same shape as ``z`` and values in [0, 1].
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))  # always in (0, 1]
        # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z)  (algebraically equal)
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def initialize(self, X, random_state=None):
        """Create the initial weights and the bias-augmented design matrix.

        Args:
            X: Array-like of shape (n_samples, n_features).
            random_state: Optional seed for reproducible initial weights. When
                ``None`` the global NumPy random state is used.

        Returns:
            Tuple ``(weights, X_aug)`` where ``weights`` has shape
            (n_features + 1, 1) with values drawn uniformly from [0, 1) and
            ``X_aug`` is ``X`` with a leading column of ones.
        """
        X = np.asarray(X, dtype=float)
        shape = (X.shape[1] + 1, 1)
        if random_state is None:
            weights = np.random.rand(*shape)
        else:
            weights = np.random.default_rng(random_state).random(shape)
        X = np.c_[np.ones((X.shape[0], 1)), X]  # column of ones for the bias term
        return weights, X

    def fit(self, X, y, alpha=0.001, iterations=400, random_state=None):
        """Fit the model with batch gradient descent.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of n_samples binary labels (0/1).
            alpha: Learning rate. It multiplies the gradient summed (not
                averaged) over all samples.
            iterations: Number of gradient-descent steps.
            random_state: Optional seed for the initial weights.

        Returns:
            1-D array of length ``iterations`` holding the mean cross-entropy
            cost measured after each update.

        Raises:
            ValueError: If ``X`` has no rows or ``X`` and ``y`` disagree on the
                number of samples.
        """
        weights, X = self.initialize(X, random_state=random_state)
        y = np.asarray(y, dtype=float).reshape(-1, 1)
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty dataset.")
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.shape[0]} labels."
            )

        cost_list = np.zeros(iterations)
        probs = self.sigmoid(X @ weights)
        for i in range(iterations):
            # Gradient of the summed cross-entropy with respect to the weights.
            gradients = X.T @ (probs - y)
            weights -= alpha * gradients
            # The post-update probabilities are reused for the next gradient.
            probs = self.sigmoid(X @ weights)
            safe = np.clip(probs, self._EPS, 1.0 - self._EPS)
            cost_list[i] = -np.mean(
                y * np.log(safe) + (1.0 - y) * np.log(1.0 - safe)
            )

        self.weights = weights
        return cost_list

    def predict(self, X):
        """Predict class labels using a 0.5 probability threshold.

        Args:
            X: Array-like of shape (n_samples, n_features).

        Returns:
            List of ints (1 where the predicted probability is strictly
            greater than 0.5, otherwise 0), one per row of ``X``.

        Raises:
            AttributeError: If called before ``fit``.
        """
        if not hasattr(self, "weights"):
            raise AttributeError("Model is not fitted yet; call fit() first.")
        X = np.asarray(X, dtype=float)
        X = np.c_[np.ones((X.shape[0], 1)), X]  # same bias column as in training
        probs = self.sigmoid(X @ self.weights).ravel()
        return (probs > 0.5).astype(int).tolist()

    def accuracy(self, y_true, y_pred):
        """Return the fraction of predictions equal to the true labels.

        Args:
            y_true: Array-like of true labels.
            y_pred: Array-like of predicted labels, same length as ``y_true``.

        Returns:
            Accuracy as a float in [0, 1].

        Raises:
            ValueError: If the inputs are empty or differ in length.
        """
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred).ravel()
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"Length mismatch: {y_true.size} true labels vs "
                f"{y_pred.size} predictions."
            )
        if y_true.size == 0:
            raise ValueError("Accuracy is undefined for empty inputs.")
        return float(np.mean(y_true == y_pred))

    def error(self, y_true, y_pred):
        """Return the misclassification rate, i.e. ``1 - accuracy``."""
        return 1 - self.accuracy(y_true, y_pred)


In [6]:
# Train the gradient-descent classifier on the training split.
# `fit` returns the per-iteration cost values, so `model` holds that cost
# history; the trained estimator itself is `lr`.
lr = LogisticRegression()
model = lr.fit(X=X_tr, y=y_tr)

In [7]:
# Score the trained classifier on the held-out test split.
y_pred = lr.predict(X_te)

# Fraction of correct predictions and its complement.
acc, err = lr.accuracy(y_te, y_pred), lr.error(y_te, y_pred)

print("Accuracy:", acc)
print("Error:", err)

Accuracy: 1.0
Error: 0.0


In [8]:
import numpy as np
# NOTE(review): this import rebinds the name `LogisticRegression`, shadowing
# the hand-written class defined earlier in the notebook. Re-running an earlier
# cell after this one would therefore build the scikit-learn estimator instead.
from sklearn.linear_model import LogisticRegression

# Reference model: scikit-learn's logistic regression with default settings,
# trained on the same training split. `lr` now refers to this estimator.
lr = LogisticRegression()
lr.fit(X=X_tr, y=y_tr)

In [9]:
# Predict the test split with the scikit-learn model and report the value
# returned by its score() method, rounded to two decimals.
y_pred = lr.predict(X_te)
test_accuracy = lr.score(X_te, y_te)
print('Accuracy on test set: {:.2f}'.format(test_accuracy))

Accuracy on test set: 1.00


In [10]:
# Predicciones de probabilidad en el conjunto de prueba
prob_predictions = lr.predict_proba(X_te)[:, 0]  # Probabilidades para la clase 0

# Cálculo del intervalo de confianza
estimation = np.mean(prob_predictions)
z_critical = 1.96  # Para un intervalo del 95%
n = len(prob_predictions)
std_dev = np.std(prob_predictions)
confidence_interval = z_critical * (std_dev / np.sqrt(n))

# Cálculo del error
error = confidence_interval / 2

print(f"Intervalo de confianza del 95% para la probabilidad: [{estimation - confidence_interval}, {estimation + confidence_interval}]")
print(f"Error asociado: {error}")

Intervalo de confianza del 95% para la probabilidad: [0.48315492231038504, 0.527692704852506]
Error asociado: 0.011134445635530222


In [11]:
# Predicciones de probabilidad en el conjunto de prueba
prob_predictions = lr.predict_proba(X_te)[:, 1]  # Probabilidades para la clase 1

# Cálculo del intervalo de confianza
estimation = np.mean(prob_predictions)
z_critical = 1.96  # Para un intervalo del 95%
n = len(prob_predictions)
std_dev = np.std(prob_predictions)
confidence_interval = z_critical * (std_dev / np.sqrt(n))

# Cálculo del error
error = confidence_interval / 2

print(f"Intervalo de confianza del 95% para la probabilidad: [{estimation - confidence_interval}, {estimation + confidence_interval}]")
print(f"Error asociado: {error}")

Intervalo de confianza del 95% para la probabilidad: [0.472307295147494, 0.5168450776896149]
Error asociado: 0.011134445635530222
