### Imports

In [7]:
import numpy as np
import pandas as pd

### Logistic Regression Class

In [2]:
# Logistic Regression
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to each gradient update.
    num_iterations : int, default 1000
        Maximum number of gradient-descent iterations.
    tol : float, default 1e-5
        Training stops early once the absolute change in the cross-entropy
        cost between two consecutive iterations drops below this value.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` has been called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000, tol=1e-5):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.tol = tol
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn weights and bias from features ``X`` and 0/1 labels ``y``.

        ``X`` may be anything convertible to a 2-D float array (nested list,
        ndarray, DataFrame); ``y`` anything convertible to a 1-D float array
        of the same length.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``y`` has a different number of
            samples than ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (samples x features)")
        num_samples, num_features = X.shape
        if num_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != num_samples:
            raise ValueError("X and y must contain the same number of samples")

        # Initialize weights and bias
        self.weights = np.zeros(num_features)
        self.bias = 0.0

        old_cost = None
        # Probabilities are clipped away from exactly 0 and 1 before taking
        # logs; otherwise a saturated prediction yields log(0) and a NaN cost,
        # which would silently disable the stopping criterion below.
        eps = np.finfo(float).eps

        # Gradient descent
        for _ in range(self.num_iterations):
            y_pred = self._sigmoid(X @ self.weights + self.bias)

            # Compute cost (mean binary cross-entropy)
            clipped = np.clip(y_pred, eps, 1 - eps)
            J = -np.mean(y * np.log(clipped) + (1 - y) * np.log(1 - clipped))

            # Check stopping criterion
            if old_cost is not None and abs(J - old_cost) < self.tol:
                break

            # Update weights and bias
            error = y_pred - y
            dw = X.T @ error / num_samples
            db = error.mean()

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            old_cost = J

    def predict(self, X):
        """Return a list of 0/1 class labels, one per row of ``X``.

        A row is labelled 1 when its predicted probability is strictly
        greater than 0.5.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("fit must be called before predict")

        X = np.asarray(X, dtype=float)
        probabilities = self._sigmoid(X @ self.weights + self.bias)
        return (probabilities > 0.5).astype(int).tolist()

    def _sigmoid(self, x):
        """Logistic function 1 / (1 + exp(-x)).

        Evaluated as exp(-log(1 + exp(-x))) via ``np.logaddexp`` so that
        large-magnitude inputs do not overflow inside ``exp``.
        """
        return np.exp(-np.logaddexp(0, -x))


### Import Data

In [17]:
# Read the iris CSV, keep only the Setosa and Virginica rows, and renumber
# the remaining rows from zero.
df = (
    pd.read_csv('iris.csv')
    .loc[lambda frame: frame['variety'].isin(['Setosa', 'Virginica'])]
    .reset_index(drop=True)
)

df.head()


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


### Create Training and Testing sets

In [18]:
# use logistic regression to classify setosa and virginica
# Features are every column except the label.
# NOTE(review): the preview above lists an 'Unnamed: 0' column. If that is a
# row index saved in the CSV (rather than just the displayed DataFrame index)
# it must not be used as a feature, so it is dropped when present;
# errors='ignore' makes this a no-op otherwise.
X = df.drop(columns=['variety', 'Unnamed: 0'], errors='ignore')
y = df['variety'].map({'Setosa': 0, 'Virginica': 1})

# split data into train and test sets without using sklearn
num_samples = X.shape[0]
num_train = int(0.8 * num_samples)
num_test = num_samples - num_train

# Shuffle row positions with a fixed seed before slicing. The rows are still
# in file order here (presumably grouped by variety -- confirm), so slicing
# without shuffling can leave the test set with a single class.
shuffled_positions = np.random.default_rng(42).permutation(num_samples)
train_positions = shuffled_positions[:num_train]
test_positions = shuffled_positions[num_train:]

X_train = X.iloc[train_positions]
X_test = X.iloc[test_positions]
y_train = y.iloc[train_positions]
y_test = y.iloc[test_positions]


### Fit and Predict then display Accuracy

In [19]:
# Fit a fresh classifier on the training split
model = LogisticRegression()
model.fit(X_train, y_train)

# Class labels predicted for the held-out rows
y_pred = model.predict(X_test)

# Fraction of held-out rows whose predicted label equals the true label
accuracy = (y_pred == y_test).sum() / num_test
print('Accuracy:', accuracy)


Accuracy: 1.0


#### Use a hold-out validation split (train / validation / test) to check for overfitting by making sure validation accuracy is not noticeably higher than test accuracy.

In [20]:
# Define the proportions for the splits
train_prop = 0.7
val_prop = 0.15
test_prop = 0.15
assert abs(train_prop + val_prop + test_prop - 1.0) < 1e-9, "split proportions must sum to 1"

# NOTE(review): drop a saved row-index column if one is present in X (the data
# preview lists 'Unnamed: 0'); a no-op when the column does not exist.
X_features = X.drop(columns=['Unnamed: 0'], errors='ignore')

# Calculate the number of samples
num_samples = len(X_features)

# Calculate the indices for the splits
train_end = int(train_prop * num_samples)
val_end = int((train_prop + val_prop) * num_samples)

# Shuffle row positions with a fixed seed so every split can contain both
# classes (rows are otherwise in file order), then cut the shuffled order at
# the two boundaries. Positional .iloc slicing is used instead of np.split,
# which on pandas objects goes through the deprecated swapaxes path.
shuffled_positions = np.random.default_rng(42).permutation(num_samples)
train_positions = shuffled_positions[:train_end]
val_positions = shuffled_positions[train_end:val_end]
test_positions = shuffled_positions[val_end:]

# Split the data
X_train, X_val, X_test = (
    X_features.iloc[train_positions],
    X_features.iloc[val_positions],
    X_features.iloc[test_positions],
)
y_train, y_val, y_test = (
    y.iloc[train_positions],
    y.iloc[val_positions],
    y.iloc[test_positions],
)

# Train on the training set; the validation and test sets are only scored
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on validation set
y_val_pred = model.predict(X_val)

# Evaluate model on validation set (fraction of matching labels)
val_accuracy = np.mean(np.asarray(y_val_pred) == y_val.to_numpy())
print('Validation Accuracy:', val_accuracy)

# Predict on test set
y_test_pred = model.predict(X_test)

# Evaluate model on test set (fraction of matching labels)
test_accuracy = np.mean(np.asarray(y_test_pred) == y_test.to_numpy())
print('Test Accuracy:', test_accuracy)

Validation Accuracy: 1.0
Test Accuracy: 1.0
