# Hyer ML Logistic Regression

This script outlines a machine learning project that uses a simple neural network, built with PyTorch, to predict task fill status from a small set of features. The script first loads a dataset and preprocesses it by parsing date-time information and encoding categorical variables. After preprocessing, the dataset is split into training and testing sets. A simple neural network model comprising three layers is then defined using PyTorch. The model is trained on the training set for a specified number of epochs. After training, the model's performance is evaluated on the testing set using accuracy, a confusion matrix, and a classification report to assess how well it predicts the task fill status.

## Import necessary libraries and modules

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from torch.utils.data import DataLoader, TensorDataset

## Load the data

In [None]:
print('Loading data...')

# The dataset is read from a CSV file in the working directory. For an Excel
# export, use pd.read_excel('data_complete.xlsx') instead.
csv_path = 'subset.csv'

# low_memory=False asks pandas not to parse the file in internal chunks,
# which avoids mixed-dtype columns from chunk-wise type inference.
data = pd.read_csv(csv_path, low_memory=False)

## Data Exploration and Preprocessing

In [None]:
print(f'Preprocessing records: {len(data)}')

# Parse the creation timestamp once and reuse the parsed series for every
# time-derived feature below.
created_at = pd.to_datetime(data['DateCreated'])
data['DateCreated'] = created_at

# Time-of-creation features (pandas numbers weekdays with Monday = 0).
data['HourOfDay'] = created_at.dt.hour
data['DayOfWeek'] = created_at.dt.dayofweek

# Task duration expressed in hours instead of seconds.
data['EstimatedHours'] = data['EstimatedNumberOfSeconds'].div(3600)

# Binary visibility flag: 'Public' -> 0, every other value (including
# missing values) -> 1.
data['PrivatePublic_encoded'] = data['Private or Public'].map(
    lambda visibility: 0 if visibility == 'Public' else 1
)

## Split the data

In [None]:
print('Splitting data...')

# Model inputs: the four engineered feature columns.
feature_columns = ['HourOfDay', 'DayOfWeek', 'EstimatedHours', 'PrivatePublic_encoded']
X = data[feature_columns]

# Binary target: 1 when the task was filled, 0 for any other status.
y = data['Task Fill Status'].map(lambda status: 1 if status == 'Filled' else 0)

# Hand plain NumPy arrays to the splitter, so all four outputs are NumPy
# arrays as well. 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.2,
    random_state=42,
)

## Scale the data

In [None]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transform to both splits so no test-set
# statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Convert data to torch tensors

In [None]:
# Convert the scaled features and the labels to float32 tensors.
# y_train / y_test are already NumPy arrays (train_test_split was given
# y.values), so they are passed directly: calling `.values` on an ndarray
# raises AttributeError. Labels are float32 because nn.BCELoss expects
# floating-point targets.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

## Create a TensorDataset and DataLoader

In [None]:
# Pair each training feature row with its label, then serve the pairs in
# reshuffled mini-batches of 32 on every pass over the data.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)

## Define the model

In [None]:
class SimpleNN(nn.Module):
    """Three-layer fully connected network for binary classification.

    Two ReLU-activated hidden layers feed a single sigmoid output unit, so
    the forward pass produces one value in [0, 1] per input row.

    Args:
        input_dim: Number of input features per sample. Defaults to 4, the
            number of feature columns used in this notebook.
        hidden_dim1: Width of the first hidden layer. Defaults to 64.
        hidden_dim2: Width of the second hidden layer. Defaults to 32.
    """

    def __init__(self, input_dim=4, hidden_dim1=64, hidden_dim2=32):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, 1)

    def forward(self, x):
        """Map inputs of shape (..., input_dim) to outputs of shape (..., 1).

        The sigmoid on the last layer means the outputs are probabilities,
        which is what nn.BCELoss expects (not raw logits).
        """
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x


# Default construction keeps the original 4 -> 64 -> 32 -> 1 architecture.
model = SimpleNN()

## Define the loss function and optimizer

In [None]:
# Binary cross-entropy on probabilities; this pairs with the sigmoid that the
# model applies to its final layer.
criterion = nn.BCELoss()

# Adam over every trainable parameter of the model, learning rate 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

## Train the model

In [None]:
num_epochs = 10

print(f'Training model ({num_epochs} Epochs)...')

# Make sure the model is in training mode, in case this cell is re-run after
# the evaluation cell has switched it to eval mode.
model.train()

for epoch in range(num_epochs):
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        # Drop only the trailing size-1 output dimension. A bare .squeeze()
        # would also collapse the batch dimension when the final batch holds
        # a single sample, giving a 0-d tensor whose shape no longer matches
        # batch_y and making BCELoss raise.
        outputs = model(batch_x).squeeze(1)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    # The reported loss is that of the last mini-batch of the epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

## Evaluate the model

In [None]:
print('Evaluating model...')

# Switch to inference mode before scoring the held-out data.
model.eval()

# Get the model's predicted probabilities on the test data. Gradients are
# not needed here. Only the trailing size-1 output dimension is squeezed so
# the result stays 1-D even when the test set contains a single row (a bare
# .squeeze() would yield a 0-d array that the sklearn metrics reject).
with torch.no_grad():
    y_pred_prob = model(X_test_tensor).squeeze(1).numpy()

# Convert the probabilities to binary class labels (0 or 1) using a 0.5
# decision threshold.
y_pred = (y_pred_prob > 0.5).astype(int)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')