In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from loadData import loadData
from preprocessing import preprocessing


In [3]:
"""
This script demonstrates a basic machine learning pipeline for binary classification using logistic regression.

1. Load the original dataset.
2. Perform preprocessing steps, including data cleaning and feature engineering.
3. Split the preprocessed data into training and testing sets (80% training, 20% testing).
4. Standardize the features with StandardScaler, fitting it on the training set only and applying the same transformation to the test set.
5. Train a logistic regression classifier on the training data.
6. Predict on the test set with the trained classifier and report the accuracy.
"""

# Pipeline: load -> preprocess -> split -> scale -> fit -> evaluate.
originalDataFrame = loadData()
dataFrame = preprocessing(originalDataFrame)

# Column 0 is taken as the label; every remaining column is used as a feature.
targets = dataFrame.iloc[:, 0]
data = dataFrame.iloc[:, 1:]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    targets,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to both partitions.
standard_scaler_normalizer = StandardScaler().fit(x_train)
x_train = standard_scaler_normalizer.transform(x_train)
x_test = standard_scaler_normalizer.transform(x_test)

# fit() returns the estimator itself, so construction and training are chained.
logistic_regression_classifier = LogisticRegression(random_state=42).fit(x_train, y_train)

# Score the held-out rows and print the share that was classified correctly.
y_pred = logistic_regression_classifier.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 73.28%
