In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from loadData import loadData
from preprocessing import preprocessing


In [2]:
"""
Random Forest classification pipeline (described by the author as binary classification).

Steps performed by this cell:
1. Load the original dataset with loadData().
2. Run preprocessing() on it (data cleaning and feature engineering, per the original notes).
3. Split the preprocessed data into training and testing sets (80% training, 20% testing)
   using a fixed seed so the split is reproducible.
4. Standardize the features with StandardScaler. The scaler is fitted on the training
   split only and then applied to the test split, so no test statistics leak into training.
5. Train a Random Forest classifier on the scaled training data.
6. Predict on the scaled test data and print the accuracy as a percentage.

NOTE(review): tree-based models such as Random Forest are generally insensitive to
feature scaling, so step 4 is probably not required for this model; it is kept so the
results stay identical to the original pipeline.
"""

# Tunable settings, named once so the split and the forest use the same seed.
TEST_SIZE = 0.2     # fraction of rows held out for testing
RANDOM_STATE = 42   # fixed seed for a reproducible split and forest

originalDataFrame = loadData()

# Perform data preprocessing
dataFrame = preprocessing(originalDataFrame)

# Fail early with a clear message instead of an obscure error during fitting.
assert dataFrame.shape[1] >= 2, "expected a target column plus at least one feature column"

# Separate features (data) and target variable (targets).
# NOTE(review): assumes preprocessing() returns the target in column 0 and every
# remaining column is a feature — confirm against preprocessing().
data = dataFrame.iloc[:, 1:]  # Features
targets = dataFrame.iloc[:, 0]  # Target variable

# Split data into training and testing sets.
# The unscaled splits keep their own names so they are not overwritten by the scaler output.
# NOTE(review): the split is not stratified; consider stratify=targets if the classes
# are imbalanced (this would change the reported accuracy).
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    data, targets, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Standardize the features: fit on the training split only, reuse those statistics for the test split.
standard_scaler_normalizer = StandardScaler()
x_train = standard_scaler_normalizer.fit_transform(x_train_raw)
x_test = standard_scaler_normalizer.transform(x_test_raw)

# Train a Random Forest classifier
random_forest_classifier = RandomForestClassifier(random_state=RANDOM_STATE)
random_forest_classifier.fit(x_train, y_train)

# Predict on the test set
y_pred = random_forest_classifier.predict(x_test)

# Evaluate the model's accuracy (fraction of correct predictions, printed as a percentage)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 81.30%
