In [3]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [4]:
def load_data(train_path, test_path, test_validate):
    """
    Read the train, test and dev income files into DataFrames.

    The files are parsed as header-less CSV; the column names below are
    assigned to every frame.

    Args:
        train_path: Path (or file-like object) of the training split.
        test_path: Path (or file-like object) of the test split.
        test_validate: Path (or file-like object) of the dev/validation split.
    Returns:
        Tuple ``(train_data, test_data, dev_data)`` of Pandas DataFrames.
    """
    columns = [
        "age",
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "race",
        "sex",
        "hours-per-week",
        "country",
        "income",
    ]

    def _read(source):
        # The files carry no header row, so the names are supplied explicitly.
        return pd.read_csv(source, header=None, names=columns)

    # Read in the same order the frames are returned: train, test, dev.
    return tuple(_read(source) for source in (train_path, test_path, test_validate))

def standardize_data(train_data, test_data, dev_data):
    """
    Standardize the numerical columns of all three splits.

    The scaler is fit on the training split only; the test and dev splits are
    then transformed with the training statistics so that no information from
    held-out data leaks into the scaling and all splits share one scale.

    The ``"age"`` and ``"hours-per-week"`` columns of the passed frames are
    overwritten in place; the same frame objects are returned.

    Args:
        train_data: Training DataFrame containing the numerical columns.
        test_data: Test DataFrame containing the numerical columns.
        dev_data: Dev/validation DataFrame containing the numerical columns.
    Returns:
        Tuple ``(train_data, test_data, dev_data)`` with scaled numerical columns.
    """
    numerical_col = ["age", "hours-per-week"]
    scaler = StandardScaler()
    # Learn mean/variance from the training data only ...
    train_data[numerical_col] = scaler.fit_transform(train_data[numerical_col])
    # ... and reuse them for the held-out splits (transform, not fit_transform;
    # re-fitting here would scale each split by its own statistics).
    test_data[numerical_col] = scaler.transform(test_data[numerical_col])
    dev_data[numerical_col] = scaler.transform(dev_data[numerical_col])
    return train_data, test_data, dev_data


def split_data(train_data, test_data, dev_data):
    """
    Separate the ``"income"`` label column from the features of each split.

    Args:
        train_data: Training DataFrame including the ``"income"`` column.
        test_data: Test DataFrame including the ``"income"`` column.
        dev_data: Dev/validation DataFrame including the ``"income"`` column.
    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, X_dev, y_dev)`` where each
        ``X_*`` is the frame without ``"income"`` and each ``y_*`` is the
        ``"income"`` Series.
    """
    label_col = "income"
    parts = []
    for frame in (train_data, test_data, dev_data):
        labels = frame[label_col]
        features = frame.drop(columns=[label_col])
        # Features first, then labels, to match the documented return order.
        parts.extend((features, labels))
    return tuple(parts)

def _encode_income_labels(labels):
    """Map ' <=50K' -> -1 and ' >50K' -> 1; any other value is left unchanged."""
    mapping = {' <=50K': -1, ' >50K': 1}
    # ``map`` with an explicit fallback avoids relying on the implicit dtype
    # downcasting of ``Series.replace``.
    return labels.map(lambda value: mapping.get(value, value))


def ohe_data(X_train, y_train, X_test, y_test, X_dev, y_dev):
    """
    One hot encode categorical features and encode the income labels.

    All three feature frames are encoded together in a single pass so that
    the train, test and dev matrices share exactly the same columns in the
    same order. Encoding dev separately from train/test could otherwise yield
    a different column set whenever a category is missing from one split.

    Args:
        X_train: Train features as Pandas DataFrame.
        y_train: Train labels as Pandas Series.
        X_test: Test features as Pandas DataFrame.
        y_test: Test labels as Pandas Series.
        X_dev: Dev/validation features as Pandas DataFrame.
        y_dev: Dev/validation labels as Pandas Series.
    Returns:
        X_train_ohe: One-hot encoded training features as NumPy array.
        y_train_ohe: Training labels as NumPy array (' <=50K' -> -1, ' >50K' -> 1).
        X_test_ohe: One-hot encoded testing features as NumPy array.
        y_test_ohe: Testing labels as NumPy array.
        X_dev_ohe: One-hot encoded dev features as NumPy array.
        y_dev_ohe: Dev labels as NumPy array.
    """
    n_train = len(X_train)
    n_test = len(X_test)

    # Encode every split at once to guarantee an identical feature layout.
    combined = pd.concat([X_train, X_test, X_dev])
    # dtype=float keeps the indicator columns numeric; boolean indicators mixed
    # with float columns would otherwise turn the final array into dtype=object.
    combined_ohe = pd.get_dummies(combined, dtype=float)

    # Slice positionally: the concatenated index may contain duplicates.
    X_train_ohe = combined_ohe.iloc[:n_train]
    X_test_ohe = combined_ohe.iloc[n_train:n_train + n_test]
    X_dev_ohe = combined_ohe.iloc[n_train + n_test:]

    y_train_ohe = _encode_income_labels(y_train)
    y_test_ohe = _encode_income_labels(y_test)
    y_dev_ohe = _encode_income_labels(y_dev)

    return (
        np.array(X_train_ohe),
        np.array(y_train_ohe),
        np.array(X_test_ohe),
        np.array(y_test_ohe),
        np.array(X_dev_ohe),
        np.array(y_dev_ohe),
    )

def preprocess_data(path_to_train="income.train.txt",
                    path_to_test="income.test.txt",
                    path_to_validate="income.dev.txt"):
    """
    Run the full preprocessing pipeline: load, standardize, split, encode.

    The file locations are parameters so the pipeline can be pointed at data
    outside the current working directory; the defaults are the original
    relative file names.

    Args:
        path_to_train: Location of the training file.
        path_to_test: Location of the test file.
        path_to_validate: Location of the dev/validation file.
    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, X_dev, y_dev)`` as returned
        by ``ohe_data``.
    """
    # Load the data
    print("Loading data...")
    train_data, test_data, dev_data = load_data(
        path_to_train, path_to_test, path_to_validate
    )
    # Standardize the data
    print("Standardizing the data...")
    train_data, test_data, dev_data = standardize_data(train_data, test_data, dev_data)
    # Split data into features and labels
    X_train, y_train, X_test, y_test, X_dev, y_dev = split_data(
        train_data, test_data, dev_data
    )
    # One-hot encode the data
    X_train, y_train, X_test, y_test, X_dev, y_dev = ohe_data(
        X_train, y_train, X_test, y_test, X_dev, y_dev
    )

    return X_train, y_train, X_test, y_test, X_dev, y_dev
if __name__ == "__main__":
    # Run the whole pipeline with the default file locations and keep the
    # resulting arrays at module level for the cells that follow.
    X_train, y_train, X_test, y_test, X_dev, y_dev = preprocess_data()
    print("\nData successfully loaded.")
    

Loading data...


FileNotFoundError: File b'income.train.txt' does not exist