In [7]:
import json
from itertools import product

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_selection import mutual_info_classif, SelectFromModel
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

## Understanding data

In [3]:
def explain_data():
    """Print an exploratory summary of ``biodegradable_a.csv``.

    The summary covers, in order:

    * the table shape and the number of rows with at least one missing value;
    * the number (and percentage) of missing values per column;
    * a heuristic grouping of the feature columns: a column is "categorical"
      when every value reports ``is_integer()``, "continuous" when at least one
      value does not, "binary" when a categorical column only holds 0/1, and
      "multicategoric" when a categorical column holds any other value;
    * the mutual information between each feature and the ``Biodegradable``
      label, computed on complete rows only.

    NaN is a float for which ``is_integer()`` is False, so a column with
    missing values is reported as continuous by this heuristic.
    """
    df = pd.read_csv("biodegradable_a.csv")

    print(df.shape)

    n_incomplete_records = df[df.isna().any(axis=1)].shape[0]
    print("Number of rows with at least one missing value: " + str(n_incomplete_records))

    for col in df:
        n_missing_entries = df[df[col].isna()].shape[0]
        print(f"Column {str(col).ljust(14)} has {str(n_missing_entries).rjust(4)} missing values ({str(round(100*n_missing_entries/df.shape[0]))}%)")

    feature_df = df.drop(["Biodegradable"], axis=1)

    categorical = [col for col in feature_df if df[col].apply(lambda v: v.is_integer()).all()]
    print(f"Categorical: {categorical}")

    continuous = [col for col in feature_df if df[col].apply(lambda v: not v.is_integer()).any()]
    print(f"Continuous: {continuous}")

    binary = [col for col in categorical if df[col].apply(lambda v: v == 0 or v == 1).all()]
    print(f"Binary: {binary}")

    multicategoric = [col for col in categorical if df[col].apply(lambda v: v != 0 and v != 1).any()]
    print(f"Multicategoric: {multicategoric}")

    df = df.dropna()

    # NOTE(review): SpMax_B is excluded from the mutual-information matrix; the
    # reason is not visible here -- confirm that this is intentional.
    mi_features = df.drop(["SpMax_B", "Biodegradable"], axis=1)
    x = np.array(mi_features)
    y = np.array(df["Biodegradable"])

    mis = mutual_info_classif(x, y, random_state=1)
    # Take the labels from the very frame the matrix was built from, so that
    # every score is printed next to the column it was computed for.
    for feature, mi in zip(mi_features.columns, mis):
        print(f"{feature}: {mi}")

    print(mis)
    print(df["nN_N"])
    print(df["nN_N"].cumsum())
    
# Run the exploratory summary; it prints its report and returns nothing.
explain_data()

(4564, 42)
Number of rows with at least one missing value: 3675
Column SpMax_L        has    0 missing values (0%)
Column J_Dz(e)        has    0 missing values (0%)
Column nHM            has    0 missing values (0%)
Column F01            has  515 missing values (11%)
Column F04            has    0 missing values (0%)
Column NssssC         has    0 missing values (0%)
Column nCb            has    0 missing values (0%)
Column C              has  767 missing values (17%)
Column nCp            has  671 missing values (15%)
Column nO             has    0 missing values (0%)
Column F03            has    0 missing values (0%)
Column SdssC          has    0 missing values (0%)
Column HyWi_B         has  479 missing values (10%)
Column LOC            has    0 missing values (0%)
Column SM6_L          has    0 missing values (0%)
Column F03_CO         has   43 missing values (1%)
Column Me             has  448 missing values (10%)
Column Mi             has    0 missing values (0%)
Column nN_N  

## Data preprocessing

In [4]:
def cast_categorical_to_str(all_features: pd.DataFrame) -> pd.DataFrame:
    """Convert whole-number feature columns into string labels.

    A column is treated as categorical when every one of its values reports
    ``is_integer()``. NaN does not, so a column with missing values is left
    untouched. Categorical columns are cast to ``int`` and then to ``str``
    (``2.0`` becomes ``"2"``).

    Parameters
    ----------
    all_features:
        Feature table whose values expose ``is_integer()`` (e.g. floats).

    Returns
    -------
    pd.DataFrame
        The non-categorical columns first, followed by the categorical columns
        as strings. The row index of ``all_features`` is preserved.
    """
    def is_categorical(column_name) -> bool:
        return all_features[column_name].apply(lambda value: value.is_integer()).all()

    # Evaluate the (row-wise, hence slow) predicate only once per column.
    categorical_flags = [(name, is_categorical(name)) for name in all_features]
    categorical_cols = [name for name, flag in categorical_flags if flag]
    other_cols = [name for name, flag in categorical_flags if not flag]

    categorical_matrix = all_features[categorical_cols].to_numpy().astype(int).astype(str)

    # Reuse the caller's index: with a fresh RangeIndex, concat(axis=1) would
    # align on labels and emit extra NaN-filled rows for any frame whose index
    # is not 0..n-1 (for instance after dropna()).
    categorical = pd.DataFrame(data=categorical_matrix,
                               columns=categorical_cols,
                               index=all_features.index)

    return pd.concat([all_features[other_cols], categorical], axis=1)


def one_hot_encode_categorical_features(all_features: pd.DataFrame) -> pd.DataFrame:
    """Replace categorical columns by one-hot indicator columns.

    A column is treated as categorical when every value is either not a
    ``float`` (e.g. already a string label) or a float holding a whole number.
    Its values are cast to ``int`` and then to ``str`` before being encoded.

    Parameters
    ----------
    all_features:
        Feature table mixing categorical and other columns.

    Returns
    -------
    pd.DataFrame
        The non-categorical columns first, followed by one indicator column
        per (feature, category) pair named ``"<feature>_<category>"``. The row
        index of ``all_features`` is preserved.
    """
    def is_categorical(column_name) -> bool:
        return all_features[column_name].apply(
            lambda value: not isinstance(value, float) or value.is_integer()
        ).all()

    # Evaluate the row-wise predicate only once per column.
    categorical_flags = [(name, is_categorical(name)) for name in all_features]
    categorical_cols = [name for name, flag in categorical_flags if flag]
    other_cols = [name for name, flag in categorical_flags if not flag]

    encoder = OneHotEncoder(handle_unknown="ignore")
    categorical_matrix = all_features[categorical_cols].to_numpy().astype(int).astype(str)
    categorical_encoded_matrix = encoder.fit_transform(categorical_matrix).toarray()

    headers = [
        base_name + "_" + category
        for base_name, categories in zip(categorical_cols, encoder.categories_)
        for category in categories
    ]

    # Reuse the caller's index: with a fresh RangeIndex, concat(axis=1) would
    # align on labels and emit extra NaN-filled rows for any frame whose index
    # is not 0..n-1 (for instance after dropna()).
    encoded = pd.DataFrame(data=categorical_encoded_matrix,
                           columns=headers,
                           index=all_features.index)

    return pd.concat([all_features[other_cols], encoded], axis=1)


def scale_continuous_features(all_features: pd.DataFrame) -> pd.DataFrame:
    """Standardise the continuous columns with ``StandardScaler``.

    A column is treated as continuous when at least one value is a ``float``
    that is not a whole number. NaN satisfies that test, so a float column
    with missing values is scaled as well.

    NOTE(review): the scaler is fitted on every row passed in, so calling this
    before a train/test split lets held-out rows influence the scaling.

    Parameters
    ----------
    all_features:
        Feature table mixing continuous and other columns.

    Returns
    -------
    pd.DataFrame
        The non-continuous columns first, followed by the scaled continuous
        columns under their original names. The row index of ``all_features``
        is preserved.
    """
    def is_continuous(column_name) -> bool:
        return all_features[column_name].apply(
            lambda value: isinstance(value, float) and not value.is_integer()
        ).any()

    # Evaluate the row-wise predicate only once per column.
    continuous_flags = [(name, is_continuous(name)) for name in all_features]
    continuous_cols = [name for name, flag in continuous_flags if flag]
    other_cols = [name for name, flag in continuous_flags if not flag]

    scaler = StandardScaler()
    continuous = all_features[continuous_cols]
    continuous_scaled_matrix = scaler.fit_transform(continuous.to_numpy())

    # Reuse the caller's index: with a fresh RangeIndex, concat(axis=1) would
    # align on labels and emit extra NaN-filled rows for any frame whose index
    # is not 0..n-1 (for instance after dropna()).
    scaled = pd.DataFrame(data=continuous_scaled_matrix,
                          columns=continuous.columns,
                          index=all_features.index)

    return pd.concat([all_features[other_cols], scaled], axis=1)


def preprocess(df: pd.DataFrame) -> tuple[pd.DataFrame, np.ndarray]:
    """Split a raw table into a preprocessed feature frame and a label array.

    The ``Biodegradable`` column becomes the label array. The remaining
    columns are passed through ``cast_categorical_to_str`` and then
    ``scale_continuous_features``. One-hot encoding is not applied here.

    Parameters
    ----------
    df:
        Raw table containing a ``Biodegradable`` column.

    Returns
    -------
    tuple[pd.DataFrame, np.ndarray]
        ``(x, y)``: the preprocessed features and the labels.
    """
    x = df.drop(["Biodegradable"], axis=1)
    y = np.array(df["Biodegradable"])
    x = cast_categorical_to_str(x)
    x = scale_continuous_features(x)
    return x, y


# Load the raw CSV, preprocess it and show the resulting feature frame.
x, y = preprocess(pd.read_csv("biodegradable_a.csv"))
print(x)

     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...      nCIR   SpMax_A  \
0      0   0      0   0  0   0    0      0     0   0  ... -0.285357 -0.921040   
1      0   0      0   0  1   0    0      0     0   0  ...  0.110875       NaN   
2      0   0      0   0  4   0    0      0     0   0  ... -0.285357       NaN   
3      0   0      0   0  2   0    0      0     0   0  ... -0.285357 -3.439015   
4      0   0      0   0  4   0    0      0     0   0  ...       NaN -0.663410   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...       ...       ...   
4559   0   0      0   0  2   0    0      0     0   0  ...       NaN       NaN   
4560   0   0      0   0  0   0    0      0     0   0  ... -0.285357 -1.797079   
4561   0   0      0   0  2   0    0      0     0   0  ... -0.285357       NaN   
4562   0   0      0   0  3   0    0      0     0   0  ... -0.285357 -0.244334   
4563   0   0      0   0  1   0    0      0     0   0  ... -0.285357 -1.301055   

      Psi_i_1d       SdO   

## Feature selection

In [55]:
def rank_features_using_model(x: pd.DataFrame, y: np.ndarray, estimator) -> dict:
    """Rank features by the importance a fitted estimator assigns to them.

    Rows of ``x`` with missing values are dropped first. The estimator is then
    fitted through ``SelectFromModel`` and its ``coef_`` (when present) or
    ``feature_importances_`` is read back.

    Parameters
    ----------
    x:
        Feature table. Its index labels are used as positions into ``y``, so
        it is expected to still carry the 0..n-1 index it was loaded with.
    y:
        Label array aligned with the rows of ``x``.
    estimator:
        Unfitted scikit-learn estimator exposing ``coef_`` or
        ``feature_importances_`` once fitted.

    Returns
    -------
    dict
        Feature name -> importance, ordered by decreasing absolute importance.
        With a single coefficient row the values keep their sign. With several
        rows (multi-class linear models) they are the L1 norm across rows.
    """
    x = x.dropna()
    y = y[x.index.values]
    selector = SelectFromModel(estimator).fit(x.to_numpy(), y)

    fitted = selector.estimator_
    importances = fitted.coef_ if hasattr(fitted, "coef_") else fitted.feature_importances_
    importances = np.asarray(importances)

    # Decide on the number of dimensions rather than on len(): a 1-D vector for
    # a single feature also has length 1 and must not be collapsed to a scalar.
    if importances.ndim == 2:
        if importances.shape[0] == 1:
            importances = importances[0]
        else:
            # One coefficient row per class: aggregate to one score per feature
            # (L1 norm, which is SelectFromModel's default norm_order).
            importances = np.abs(importances).sum(axis=0)

    ranks = dict(sorted(zip(x.columns, importances), key=lambda r: abs(r[1]), reverse=True))

    return ranks


def select_features_using_regularized_model(x: pd.DataFrame, y: np.ndarray) -> pd.DataFrame:
    """Keep the features retained by an L1-penalised ``LinearSVC``.

    Rows of ``x`` with missing values are dropped, a
    ``LinearSVC(C=0.01, penalty="l1", dual=False)`` is fitted on the rest, and
    ``SelectFromModel`` keeps the features it considers important.

    Parameters
    ----------
    x:
        Feature table. Its index labels are used as positions into ``y``, so
        it is expected to still carry the 0..n-1 index it was loaded with.
    y:
        Label array aligned with the rows of ``x``.

    Returns
    -------
    pd.DataFrame
        The selected columns of the complete rows, with a fresh 0..k-1 index.
    """
    x = x.dropna()
    y = y[x.index.values]
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(x.to_numpy(), y)
    model = SelectFromModel(lsvc, prefit=True)
    # get_support() is a boolean mask over the input columns; indexing with it
    # labels each kept column with its own name.
    headers = x.columns[model.get_support()]
    x_selected = model.transform(x.to_numpy())
    return pd.DataFrame(data=x_selected, columns=headers)


def select_features_using_random_forest(x: pd.DataFrame, y: np.ndarray, random_state=None) -> pd.DataFrame:
    """Keep the features retained by a tree-ensemble importance threshold.

    Despite the function name, the estimator is an
    ``ExtraTreesClassifier(n_estimators=50)``. Rows of ``x`` with missing
    values are dropped before fitting, and ``SelectFromModel`` keeps the
    features it considers important.

    Parameters
    ----------
    x:
        Feature table. Its index labels are used as positions into ``y``, so
        it is expected to still carry the 0..n-1 index it was loaded with.
    y:
        Label array aligned with the rows of ``x``.
    random_state:
        Forwarded to the classifier; pass an integer for a reproducible
        selection. The default ``None`` leaves the selection unseeded.

    Returns
    -------
    pd.DataFrame
        The selected columns of the complete rows, with a fresh 0..k-1 index.
    """
    x = x.dropna()
    y = y[x.index.values]
    clf = ExtraTreesClassifier(n_estimators=50, random_state=random_state)
    clf = clf.fit(x.to_numpy(), y)
    model = SelectFromModel(clf, prefit=True)
    # get_support() is a boolean mask over the input columns; indexing with it
    # labels each kept column with its own name.
    headers = x.columns[model.get_support()]
    x_selected = model.transform(x.to_numpy())
    return pd.DataFrame(data=x_selected, columns=headers)
    
    
x, y = preprocess(pd.read_csv("biodegradable_a.csv"))
# NOTE: despite its name, features_ranked_with_svm holds the ranking produced
# by an L1-penalised logistic regression, not by an SVM.
features_ranked_with_svm = rank_features_using_model(x, y, LogisticRegression(C=0.1, penalty="l1", solver="saga", max_iter=1000))
features_ranked_with_rf = rank_features_using_model(x, y, RandomForestClassifier(n_estimators=1000))

# json is imported in the notebook's import cell.
print(json.dumps(features_ranked_with_svm, indent=4))
print(json.dumps(features_ranked_with_rf, indent=4))

{
    "nHM": -1.4698770676520292,
    "F02_CN": -0.861933098416884,
    "nCp": -0.6995738774362152,
    "SpMax_L": -0.6109924658101019,
    "nO": 0.44481581054943864,
    "nCb": -0.4009357409653489,
    "F01": -0.32015282706975323,
    "nCrt": -0.30827329489627764,
    "LOC": 0.23797835132523126,
    "nX": -0.18748864865426515,
    "F03": -0.18691968931227684,
    "SpMax_B": -0.1558834225542856,
    "Psi_i_A": 0.017768339334521606,
    "F04": 0.0,
    "NssssC": 0.0,
    "nN_N": 0.0,
    "nArNO2": 0.0,
    "nCRX3": 0.0,
    "B01": 0.0,
    "B03": 0.0,
    "N_073": 0.0,
    "B04": 0.0,
    "C_026": 0.0,
    "nHDon": 0.0,
    "nN": 0.0,
    "nArCOOR": 0.0,
    "J_Dz(e)": 0.0,
    "C": 0.0,
    "SdssC": 0.0,
    "HyWi_B": 0.0,
    "SM6_L": 0.0,
    "F03_CO": 0.0,
    "Me": 0.0,
    "Mi": 0.0,
    "SpPosA_B": 0.0,
    "nCIR": 0.0,
    "SpMax_A": 0.0,
    "Psi_i_1d": 0.0,
    "SdO": 0.0,
    "TI2_L": 0.0,
    "SM6_B": 0.0
}
{
    "SpMax_B": 0.07741662729863712,
    "SpMax_L": 0.0764941665324

## Model training

In [54]:
def simple_cross_validation(*, model, x, y, train_partition=(0, 0.75), metrics=None):
    """Fit ``model`` on one contiguous slice of the data and score it on the rest.

    This is a single hold-out split, not a k-fold procedure, and the rows are
    used in the order given (no shuffling).

    Parameters
    ----------
    model:
        Object exposing ``fit(x, y)`` and ``predict(x)``.
    x:
        2-D feature array.
    y:
        1-D label array aligned with the rows of ``x``.
    train_partition:
        ``(start, end)`` fractions of the row count delimiting the training
        slice. Every row outside that slice is used for testing.
    metrics:
        Mapping of metric name -> ``f(y_true, y_pred)``. Defaults to
        ``{"accuracy": accuracy_score}``.

    Returns
    -------
    dict
        Metric name -> value computed on the test rows.
    """
    # Build the default per call instead of sharing one dict between calls.
    if metrics is None:
        metrics = {"accuracy": accuracy_score}

    n = x.shape[0]

    start = int(train_partition[0] * n)
    end = int(train_partition[1] * n)

    x_train = x[start:end, :]
    y_train = y[start:end]
    # The test rows are whatever lies before and after the training slice.
    x_test = np.concatenate((x[:start, :], x[end:, :]))
    y_test = np.concatenate((y[:start], y[end:]))

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    return {metric_name: metric_f(y_test, y_pred) for (metric_name, metric_f) in metrics.items()}


def tree_classifier(x: np.ndarray, y: np.ndarray, metrics, validation):
    """Evaluate decision trees over a small hyper-parameter grid.

    One ``DecisionTreeClassifier`` is built per combination of criterion,
    splitter and maximum depth and handed to ``validation(model, x, y,
    metrics)``.

    Returns a dict keyed by ``"tree-<criterion>-<splitter>-<max_depth>"`` whose
    values hold the model and its evaluation.
    """
    # Hyper-parameter grid
    grid = product(["gini"], ["best"], [5, 10, 20, 50, None])

    evaluated = {}
    for combination in grid:
        criterion, splitter, max_depth = combination
        candidate = tree.DecisionTreeClassifier(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
        )
        evaluation = validation(candidate, x, y, metrics)
        label = "tree-" + "-".join(str(value) for value in combination)
        evaluated[label] = {"model": candidate, "evaluation": evaluation}

    return evaluated


def logistic_regression(x: np.ndarray, y: np.ndarray, metrics, validation):
    """Evaluate a ``LogisticRegression(max_iter=1000)`` model.

    The model is handed to ``validation(model, x, y, metrics)``; the result is
    returned under the key ``"logistic"`` together with the model itself.
    """
    classifier = LogisticRegression(max_iter=1000)
    evaluation = validation(classifier, x, y, metrics)
    return {"logistic": {"model": classifier, "evaluation": evaluation}}


def linear_svc(x: np.ndarray, y: np.ndarray, metrics, validation):
    """Evaluate a ``LinearSVC(max_iter=10000)`` model.

    The model is handed to ``validation(model, x, y, metrics)``; the result is
    returned under the key ``"linearsvc"`` together with the model itself.
    """
    classifier = LinearSVC(max_iter=10000)
    evaluation = validation(classifier, x, y, metrics)
    return {"linearsvc": {"model": classifier, "evaluation": evaluation}}


def random_forest(x: np.ndarray, y: np.ndarray, metrics, validation):
    """Evaluate a default-configured ``RandomForestClassifier``.

    The model is handed to ``validation(model, x, y, metrics)``; the result is
    returned under the key ``"randforest"`` together with the model itself.
    """
    classifier = RandomForestClassifier()
    evaluation = validation(classifier, x, y, metrics)
    return {"randforest": {"model": classifier, "evaluation": evaluation}}


def adaboost(x: np.ndarray, y: np.ndarray, metrics, validation):
    """Evaluate a default-configured ``AdaBoostClassifier``.

    The model is handed to ``validation(model, x, y, metrics)``; the result is
    returned under the key ``"adaboost"`` together with the model itself.
    """
    classifier = AdaBoostClassifier()
    evaluation = validation(classifier, x, y, metrics)
    return {"adaboost": {"model": classifier, "evaluation": evaluation}}


def model_selection(x: np.ndarray, y: np.ndarray):
    """Evaluate every candidate model family on the same hold-out split.

    Each family function receives the data, the metric mapping and a
    validation callable built on ``simple_cross_validation``.

    Parameters
    ----------
    x:
        2-D feature array.
    y:
        1-D label array aligned with the rows of ``x``.

    Returns
    -------
    dict
        Model name -> ``{"model": ..., "evaluation": ...}``, merged over the
        decision-tree grid, logistic regression, random forest, linear SVC and
        AdaBoost (in that order).
    """
    metrics = {"accuracy": accuracy_score}

    def validation(model, x, y, metrics):
        # Forward the metrics: they used to be dropped here, so changing the
        # mapping above had no effect on the evaluation.
        return simple_cross_validation(model=model, x=x, y=y, metrics=metrics)

    results = {}
    for evaluate_family in (tree_classifier, logistic_regression, random_forest, linear_svc, adaboost):
        results |= evaluate_family(x, y, metrics, validation)

    return results


df = pd.read_csv("biodegradable_a.csv")
x, y = preprocess(df)
# Hard-coded feature subset; presumably taken from an earlier run of
# list(select_features_using_regularized_model(x, y).columns) -- TODO confirm.
selected_features = ['F02_CN', 'SpMax_L', 'nHM', 'F03', 'SpPosA_B', 'F04', 'nCb', 'NssssC', 'SM6_B', 'nN', 'Mi', 'SdssC']
x = x[selected_features]
print(x)
x = one_hot_encode_categorical_features(x)
print(x)
print(model_selection(x.to_numpy(), y))

     F02_CN   SpMax_L nHM F03  SpPosA_B F04 nCb NssssC     SM6_B nN        Mi  \
0         0 -1.320531   0   0  0.000615   0   0      0 -0.981633  0  0.328503   
1         0 -0.798293   0   0 -1.207722   0   0      0 -0.977175  0  0.423761   
2         0 -1.293482   0   0 -1.357207   0   0      0 -0.593796  0  0.804797   
3         0 -3.232628   0   0 -2.204289   0   0      0 -1.609081  0  1.519239   
4         0 -0.660971   0   0 -0.796638   0   0      0 -0.145778  0  0.566650   
...     ...       ...  ..  ..       ...  ..  ..    ...       ... ..       ...   
4559      0 -0.786537   0   0 -1.008886   0   0      0 -0.810285  0  0.726586   
4560      0 -1.456332   0   0  0.976838   0   0      0 -0.693530  0  0.212358   
4561      0 -0.538691   0   0 -0.926756   0   0      0 -0.484217  0  0.646039   
4562      0  0.013934   0   0 -0.629899   0   0      0 -0.250759  0  0.342948   
4563      2 -1.058169   0   0 -1.140085   0   0      0 -1.415928  1  1.322359   

         SdssC  
0     0.39