In [43]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import mutual_info_classif, SelectFromModel
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import LinearSVC

## Example

In [5]:
def example():
    df = pd.read_csv("biodegradable_a.csv")
    x = df[["nHM", "F01"]].to_numpy().astype(int).astype(str)
    y = df["Biodegradable"].to_numpy()
    
    print(df["Biodegradable"].count())
    print(df["Biodegradable"].shape)
    
    clf = tree.DecisionTreeClassifier().fit(x, y)
    
    
example()

4564
(4564,)


## Data preprocessing

In [31]:
def cast_categorical_to_str(all_features: pd.DataFrame) -> pd.DataFrame:
    is_categorical = lambda f: all_features[f].apply(lambda x: x.is_integer()).all()
    categorical_cols = [f for f in all_features if is_categorical(f)]
    other_cols = [f for f in all_features if not is_categorical(f)]
    
    categorical_matrix = all_features[categorical_cols].to_numpy().astype(int).astype(str)
    headers = categorical_cols
    
    return pd.concat([all_features[other_cols], pd.DataFrame(data=categorical_matrix, columns=headers)], axis=1)


def one_hot_encode_categorical_features(all_features: pd.DataFrame) -> pd.DataFrame:
    is_categorical = lambda f: all_features[f].apply(lambda x: x.is_integer()).all()
    categorical_cols = [f for f in all_features if is_categorical(f)]
    other_cols = [f for f in all_features if not is_categorical(f)]
    
    encoder = OneHotEncoder(handle_unknown="ignore")
    categorical_matrix = all_features[categorical_cols].to_numpy().astype(int).astype(str)
    categorical_encoded_matrix = encoder.fit_transform(categorical_matrix).toarray()
    
    headers = []
    for base_name, categories in zip(categorical_cols, encoder.categories_):
        for c in categories:
            headers.append(base_name + "_" + c)
        # print(f"{base_name} has {len(categories)} categories")
    
    return pd.concat([all_features[other_cols], pd.DataFrame(data=categorical_encoded_matrix, columns=headers)], axis=1)


def scale_continuous_features(all_features: pd.DataFrame) -> pd.DataFrame:
    is_continuous = lambda f: all_features[f].apply(lambda x: isinstance(x, float) and not x.is_integer()).any()
    continuous_cols = [f for f in all_features if is_continuous(f)]
    other_cols = [f for f in all_features if not is_continuous(f)]
    
    scaler = StandardScaler()
    continuous = all_features[continuous_cols]
    headers = continuous.columns
    continuous_matrix = continuous.to_numpy()
    continuous_scaled_matrix = scaler.fit_transform(continuous_matrix)
    
    return pd.concat([all_features[other_cols], pd.DataFrame(data=continuous_scaled_matrix, columns=headers)], axis=1)


def preprocess(df: pd.DataFrame) -> (pd.DataFrame, np.ndarray):
    #x = np.array(df.drop(["Biodegradable"], axis=1))
    x = df.drop(["Biodegradable"], axis=1)
    y = np.array(df["Biodegradable"])
    # x = one_hot_encode_categorical_features(x)
    x = cast_categorical_to_str(x)
    x = scale_continuous_features(x)
    return x, y


x, y = preprocess(pd.read_csv("biodegradable_a.csv"))
print(x)

     nHM F04 NssssC nCb nO F03 nN_N nArNO2 nCRX3 B01  ...      nCIR   SpMax_A  \
0      0   0      0   0  0   0    0      0     0   0  ... -0.285357 -0.921040   
1      0   0      0   0  1   0    0      0     0   0  ...  0.110875       NaN   
2      0   0      0   0  4   0    0      0     0   0  ... -0.285357       NaN   
3      0   0      0   0  2   0    0      0     0   0  ... -0.285357 -3.439015   
4      0   0      0   0  4   0    0      0     0   0  ...       NaN -0.663410   
...   ..  ..    ...  .. ..  ..  ...    ...   ...  ..  ...       ...       ...   
4559   0   0      0   0  2   0    0      0     0   0  ...       NaN       NaN   
4560   0   0      0   0  0   0    0      0     0   0  ... -0.285357 -1.797079   
4561   0   0      0   0  2   0    0      0     0   0  ... -0.285357       NaN   
4562   0   0      0   0  3   0    0      0     0   0  ... -0.285357 -0.244334   
4563   0   0      0   0  1   0    0      0     0   0  ... -0.285357 -1.301055   

      Psi_i_1d       SdO   

## Understanding data

In [95]:
def explain_data():
    df = pd.read_csv("biodegradable_a.csv")

    print(df.shape)
    
    n_incomplete_records = df[df.isna().any(axis=1)].shape[0]
    print("Number of rows with at least one missing value: " + str(n_incomplete_records))
    
    for col in df:
        n_missing_entries = df[df[col].isna()].shape[0]
        print(f"Column {str(col).ljust(14)} has {str(n_missing_entries).rjust(4)} missing values ({str(round(100*n_missing_entries/df.shape[0]))}%)")

    features = [col for col in df if col != "Biodegradable"]

    categorical = [col for col in df.drop(["Biodegradable"], axis=1) if df[col].apply(lambda x: x.is_integer()).all()]
    print(f"Categorical: {categorical}")
    
    continuous = [col for col in df.drop(["Biodegradable"], axis=1) if df[col].apply(lambda x: not x.is_integer()).any()]
    print(f"Continuous: {continuous}")
    
    binary = [col for col in categorical if df[col].apply(lambda x: x==0 or x==1).all()]
    print(f"Binary: {binary}")
    
    multicategoric = [col for col in categorical if df[col].apply(lambda x: x!=0 and x!=1).any()]
    print(f"Multicategoric: {multicategoric}")
    
    df = df.dropna()
    
    x = np.array(df.drop(["SpMax_B", "Biodegradable"], axis=1))
    y = np.array(df["Biodegradable"])
    
    
    mis = mutual_info_classif(x, y, random_state=1)
    for (x, mi) in zip(features, mis):
        print(f"{x}: {mi}")
    
    print(mis)
    print(df["nN_N"])
    print(df["nN_N"].cumsum())
    
explain_data()

(4564, 42)
Number of rows with at least one missing value: 3675
Column SpMax_L        has    0 missing values (0%)
Column J_Dz(e)        has    0 missing values (0%)
Column nHM            has    0 missing values (0%)
Column F01            has  515 missing values (11%)
Column F04            has    0 missing values (0%)
Column NssssC         has    0 missing values (0%)
Column nCb            has    0 missing values (0%)
Column C              has  767 missing values (17%)
Column nCp            has  671 missing values (15%)
Column nO             has    0 missing values (0%)
Column F03            has    0 missing values (0%)
Column SdssC          has    0 missing values (0%)
Column HyWi_B         has  479 missing values (10%)
Column LOC            has    0 missing values (0%)
Column SM6_L          has    0 missing values (0%)
Column F03_CO         has   43 missing values (1%)
Column Me             has  448 missing values (10%)
Column Mi             has    0 missing values (0%)
Column nN_N  

## Feature selection

In [68]:
def select_features_using_regularized_model(x: pd.DataFrame, y: np.ndarray) -> pd.DataFrame:
    x = x.dropna()
    y = y[x.index.values]
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(x.to_numpy(), y)
    model = SelectFromModel(lsvc, prefit=True)
    headers = [col for col, was_selected in zip(x.columns, model.get_feature_names_out(x.columns)) if was_selected]
    x_selected = model.transform(x.to_numpy())
    return pd.DataFrame(data=x_selected, columns=headers)


def select_features_using_random_forest(x: pd.DataFrame, y: np.ndarray) -> pd.DataFrame:
    x = x.dropna()
    y = y[x.index.values]
    clf = ExtraTreesClassifier(n_estimators=50)
    clf = clf.fit(x.to_numpy(), y)
    # print(clf.feature_importances_)
    model = SelectFromModel(clf, prefit=True)
    headers = [col for col, was_selected in zip(x.columns, model.get_feature_names_out(x.columns)) if was_selected]
    x_selected = model.transform(x.to_numpy())
    return pd.DataFrame(data=x_selected, columns=headers)
    
    
x, y = preprocess(pd.read_csv("biodegradable_a.csv"))
selected_features = select_features_using_regularized_model(x, y)
print("Selected features using regularized model:")
print(selected_features.columns)
selected_features = select_features_using_random_forest(x, y)
print("Selected features using regularized model")
print(selected_features.columns)

selected_features = ['nHM', 'F04', 'NssssC', 'nCb', 'nO', 'F03', 'nN_N', 'nArNO2', 'nCRX3', 'B01', 'B03', 'N_073', 'B04', 'C_026', 'F02_CN', 'nHDon', 'nN']
df = pd.read_csv("biodegradable_a.csv")
print(df.shape)
print(df[selected_features].dropna().shape)

Selected features using regularized model:
Index(['nHM', 'F04', 'NssssC', 'nCb', 'nO', 'F03', 'nN_N', 'nArNO2', 'nCRX3',
       'B01'],
      dtype='object')
Selected features using regularized model
Index(['nHM', 'F04', 'NssssC', 'nCb', 'nO', 'F03', 'nN_N', 'nArNO2', 'nCRX3',
       'B01', 'B03', 'N_073', 'B04'],
      dtype='object')
(4564, 42)
(4564, 17)


## Enconding multi categorical features

In [92]:
def encode_data():
    df = pd.read_csv("biodegradable_a.csv")
    x = np.array(df.drop(["Biodegradable"], axis=1))
    y = np.array(df["Biodegradable"])
    
    encoder = OneHotEncoder(handle_unknown="ignore")
    encoder.fit(x, y)
    

encode_data()