In [1]:
import pandas as pd
import numpy as np
from preprocess import *
from regression import *

from sklearn.metrics import r2_score 

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn import svm
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw train/test tables.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# The training frame drops its 'Id' column here; the test frame keeps 'Id'
# until it is dropped inline just before encoding.
df_train = df_train.drop('Id', axis=1)

# Interaction features: each new column is the element-wise product of the
# listed source columns, multiplied left to right. Dict order fixes the order
# in which the columns are appended to both frames.
INTERACTION_FEATURES = {
    "MW_NumOfO_NumOfN": ("MW", "NumOfO", "NumOfN"),
    "MW_NumOfO": ("MW", "NumOfO"),
    "MW_NumOfN": ("MW", "NumOfN"),
    "NumOfAtoms_NumOfC": ("NumOfAtoms", "NumOfC"),
    "NumHBondDonors_hydroxyl..alkyl.": ("NumHBondDonors", "hydroxyl..alkyl."),
}
for new_col, factors in INTERACTION_FEATURES.items():
    for frame in (df_train, df_test):
        product = frame[factors[0]]
        for factor in factors[1:]:
            product = product * frame[factor]
        frame[new_col] = product

# Fit the encoder on the training data and reuse it for the test data.
edf_train, enc = generate_encoding(df_train)
edf_test = apply_encoding(df_test.drop('Id', axis=1), enc)

# Columns handed to normalize_dataset_2 for both the train and the test matrix.
NORMALIZED_COLUMNS = (
    "MW",
    "MW_NumOfO_NumOfN",
    "MW_NumOfO",
    "MW_NumOfN",
    "NumOfConf",
    "NumOfConfUsed",
    "NumOfAtoms",
    "NumOfC",
    "NumOfO",
    "NumHBondDonors",
    "carbonylperoxyacid",
    "NumOfN",
    "ketone",
    "ester",
    "C.C..non.aromatic.",
    "NumOfAtoms_NumOfC",
    "NumHBondDonors_hydroxyl..alkyl.",
)

# Features are every encoded column except the target; the target is
# modelled on a log10 scale.
# A fresh list is passed to each call so the two calls never share one object.
feature_mask = edf_train.columns != 'pSat_Pa'
X_train = normalize_dataset_2(edf_train.loc[:, feature_mask], list(NORMALIZED_COLUMNS))
y_train = np.log10(edf_train.loc[:, 'pSat_Pa'])
X_test = normalize_dataset_2(edf_test, list(NORMALIZED_COLUMNS))

# Cast the 'parentspecies' column to float in both design matrices.
for matrix in (X_train, X_test):
    matrix["parentspecies"] = matrix["parentspecies"].astype(float)


In [3]:
# Random-forest baseline trained on the full training matrix.
regr = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
)
estimator = regr.fit(X_train, y_train)

# Wrap the already-fitted forest in a SelectFromModel and fit the wrapper.
# NOTE(review): predictions below go through `fse.estimator_` and no
# `transform` is ever called, so no feature selection is applied to X_test.
fse = SelectFromModel(estimator, threshold=1e-4, prefit=True).fit(X_train, y_train)
rf_predictions = fse.estimator_.predict(X_test)

In [4]:
# SVR hyper-parameters used for the stand-alone SVR model.
params = {
    'coef0': 0.07764543524762932,
    'tol': 4.218255155043568e-05,
    'C': 6.660061688807354,
    'epsilon': 0.2494711641166569,
}
reggressor = svm.SVR(**params).fit(X_train, y_train)

# Same wrapper pattern as for the forest: the fitted SVR is wrapped in a
# SelectFromModel and predictions are taken from the wrapper's `estimator_`.
# NOTE(review): `transform` is never called, so the threshold has no effect
# on the predictions produced here.
fse = SelectFromModel(
    estimator=reggressor,
    threshold=1e-6,
    prefit=True,
).fit(X_train, y_train)
predict = fse.estimator_.predict(X_test)

In [28]:
class Ensemble:
    """Weighted average of an SVR and a random-forest regressor.

    Both base models are fitted on the same data; `predict` returns
    ``(svr_weight * svr_pred + rf_weight * rf_pred) / (svr_weight + rf_weight)``.

    The class implements the duck-typed scikit-learn estimator protocol
    (`fit`, `predict`, `get_params`, `set_params`) so it can be passed to
    utilities such as `cross_validate`, which rebuild the estimator from
    `get_params()` for every fold.

    Parameters
    ----------
    coef0, tol, C, epsilon : float
        Forwarded unchanged to `svm.SVR`.
    svr_weight, rf_weight : float
        Relative weights of the SVR and random-forest predictions.
    """

    # Default SVR hyper-parameters. Kept as a class attribute for backward
    # compatibility; it is never mutated and is no longer what get_params returns.
    params = {'coef0': 0.07764543524762932, 'tol': 4.218255155043568e-05, 'C': 6.660061688807354, 'epsilon': 0.2494711641166569}

    # Constructor arguments that are forwarded to the SVR.
    _SVR_PARAM_NAMES = ("coef0", "tol", "C", "epsilon")
    # Constructor arguments that control the blending.
    _WEIGHT_PARAM_NAMES = ("svr_weight", "rf_weight")

    def __init__(self, coef0= 0.07764543524762932, tol= 4.218255155043568e-05, C= 6.660061688807354, epsilon= 0.2494711641166569, svr_weight=100, rf_weight=38) -> None:
        # Every constructor argument is stored unchanged under its own name so
        # that get_params() can report it and a clone can be rebuilt from it.
        self.coef0 = coef0
        self.tol = tol
        self.C = C
        self.epsilon = epsilon
        self.rf_weight = rf_weight
        self.svr_weight = svr_weight

        self.regr = RandomForestRegressor(n_estimators=100, n_jobs=-1)
        self.reggressor = svm.SVR(coef0=coef0, tol=tol, C=C, epsilon=epsilon)

    def fit(self, X, y):
        """Fit both base models on (X, y) and return self."""
        self.regr.fit(X, y)
        # NOTE(review): the SelectFromModel wrappers are only used as holders
        # of `estimator_`; `transform` is never called, so no feature
        # selection is applied at prediction time.
        self.fse_rf = SelectFromModel(self.regr, threshold=1e-4, prefit=True)
        self.fse_rf = self.fse_rf.fit(X, y)

        self.reggressor.fit(X, y)
        self.fse_svr = SelectFromModel(self.reggressor, threshold=1e-4, prefit=True)
        self.fse_svr = self.fse_svr.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted average of the SVR and random-forest predictions."""
        svr_predictions = np.asarray(self.fse_svr.estimator_.predict(X))
        rf_predictions = np.asarray(self.fse_rf.estimator_.predict(X))

        weighted_sum = self.svr_weight * svr_predictions + self.rf_weight * rf_predictions
        return weighted_sum / (self.svr_weight + self.rf_weight)

    def set_weights(self, svr, rf):
        """Replace the blending weights of the two base models."""
        self.svr_weight = svr
        self.rf_weight = rf

    def get_params(self, deep=True):
        """Return all constructor arguments as a new dict.

        `deep` is accepted for scikit-learn compatibility and ignored: the
        base models are not exposed as nested parameters.
        """
        names = self._SVR_PARAM_NAMES + self._WEIGHT_PARAM_NAMES
        return {name: getattr(self, name) for name in names}

    def set_params(self, **params):
        """Update constructor arguments by name and return self.

        Raises
        ------
        ValueError
            If a name is not one of the constructor arguments.
        """
        valid_names = self._SVR_PARAM_NAMES + self._WEIGHT_PARAM_NAMES
        for name, value in params.items():
            if name not in valid_names:
                raise ValueError(
                    "Invalid parameter %r for Ensemble; valid parameters are %r"
                    % (name, list(valid_names))
                )
            setattr(self, name, value)
            # Keep the wrapped SVR in sync with its hyper-parameters.
            if name in self._SVR_PARAM_NAMES and hasattr(self, "reggressor"):
                self.reggressor.set_params(**{name: value})
        return self

In [None]:
# Blend the stand-alone SVR and random-forest test predictions with fixed weights.
model_names = ["svr", "rf"]
all_predictions = {"svr": predict, "rf": rf_predictions}
weights = {'svr': 100, 'rf': 38}

# Weighted mean: scale each model's predictions by its weight, add them up
# element-wise, then divide by the total weight.
weighted_terms = [weights[name] * all_predictions[name] for name in model_names]
ensemble_prediction = np.sum(weighted_terms, axis=0) / sum(weights.values())
   

0.7224345 , 0.67875256, 0.64814991, 0.66582685, 0.67518874

In [29]:
# 5-fold cross-validated R^2 of the ensemble with its default settings.
model = Ensemble()
score = cross_validate(
    model,
    X_train,
    y_train,
    scoring="r2",
    cv=5,
    n_jobs=-1,
)
print(score["test_score"])

[0.76972398 0.72350628 0.69422502 0.71195655 0.73105455]


In [32]:
# Per-configuration results: five fold scores followed by their mean.
data = {}

# Weight combinations to compare, keyed by the row label used in the results table.
test = {
    "svr:100, rf:38": {'svr': 100, 'rf': 38},
    "svr:100, rf:67": {'svr': 100, 'rf': 67},
    "svr:100, rf:100": {'svr': 100, 'rf': 100},
    "svr:38, rf:100": {'svr': 38, 'rf': 100},
    "svr:67, rf:100": {'svr': 67, 'rf': 100},
}

# NOTE(review): cross_validate rebuilds the estimator for each fold from
# model.get_params(), so these weights only reach the folds if
# Ensemble.get_params reports svr_weight / rf_weight — verify.
for label, combo in test.items():
    model = Ensemble(svr_weight=combo["svr"], rf_weight=combo["rf"])
    score = cross_validate(model, X_train, y_train, n_jobs=-1, scoring="r2", cv=5)
    fold_scores = score["test_score"]
    result = fold_scores.tolist()
    result.append(np.mean(fold_scores))
    data[label] = result


In [35]:
# One row per weight configuration: the five fold scores plus their average.
score_columns = ["CV_1", "CV_2", "CV_3", "CV_4", "CV_5", "Average"]
df = pd.DataFrame.from_dict(data=data, orient="index", columns=score_columns)

# Not the last expression of the cell, so this preview is not displayed.
df.head()

# Write the table to disk.
df.to_csv("table_ensemble.csv")