In [180]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.decomposition import PCA


class DataTransformer():
    def __init__(self, val_size=0.25,
                 encodeTarget=False,
                 dummies=False,
                 scaler="standard",
                 imputer="simple"):
        
        # self.group_dictionary = pd.read_csv('data/group_dictionary.csv', sep=';')
        self.test_data_no_target = pd.read_csv('data/test_data_no_target.csv', sep=';', decimal=',')
        self.training_data = pd.read_csv('data/training_data.csv', sep=';', decimal=',')
        # self.column_names_dictionary = pd.read_csv('data/column_names_dictionary.csv', sep=';')

        if encodeTarget:
            self.labelEncodeTarget()

        if dummies:
            self.makeDummies()
        
        self.X = self.training_data.drop(columns=['Class','Perform'])  
        self.y = self.training_data[['Class']]


        # collect cat_cols, bin_cols, num_cols
        # num_cols are then used for scaling
        # cat_cols are gonna be encoded
        self.cat_cols = []
        self.bin_cols = []
        self.num_cols = []
        for col in self.X.columns.tolist():
            if len(self.X[col].value_counts()) > 2 and len(self.X[col].value_counts())<10:
                self.cat_cols.append(col)
            elif len(self.X[col].value_counts()) == 2:
                self.bin_cols.append(col)
            else:
                self.num_cols.append(col)

    
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(self.X, self.y, test_size=val_size, random_state=42)

        if imputer == "simple":
            self.simpleImpute(0)
        
        if scaler == "standard":
            self.standardScale()
        
        elif scaler == "minmax":
            self.minMaxScale()

        

    
    def get_X_train(self):
        return self.X_train

    
    def get_y_train(self):
        return self.y_train

    
    def get_X_val(self):
        return self.X_val

    
    def get_y_val(self):
        return self.y_val

    
    def makeDummies(self):
        self.training_data = pd.concat((self.training_data.drop(columns=['Group']), pd.get_dummies(self.training_data['Group'])), axis=1)
        self.test_data_no_target = pd.concat((self.test_data_no_target.drop(columns=['Group']), pd.get_dummies(self.test_data_no_target['Group'])), axis=1)
        

    def knnImpute(self):
        self.knnImputer = KNNImputer(n_neighbors=4)
        self.X_train[self.num_cols] = self.knnImputer.fit_transform(self.X_train[self.num_cols])
        self.X_val[self.num_cols] = self.knnImputer.transform(self.X_val[self.num_cols])
        self.test_data_no_target[self.num_cols] = self.knnImputer.transform(self.test_data_no_target[self.num_cols])


    def simpleImpute(self, value):
        self.simpleImputer = SimpleImputer(strategy='mean')
        self.X_train[self.num_cols] = self.simpleImputer.fit_transform(self.X_train[self.num_cols])
        self.X_val[self.num_cols] = self.simpleImputer.transform(self.X_val[self.num_cols])
        self.test_data_no_target[self.num_cols] = self.simpleImputer.transform(self.test_data_no_target[self.num_cols])


    def standardScale(self):
        self.stdScaler = StandardScaler()
        self.X_train[self.num_cols] = self.stdScaler.fit_transform(self.X_train[self.num_cols])
        self.X_val[self.num_cols] = self.stdScaler.transform(self.X_val[self.num_cols])
        self.test_data_no_target[self.num_cols] = self.stdScaler.transform(self.test_data_no_target[self.num_cols])
        

    def minMaxScale(self):
        self.minMaxScale = MinMaxScaler()
        self.X_train[self.num_cols] = self.minMaxScale.fit_transform(self.X_train[self.num_cols])
        self.X_val[self.num_cols] = self.minMaxScale.transform(self.X_val[self.num_cols])
        self.test_data_no_target[self.num_cols] = self.minMaxScale.transform(self.test_data_no_target[self.num_cols])

    
    def PCASelection(self):
        pass

    
    def labelEncodeTarget(self):
        # label encodes target to be 0, 1, 2
        self.target_encoder = LabelEncoder()
        self.training_data['Class'] = self.training_data[['Class']].apply(self.target_encoder.fit_transform)
        

    def labelDecodeTarget(self, data):
        # returns python list decoding back to -1, 0, 1
        return self.target_encoder.inverse_transform(data.ravel()).tolist()

        



class Modelling:
    def __init__(self):
        pass

    def randomForest():
        pass
        
    def catBoost():
        pass


    def get_leaderboard():
        pass

        
        
        

In [181]:
data = DataTransformer(encodeTarget=False, dummies=True)

In [182]:
X_train = data.get_X_train()
X_val = data.get_X_val()
y_train = data.get_y_train()
y_val = data.get_y_val()


In [185]:
import cost_function

print(cost_function.evaluate_error(y_pred,y_val))

0.8825


In [187]:
class CountErrors:
    '''Count number of wrong predictions'''
    
    def is_max_optimal(self):
        False # Lower is better

    def evaluate(self, approxes, target, weight):  
        y_pred = np.array(approxes).argmax(0)
        y_true = np.array(target)
        cost_matrix = np.array([[0, 1, 2], 
                                [1, 0, 1], 
                                [2, 1, 0]])

        conf_matrix = confusion_matrix(y_true, y_pred)
        loss = np.sum(conf_matrix * cost_matrix) / len(y_true)
                                    
        return loss, 1

    def get_final_error(self, error, weight):
        return error

model = CatBoostClassifier(cat_features=data.cat_cols,
                           metric_period=50,
                           n_estimators=500,
                           eval_metric=CountErrors()
                          )

model.fit(X_train, y_train, eval_set=(X_val, y_val))

Learning rate set to 0.145843


Failed in nopython mode pipeline (step: nopython frontend)
Untyped global name 'confusion_matrix': Cannot determine Numba type of <class 'function'>

File "../../../var/folders/7y/57xm51hj4y11r4hg1s330bth0000gn/T/ipykernel_20985/3983683262.py", line 14:
<source missing, REPL/exec in use?>

  self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)


0:	learn: 0.8896667	test: 0.9360000	best: 0.9360000 (0)	total: 167ms	remaining: 1m 23s
50:	learn: 0.6375000	test: 0.8825000	best: 0.8825000 (50)	total: 1.16s	remaining: 10.2s
100:	learn: 0.5075000	test: 0.9010000	best: 0.8825000 (50)	total: 2.12s	remaining: 8.39s
150:	learn: 0.4105000	test: 0.9010000	best: 0.8825000 (50)	total: 3.03s	remaining: 7.01s
200:	learn: 0.3098333	test: 0.9045000	best: 0.8825000 (50)	total: 3.92s	remaining: 5.84s
250:	learn: 0.2436667	test: 0.8990000	best: 0.8825000 (50)	total: 4.83s	remaining: 4.8s
300:	learn: 0.1845000	test: 0.9110000	best: 0.8825000 (50)	total: 5.8s	remaining: 3.83s
350:	learn: 0.1435000	test: 0.9130000	best: 0.8825000 (50)	total: 6.79s	remaining: 2.88s
400:	learn: 0.1093333	test: 0.9000000	best: 0.8825000 (50)	total: 7.75s	remaining: 1.91s
450:	learn: 0.0813333	test: 0.8990000	best: 0.8825000 (50)	total: 8.73s	remaining: 949ms
499:	learn: 0.0536667	test: 0.9030000	best: 0.8825000 (50)	total: 9.69s	remaining: 0us

bestTest = 0.8825
bestItera

<catboost.core.CatBoostClassifier at 0x7fa3242cf490>

In [188]:
preds = model.predict(data.test_data_no_target)

In [189]:
preds

array([[ 1],
       [ 1],
       [-1],
       ...,
       [ 1],
       [-1],
       [-1]])

In [192]:
preds.reshape(2000)

array([ 1,  1, -1, ...,  1, -1, -1])

In [193]:
def submit(path, predictions):
    # writes submition file
    with open(path, 'w') as f:
        for prediction in predictions.tolist():
            f.write(f"{prediction}\n")  # Writing each prediction on a new line
    
    print("Submission file created:", path)
    return True


submit("submitions/catboost_tryout.txt", preds.reshape(2000))

Submission file created: submitions/catboost_tryout.txt


True

In [177]:
data.test_data_no_target

Unnamed: 0,Group,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,dI49,dI50,dI51,dI52,dI53,dI54,dI55,dI56,dI57,dI58
0,G3,0.039284,-0.034382,-0.040373,0.090023,-0.055953,0.049229,0.016236,-0.045865,0.692039,...,-0.377892,-0.410908,-0.417438,-0.220481,0.002526,0.124543,-0.219341,0.148324,0.020379,-0.004969
1,G9,-0.491713,-0.039484,-0.043612,-0.320312,0.065520,-0.216314,-0.366573,-0.052790,-0.370449,...,0.553799,0.660505,0.806000,0.781506,-0.012519,-0.014169,0.669341,0.015033,0.023411,0.007037
2,G7,-0.744979,-0.038271,-0.038631,-0.824021,0.117907,-1.434879,-0.707810,-0.073176,-1.698337,...,0.707252,0.909110,0.738062,-0.024403,0.032079,0.095528,0.350733,-0.149682,0.005134,1.135389
3,G2,0.694507,-0.026038,-0.038985,1.892921,-0.100102,0.022436,-0.198048,-0.050572,0.208738,...,-0.120561,-0.118927,-0.130250,-0.000468,0.002082,0.023541,-0.044464,0.033332,0.079054,-0.002624
4,G2,0.207123,-0.020462,-0.033451,-0.010191,-0.146812,0.235521,0.044640,-0.029062,0.464286,...,-0.672086,-0.679972,-1.136937,-0.789870,0.007839,-0.424394,-0.704341,0.011889,0.936284,-0.015745
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,G9,-1.192924,-0.049079,-0.049806,-0.513692,-0.081669,-0.334987,-0.484038,-0.063376,-0.860113,...,0.280022,0.197899,0.425563,0.146857,0.007887,0.144111,0.556732,0.809250,0.040464,-0.000838
1996,G11,0.761006,-0.028790,-0.040586,-0.098537,0.334470,0.177993,0.733065,-0.032186,2.498567,...,-0.140446,,-0.044562,0.508117,0.007100,0.492209,0.634159,-0.389522,0.030106,0.019739
1997,G4,0.663620,-0.014721,-0.035249,2.367521,0.035822,0.437514,0.241244,-0.033637,0.475282,...,-0.562151,-0.694239,-0.974187,0.052026,0.026136,0.287294,-0.278286,-0.283358,-0.005308,0.001944
1998,G1,-0.887896,-0.031149,-0.044787,,3.235044,9.613638,10.032149,0.096885,0.049668,...,,,,,-0.117762,0.038035,-0.140377,-1.110764,,0.000807


In [None]:
preds = model.predict()

In [164]:
import numpy as np

model = CatBoostClassifier(cat_features=data.cat_cols,
                           eval_metric=CountErrors(),
                           loss_function='MultiClass',
                           early_stopping_rounds = 100,
                           class_weights = [2, 1, 2]
                           )

PARAMS = {
    "metric_period": [50, 60],
    "max_depth": np.arange(4, 20, 1),
    # "subsample": [3, 5, 7]
    "l2_leaf_reg": np.arange(0.1, 1, 0.02),  # np.arange(0.1, 1, 0.05),
    
}
catboost.grid_search(PARAMS, X_train, y_train, cv=5, plot=True, refit=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Failed in nopython mode pipeline (step: nopython frontend)
Untyped global name 'confusion_matrix': Cannot determine Numba type of <class 'function'>

File "../../../var/folders/7y/57xm51hj4y11r4hg1s330bth0000gn/T/ipykernel_20985/257965346.py", line 14:
<source missing, REPL/exec in use?>

  cv_result = self._object._tune_hyperparams(


0:	learn: 0.8866667	test: 0.9108333	best: 0.9108333 (0)	total: 109ms	remaining: 1m 49s
50:	learn: 0.8387500	test: 0.8991667	best: 0.8991667 (50)	total: 542ms	remaining: 10.1s
100:	learn: 0.7912500	test: 0.9091667	best: 0.8991667 (50)	total: 930ms	remaining: 8.27s
150:	learn: 0.7658333	test: 0.9025000	best: 0.8991667 (50)	total: 1.32s	remaining: 7.41s
200:	learn: 0.7279167	test: 0.9108333	best: 0.8991667 (50)	total: 1.71s	remaining: 6.8s
250:	learn: 0.7079167	test: 0.8958333	best: 0.8958333 (250)	total: 2.08s	remaining: 6.21s
300:	learn: 0.6779167	test: 0.8975000	best: 0.8958333 (250)	total: 2.46s	remaining: 5.71s
350:	learn: 0.6504167	test: 0.8991667	best: 0.8958333 (250)	total: 2.83s	remaining: 5.24s
400:	learn: 0.6322917	test: 0.8891667	best: 0.8891667 (400)	total: 3.21s	remaining: 4.79s
450:	learn: 0.6114583	test: 0.9008333	best: 0.8891667 (400)	total: 3.59s	remaining: 4.37s
500:	learn: 0.5881250	test: 0.8925000	best: 0.8891667 (400)	total: 3.99s	remaining: 3.97s
550:	learn: 0.56625

KeyboardInterrupt: 

In [163]:
from sklearn.utils.class_weight import compute_class_weight
 
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train.values), y=y_train.values)
# class_weights = dict(zip(classes, weights))

TypeError: unhashable type: 'numpy.ndarray'

In [161]:
y_train

Unnamed: 0,Class
3836,1
6408,1
4840,1
527,1
6105,1
...,...
5226,1
5390,-1
860,-1
7603,-1


In [168]:
baseline = print(cost_function.evaluate_error([0] * len(y_val),y_val))

0.8655


Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,G10,G11,G2,G3,G4,G5,G6,G7,G8,G9
0,-0.039702,-0.110934,-0.093235,0.089020,-0.048738,0.023660,-0.006359,-0.092094,0.727864,-0.105462,...,0,0,0,1,0,0,0,0,0,0
1,-0.626151,-0.138503,-0.109433,-0.351719,0.072964,-0.250281,-0.393670,-0.123525,-0.311781,-0.125845,...,0,0,0,0,0,0,0,0,0,1
2,-0.905865,-0.131948,-0.084526,-0.892751,0.125449,-1.507389,-0.738919,-0.216046,-1.611120,-0.143551,...,0,0,0,0,0,0,0,1,0,0
3,0.683946,-0.065844,-0.086297,2.025506,-0.092970,-0.003980,-0.223163,-0.113458,0.254954,-0.038914,...,0,0,1,0,0,0,0,0,0,0
4,0.145665,-0.035710,-0.058622,-0.018620,-0.139768,0.215845,0.022379,-0.015832,0.505007,-0.041426,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-1.400589,-0.190351,-0.140412,-0.559429,-0.074503,-0.372709,-0.512515,-0.171568,-0.790917,-0.179268,...,0,0,0,0,0,0,0,0,0,1
1996,0.757389,-0.080713,-0.094304,-0.113512,0.342420,0.156497,0.718898,-0.030013,2.495552,-0.013698,...,0,1,0,0,0,0,0,0,0,0
1997,0.649833,-0.004686,-0.067609,2.535272,0.043210,0.424227,0.221294,-0.036597,0.515767,-0.026384,...,0,0,0,0,1,0,0,0,0,0
1998,-1.063707,-0.093459,-0.115311,0.000000,3.248457,9.890585,10.127324,0.555772,0.099303,-0.070779,...,0,0,0,0,0,0,0,0,0,0
