In [1]:
import os
import re
import pandas as pd
import numpy as np
from utils.myutils import save_object, load_object
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

In [8]:
min_records_per_class = 50
max_upsampling_factor = 20
save_models = False
max_sample = None
only_train_on_entire_set = False #Works only with previously stored confidence_threshold

In [3]:
def balance_training(df, opt_params=False):
    # Separate majority and minority classes
    df_majority = df[df.Label==False]
    df_minority = df[df.Label==True]
    
    if df_majority.shape[0] < min_records_per_class or df_minority.shape[0] < min_records_per_class:
        return None
    
    if df_majority.shape[0] == df_minority.shape[0]:
        return df
    
    if not opt_params:
        print("Resampling (" + str(df_majority.shape[0]) + "," + str(df_minority.shape[0]) + ")")
    
    #Will probably not occur but who knows...
    if df_majority.shape[0] < df_minority.shape[0]:
        print("Strange Format!")
        df_majority = df[df.Label==True]
        df_minority = df[df.Label==False]
    
    n_samples_new = df_majority.shape[0]
    if n_samples_new > (df_minority.shape[0] * max_upsampling_factor):
        n_samples_new = df_minority.shape[0] * max_upsampling_factor
        
    # Upsample minority class
    df_minority_resampled = resample(df_minority, 
                                     replace=True,            # sample with replacement
                                     n_samples=n_samples_new, # to match majority class
                                     random_state=123)        # reproducible results
    
    #Maximum Upsampling reached - Downsampling the rest
    if not df_majority.shape[0] == df_minority_resampled.shape[0]:
        df_majority_resampled = resample(df_majority, 
                                         replace=True,                             # sample with replacement
                                         n_samples=df_minority_resampled.shape[0], # to match minority class
                                         random_state=123)                         # reproducible results
    else:
        df_majority_resampled = df_majority

    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_majority_resampled, df_minority_resampled])

    # Display new class counts
    #print(df_upsampled.Label.value_counts())
    if not opt_params:
        print("Resampled to (" + str(df_majority_resampled.shape[0]) + "," + str(df_minority_resampled.shape[0]) + ")")
    return df_upsampled


In [4]:
def print_performance( labels_actual, labels_predicted ):
    accur = accuracy_score(labels_actual, labels_predicted)
    conf_matrix = confusion_matrix(labels_actual, labels_predicted)
    class_report = classification_report(labels_actual, labels_predicted)
    
    print("Accuracy: " + str( accur ))
    print()
    try:
        print("             Predicted False \t Predicted True")
        print("Actual False " + str( conf_matrix[0][0] ) + "  \t\t " + str( conf_matrix[0][1] ))
        print("Actual True  " + str( conf_matrix[1][0] ) + "  \t\t " + str( conf_matrix[1][1] ))
    except IndexError:
        print('WARNING: Just a single class has been found!')
        print(conf_matrix)
    print()
    print(class_report)


In [5]:
def valid_list(list_to_check):
    count_true = 0
    count_false = 0
    #Count True
    for elem in list_to_check:
        if elem == True:
            count_true += 1
        if count_true >= min_records_per_class:
            for elem in list_to_check:
                if elem == False:
                    count_false += 1
                if count_false >= min_records_per_class:
                    return True
            return False
    return False

In [9]:
directory = os.fsencode("data")
directory_name = directory.decode("utf-8")

#Paramter
opt_params = False

opt_params_relevant = ['elevation', 'height', 'numberOfEpisodes', 'numberOfStudents', 'populationDensity', 'populationTotal']
opt_params_ddict = dict()

param_grid = { 'criterion': ['gini','entropy'],
               'max_depth': [20,40,100,None],
               'min_samples_split': [10,20,40,80,160],
               'min_samples_leaf': [1,5,20,100],
               'min_impurity_decrease': [0.0, 0.000001, 0.01] }

#Random Forest & Extra Trees
param_grid = { 'n_estimators': [20,10],
               'criterion': ['gini','entropy'],
               'max_depth': [20,80,None],
               'min_samples_split': [2,8,20,100],
               'min_samples_leaf': [1,5,20,50],
               'min_impurity_decrease': [0.0, 0.000001, 0.01] }

#Bagging
param_grid = { 'n_estimators': [20,10],
               'max_features': [0.1,0.2,0.5,0.8,1.0],
               'max_samples': [0.1,0.2,0.5,0.8,1.0],
               'bootstrap': [True, False],
               'bootstrap_features': [True, False] }

#Adaboost
param_grid = { 'n_estimators': [20,50,100,200],
               'learning_rate': [0.001,0.01,0.1,0.2,0.4,0.6,0.8,1.0] }

param_grid = { 'n_estimators': [2,24,48,64,72,96,128,192] }

best_avg_precision = 0.0
precisions = list()

for params in ParameterGrid(param_grid):
  
    if opt_params:
        print('----------')
        print('Parameters: ' + str( params ))

    for file in os.listdir(directory):
        file_name = os.fsdecode(file)
        if not file_name.endswith(".csv"):
            continue
        
        
        #Only optimize on interesting relations
        if opt_params and file_name[:-4] not in opt_params_relevant:
            continue
        
        print('---' + file_name + '---')
        
        if only_train_on_entire_set:
            try:
                load_object('models/' + file_name[:-4] + '_confidence')
            except FileNotFoundError:
                print("No confidence threshold for " + file_name)
                continue
        
        if not opt_params or file_name not in opt_params_ddict:

            try:
                dtypes = load_object('data_info/' + file_name[:-4] + '_dtypes')
                if type(dtypes) != dict:
                    dtypes = dtypes.to_dict()
            except (FileNotFoundError, AttributeError):
                print('WARNING! No dtype information available!')
                dtypes = None

            data = pd.read_csv(directory_name + "/" + file_name,
                               encoding = "utf-8",
                               dtype = dtypes,
                               sep = ',')
            opt_params_ddict[file_name] = data
            
        else:
            data = opt_params_ddict[file_name]

        
        
        #Drop Info-Features
        info_features = [col for col in list(data.columns) if col.startswith('Info')]
        data_info = data[info_features]
        data = data.drop(info_features, axis=1)
        
        #Some algorithms have problems with certain characters in feature names
        data.columns = [re.sub('[\[\]<>]', 'X', elem) for elem in data.columns]
        
        #TODO!!
        #data['StandardDeviationFactor'] = data['StandardDeviationFactor'].apply(abs)        
        
        if(data.shape[0] == 0):
            if not opt_params:
                print("No data")
            continue

        if not opt_params:
            prt_true_ins = data[data["Label"] == True].shape[0]
            prt_all_ins = data.shape[0]
            print(str(prt_true_ins) + " True instances (" + str(prt_all_ins) + " in total -> " + str(prt_true_ins / prt_all_ins * 100) + " %)")
        
        #print(str(data[data["Label"] == True].shape[0] / data.shape[0]) + " %")

        if not only_train_on_entire_set:
            try:
                train_test_data = train_test_split(data, test_size=0.25, random_state=123, stratify=data['Label'])
            except ValueError:
                train_test_data = train_test_split(data, test_size=0.25, random_state=123)
                print('ValueError while splitting train/test data! No stratified sampling applied!')
            train_data = train_test_data[0]
            test_data = train_test_data[1]
        else:
            train_data = data
        
        #print(str(train_data[train_data["Label"] == True].shape[0] / train_data.shape[0]) + " %")
        #print(str(test_data[test_data["Label"] == True].shape[0] / test_data.shape[0]) + " %")
        
        train_data = balance_training(train_data, opt_params)
        if(train_data is None):
            if not opt_params:
                print("Not enough data")
            continue
            
        if max_sample is not None and train_data.shape[0] > max_sample:
            train_data = train_data.sample(n=max_sample, random_state=123)
            print("Sampled train to " + str(max_sample))

        train_labels = train_data["Label"].tolist()
        train_data = train_data.drop("Label", axis=1)
        if not only_train_on_entire_set:
            test_labels = test_data["Label"].tolist()
            test_data = test_data.drop("Label", axis=1)
            
            if not valid_list(test_labels):
                print('Not enough data')
                continue
        
        
        #Normalize Features to mean 0 and variance 1
        #scaler = StandardScaler()
        #scaler.fit(train_data)  # Don't cheat - fit only on training data
        #train_data = scaler.transform(train_data)
        #test_data = scaler.transform(test_data)  # apply same transformation to test data

        #Old: {'criterion': 'gini', 'max_depth': 1000, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 5, 'min_samples_split': 10}
        base_learner = DecisionTreeClassifier(criterion='entropy', max_depth=80, min_impurity_decrease=1e-06, min_samples_leaf=1, min_samples_split=10, random_state=123)
        #learner = base_learner
        
        #learner = SGDClassifier(loss='log', max_iter=np.ceil(10**6 / len(train_labels)), random_state=123)
        #learner = MultinomialNB()
        #learner = SVC(C=0.8, cache_size=4000, probability=True)
        learner = RandomForestClassifier(n_estimators=96, criterion='gini', max_depth=80, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, random_state=123)
        #learner = ExtraTreesClassifier(n_estimators=20, criterion='entropy', max_depth=80, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=8, random_state=123)
        #learner = BaggingClassifier(base_learner, bootstrap=False, bootstrap_features=False, max_samples=1.0, max_features=0.5, n_estimators=20, random_state=123)
        #learner = AdaBoostClassifier(n_estimators=200, random_state=123)
        #learner = XGBClassifier(n_estimators=2000, learning_rate=0.4, max_depth=20, min_child_weight=1, gamma=0.15, subsample=0.95, colsample_bytree=0.65, scale_pos_weight=1, reg_lambda=0.9, reg_alpha=1e-5, random_state=123)
        
        if opt_params:
            learner.set_params(**params)
        learner = learner.fit(train_data, train_labels)

        
        if only_train_on_entire_set:
            save_object(learner, 'models/' + file_name[:-4]) #Strip ".csv"
            save_object(train_data.columns, 'models/' + file_name[:-4] + '_features')
            print('Successfully stored new model and columns.')
        
        #feature_importances_dict = dict()
        #feature_importances = learner.feature_importances_
        #
        #for i, fi in enumerate(feature_importances):
        #    feature_importances_dict[data.columns[i]] = fi
        # 
        #print(sorted( ((v,k) for k,v in feature_importances_dict.items()), reverse=True))
        
        #Parameter
        min_precision = 0.95
        
        if opt_params:
            save_models = False
            min_precision = 0.0

        test_confidences = learner.predict_proba(test_data)

        zipped = list(zip(test_confidences, test_labels))
        if not opt_params:
            print("Length of Test: " + str(len(zipped)))

        confidence_threshold = 0.5
        first = True
        while True:
            #TODO: Sollte man nur die "Wahren entfernen"?
            #zipped = [example for example in zipped if example[0][0] >= confidence_threshold or example[0][1] >= confidence_threshold]

            
            #(Prediction, Actual)
            #TODO: Hier war vorher >0.5 anstatt >= confidence_threshold
            zipped_predictions = [((example[0][1] >= confidence_threshold), example[1]) for example in zipped]
            
            test_labels = [example[1] for example in zipped_predictions]
            test_labels_predicted = [example[0] for example in zipped_predictions]
            
            current_precision = precision_score(test_labels, test_labels_predicted)

            if opt_params:
                precisions.append(current_precision)
                break
            
            print("For confidence " + str(confidence_threshold) + " reduced to " + str(len(zipped)) + "(" + \
                  str(len([l for l in test_labels if l == True])) + "," + str(len([l for l in test_labels if l == False])) + \
                  "). Precision: " + str(current_precision) + " / Recall: " + str(recall_score(test_labels, test_labels_predicted)))
    
            #Not enough instances left any more
            #TODO: Hier war vorher test_labels anstatt test_labels_predicted
            if not valid_list(test_labels_predicted):
                print("No model could be found! (Minimum number of true examples reached)")
                #print("Best Precision: " + str(precision_score(test_labels, test_labels_predicted)))
                break
        
            if current_precision >= min_precision:
                print("Model found at confidence " + str(confidence_threshold))
                print_performance(test_labels, test_labels_predicted)
                if save_models:
                    save_object(learner, 'models/' + file_name[:-4]) #Strip ".csv"
                    save_object(confidence_threshold, 'models/' + file_name[:-4] + '_confidence')
                    save_object(train_data.columns, 'models/' + file_name[:-4] + '_features')
                    print('Successfully stored model, confidence and columns.')
                break
            
            #Even removing on 1.0 is still not sufficient
            if confidence_threshold >= 1.0:
                print("No model could be found! (Confidence 1.0 reached)")
                #print("Best Precision: " + str(precision_score(test_labels, test_labels_predicted)))
                break

            first = False
            confidence_threshold = round((confidence_threshold + 0.01), 2)

    if opt_params:
        
        #running_time = ((time.clock() - start)*1000)
        #print("Time: " + str(running_time))
        
        if len(precisions) > 0:
            print(str(precisions))
            avg_precision = np.mean(np.array(precisions))
            if avg_precision > best_avg_precision:
                print('New Optimum found:' + str(avg_precision))
                print(str(params))
                best_avg_precision = avg_precision

        precisions = list()
    else:
        break
    
    print('-----')



---area.csv---
96 True instances (25966 in total -> 0.3697142417006855 %)
Resampling (19402,72)
Resampled to (1440,1440)
Length of Test: 6492
For confidence 0.5 reduced to 6492. Precision: 0.078125
No model could be found! (Minimum number of true examples reached)
---areaLand.csv---
2 True instances (131097 in total -> 0.001525587923445998 %)
Not enough data
---areaTotal.csv---
92 True instances (410292 in total -> 0.022423054799996102 %)
Resampling (307650,69)
Resampled to (1380,1380)
Length of Test: 102573
For confidence 0.5 reduced to 102573. Precision: 0.020021074815595362
No model could be found! (Minimum number of true examples reached)
---areaWater.csv---
83 True instances (125117 in total -> 0.0663379077183756 %)
Resampling (93775,62)
Resampled to (1240,1240)
Length of Test: 31280
For confidence 0.5 reduced to 31280. Precision: 0.010626992561105207
No model could be found! (Minimum number of true examples reached)
---elevation.csv---
18576 True instances (398376 in total -> 4.6

Resampling (36623,670)
Resampled to (13400,13400)
Sampled train to 20000
Length of Test: 12432
For confidence 0.5 reduced to 12432. Precision: 0.22365591397849463
Model found at confidence 0.5
Accuracy: 0.9613095238095238

             Predicted False 	 Predicted True
Actual False 11847  		 361
Actual True  120  		 104

              precision    recall  f1-score   support

       False       0.99      0.97      0.98     12208
        True       0.22      0.46      0.30       224

   micro avg       0.96      0.96      0.96     12432
   macro avg       0.61      0.72      0.64     12432
weighted avg       0.98      0.96      0.97     12432

---shipBeam.csv---
279 True instances (12540 in total -> 2.2248803827751193 %)
Resampling (9196,209)
Resampled to (4180,4180)
Length of Test: 3135
For confidence 0.5 reduced to 3135. Precision: 0.46808510638297873
Model found at confidence 0.5
Accuracy: 0.9757575757575757

             Predicted False 	 Predicted True
Actual False 3015  		 50
Actual