In [1]:
import pandas as pd 
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier 
from sklearn.tree import DecisionTreeRegressor 
from sklearn import metrics 
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.ensemble.forest import RandomForestRegressor
import os
import re

In [2]:
# Load the training and evaluation splits; both CSV files are ISO-8859-1 encoded.
train_data_set = pd.read_csv(
    "data/dataset_tae_final_no_na_mod.csv",
    encoding="ISO-8859-1",
)
test_data_set = pd.read_csv(
    "data/test_tae_no_na_mod.csv",
    encoding="ISO-8859-1",
)
# Preview the last rows of the evaluation split.
test_data_set.tail()

Unnamed: 0,age,workclass,fnlwgt,education,marital_status,ocupation,ethnicity,gender,capital_gain,capital_loss,hours_per_week,native_country,income
15054,33,Private,245211,Bachelors,Never-married,Prof-specialty,White,Male,0,0,40,United-States,<=50K.
15055,39,Private,215419,Bachelors,Divorced,Prof-specialty,White,Female,0,0,36,United-States,<=50K.
15056,38,Private,374983,Bachelors,Married,Prof-specialty,White,Male,0,0,50,United-States,<=50K.
15057,44,Private,83891,Bachelors,Divorced,Adm-clerical,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.
15058,35,Self-emp-inc,182148,Bachelors,Married,Exec-managerial,White,Male,0,0,60,United-States,>50K.


In [3]:
# Strip the trailing period from the two income labels of the evaluation split.
# Only exact matches of the keys below are rewritten; other values are untouched.
test_data_set["income"] = test_data_set["income"].replace(
    {" <=50K.": " <=50K", " >50K.": " >50K"}
)

In [4]:
# Inspect the last income labels after the replacement step.
test_data_set.loc[:, "income"].tail()

15054     <=50K
15055     <=50K
15056     <=50K
15057     <=50K
15058      >50K
Name: income, dtype: object

In [22]:
class TAERandomForestClassifier(object):
    """Random-forest classifier for frames mixing categorical and numeric columns.

    Categorical columns are label-encoded, then one-hot encoded; the resulting
    dummy columns are concatenated with the numeric columns and passed to a
    scikit-learn ``RandomForestClassifier`` trained with the entropy criterion.

    Hyper-parameters are read from ``n_estimators``, ``max_features``,
    ``max_depth`` and ``random_state`` at ``fit`` time; use ``set_meta`` (or
    plain attribute assignment) to change them before fitting.
    """

    # Class-level defaults (immutable values only); instances override them.
    dummy_encoder = None
    rfc_model = None
    n_estimators = 500
    max_features = 5
    max_depth = 16
    # Seed forwarded to the forest. None keeps scikit-learn's unseeded default;
    # set an int on the instance for reproducible runs.
    random_state = None

    def __init__(self):
        # Per-instance registry of fitted LabelEncoders, keyed by column name.
        # Kept off the class body so that instances never share encoders.
        self.lab_encoders = {}

    def _feature_names(self):
        """Return the one-hot output column names.

        Uses ``get_feature_names`` when the installed scikit-learn still has
        it (the accessor this code was written against) and falls back to
        ``get_feature_names_out`` otherwise.
        """
        getter = getattr(self.dummy_encoder, "get_feature_names", None)
        if getter is None:
            getter = self.dummy_encoder.get_feature_names_out
        return getter()

    def _to_dummy_frame(self, labelled):
        """One-hot encode an already label-encoded frame into a dense DataFrame.

        The returned frame always has a fresh 0..n-1 index because it is
        built from a plain array.
        """
        dense = self.dummy_encoder.transform(labelled).toarray()
        return pd.DataFrame(data=dense, columns=self._feature_names())

    def encode_fit(self, cat_data):
        """Fit the label and one-hot encoders on ``cat_data`` and encode it.

        Args:
            cat_data: DataFrame holding only the categorical columns.

        Returns:
            DataFrame of one-hot columns with a fresh 0..n-1 index.
            ``cat_data`` itself is not modified.
        """
        # Start from a clean registry so a refit never keeps stale encoders.
        self.lab_encoders = {}

        # Encode string categories to integer labels, one encoder per column.
        labelled = cat_data.copy(deep=True)
        for cn in cat_data.columns:
            encoder = preprocessing.LabelEncoder()
            labelled[cn] = encoder.fit_transform(cat_data[cn])
            self.lab_encoders[cn] = encoder

        # Fit the dummy (one-hot) encoder on the integer labels.
        self.dummy_encoder = preprocessing.OneHotEncoder(categories="auto")
        self.dummy_encoder.fit(labelled)

        return self._to_dummy_frame(labelled)

    def encode(self, cat_data):
        """Encode ``cat_data`` with the encoders fitted by ``encode_fit``.

        Args:
            cat_data: DataFrame holding the same categorical columns, in the
                same order, that were passed to ``encode_fit``.

        Returns:
            DataFrame of one-hot columns with a fresh 0..n-1 index.
            The input frame is left untouched (the encoding works on a copy).

        Raises:
            KeyError: if a column has no fitted label encoder.
        """
        labelled = cat_data.copy(deep=True)
        for cn in cat_data.columns:
            labelled[cn] = self.lab_encoders[cn].transform(cat_data[cn])

        return self._to_dummy_frame(labelled)

    def fit(self, x_train, y_train, cat_cols, num_cols):
        """Fit the encoders and the random forest.

        Args:
            x_train: DataFrame with the predictor columns.
            y_train: target labels, one per row of ``x_train``.
            cat_cols: names of the categorical columns to one-hot encode.
            num_cols: names of the numeric columns used as-is.
        """
        # Drop the original index so the numeric part lines up row-by-row with
        # the encoded part, which always carries a fresh 0..n-1 index.
        x_train_num = x_train[num_cols].reset_index(drop=True)
        x_train_cat = self.encode_fit(x_train[cat_cols])

        f_x_train = pd.concat([x_train_num, x_train_cat], axis=1)

        self.rfc_model = RandomForestClassifier(
            n_estimators=self.n_estimators,
            criterion="entropy",
            max_features=self.max_features,
            max_depth=self.max_depth,
            random_state=self.random_state,
        )
        self.rfc_model = self.rfc_model.fit(f_x_train, y_train)

    def predict(self, x_predict, cat_cols, num_cols):
        """Predict labels for ``x_predict`` with the fitted forest.

        Args:
            x_predict: DataFrame with the predictor columns.
            cat_cols: categorical column names, as given to ``fit``.
            num_cols: numeric column names, as given to ``fit``.

        Returns:
            The array returned by the underlying model's ``predict``, one
            entry per row of ``x_predict``.
        """
        # Same index reset as in ``fit``: without it, a frame whose index is
        # not 0..n-1 would be outer-joined against the encoded frame by
        # ``pd.concat``, producing extra rows filled with NaN.
        x_predict_num = x_predict[num_cols].reset_index(drop=True)
        x_predict_cat = self.encode(x_predict[cat_cols])

        f_x_predict = pd.concat([x_predict_num, x_predict_cat], axis=1)
        return self.rfc_model.predict(f_x_predict)

    def cal_conf_matrix(self, x_test, y_test, catego_columns, numeric_cols):
        """Print the confusion matrix and accuracy on a labelled set.

        Args:
            x_test: DataFrame with the predictor columns.
            y_test: true labels, one per row of ``x_test``.
            catego_columns: categorical column names, as given to ``fit``.
            numeric_cols: numeric column names, as given to ``fit``.

        Returns:
            The accuracy score (fraction of correctly predicted rows).
        """
        y_pred = self.predict(x_test, catego_columns, numeric_cols)

        # scikit-learn layout: rows are the true classes, columns the predicted
        # classes, both in sorted label order.
        print("Matriz de confusión:")
        print(metrics.confusion_matrix(y_test, y_pred))

        # The forest is randomised, so this value varies between fits unless
        # ``random_state`` is set.
        accuracy = metrics.accuracy_score(y_test, y_pred)
        print("Precisión:", accuracy)

        return accuracy

    def set_meta(self, max_features, n_estimators, max_depth):
        """Set the forest hyper-parameters used by the next call to ``fit``."""
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_depth = max_depth


In [23]:
# Categorical predictors; the classifier one-hot encodes these.
catego_columns = [
    "education",
    "workclass",
    "marital_status",
    "ocupation",
    "ethnicity",
    "gender",
    "native_country",
]

# Numeric predictors; passed to the forest unchanged.
numeric_cols = [
    "age",
    "fnlwgt",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

# Train on every column except the target.
forest = TAERandomForestClassifier()
forest.fit(
    train_data_set.loc[:, train_data_set.columns != "income"],
    train_data_set["income"],
    catego_columns,
    numeric_cols,
)

In [24]:
# Predict income labels for the evaluation split (target column excluded).
y_pred = forest.predict(
    test_data_set.loc[:, test_data_set.columns != "income"],
    catego_columns,
    numeric_cols,
)


In [25]:
# Print the confusion matrix / accuracy on the evaluation split and keep the accuracy.
m = forest.cal_conf_matrix(
    test_data_set.loc[:, test_data_set.columns != "income"],
    test_data_set["income"],
    catego_columns,
    numeric_cols,
)
# Total number of columns in the training frame; this count includes "income".
len(train_data_set.columns)

Matriz de confusión:
[[10800   559]
 [ 1563  2137]]
Precisión: 0.8590875888173185


13

In [475]:
from tqdm import tqdm

# Exhaustive sweep over max_features x n_estimators x max_depth.
# NOTE(review): every candidate is scored on the test split, so the best
# accuracy found here is an optimistic estimate; consider a validation split.
acc = []

# These slices do not depend on the hyper-parameters, so build them once
# instead of re-slicing both frames on every grid point.
grid_x_train = train_data_set.loc[:, train_data_set.columns != "income"]
grid_y_train = train_data_set["income"]
grid_x_test = test_data_set.loc[:, test_data_set.columns != "income"]
grid_y_test = test_data_set["income"]

# Upper bound is the total column count of the training frame (exclusive).
for max_features in tqdm(range(1, len(train_data_set.columns), 1)):
    for n_estimators in tqdm(range(150, 500, 50)):
        for max_depth in range(5, 10, 1):
            forest.set_meta(max_features, n_estimators, max_depth)
            forest.fit(grid_x_train, grid_y_train, catego_columns, numeric_cols)
            accuracy = forest.cal_conf_matrix(
                grid_x_test, grid_y_test, catego_columns, numeric_cols
            )
            acc.append({
                'max_features': max_features,
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'accuracy': accuracy,
            })







  0%|                                                                                           | 0/12 [00:00<?, ?it/s]





  0%|                                                                                            | 0/7 [00:00<?, ?it/s]





 14%|████████████                                                                        | 1/7 [00:09<00:54,  9.06s/it]





 29%|████████████████████████                                                            | 2/7 [00:22<00:52, 10.41s/it]





 43%|████████████████████████████████████                                                | 3/7 [00:37<00:46, 11.75s/it]





 57%|████████████████████████████████████████████████                                    | 4/7 [00:53<00:39, 13.18s/it]





 71%|████████████████████████████████████████████████████████████                        | 5/7 [01:13<00:30, 15.14s/it]





 86%|████████████████████████████████████████████████████████████████████████            | 6/7 [01:35<00:17, 17.17s/it]


KeyboardInterrupt: 

In [417]:
# True income label of the evaluation row with index label 1.
test_data_set.loc[1, "income"]

'>50K'

In [420]:
# Does the prediction at position 1 match the true label of row 1?
test_data_set.loc[1, "income"] == y_pred[1]

True

In [421]:
# Income label of the training row with index label 1.
train_data_set.loc[1, "income"]

' <=50K'

In [426]:
# Confusion matrix for the stored predictions: rows are the true classes,
# columns the predicted classes (scikit-learn convention).
print("Matriz de confusión:")
print(
    metrics.confusion_matrix(test_data_set["income"], y_pred)
)

# Accuracy of the stored predictions; re-run a few times to see how it varies.
print(
    "Precisión:",
    metrics.accuracy_score(test_data_set["income"], y_pred),
)
# Disabled: print(metrics.precision_score(y_test, y_pred))

Matriz de confusión:
[[10453   906]
 [ 1434  2266]]
Precisión: 0.8446111959625473
