In [106]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from matplotlib import pyplot as plt


In [5]:
import pandas as pd
# Exploratory read of the raw wine file. The file is read elsewhere in this
# notebook with explicit `names=`, i.e. it carries no header row, so
# header=None keeps the first record as data instead of turning it into
# column labels (columns are then labelled 0..N-1).
data = pd.read_csv('data/wine/wine.data', sep=",", header=None)

In [82]:
# Column labels for the Adult (census income) file, supplied through `names=`.
col_name = ['age','workclass','fnlwgt','education','education_num','marital_status','occupation',
            'relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country',
            'high_income']
# The raw file separates fields with ", " and encodes unknown values as "?".
# - skipinitialspace=True strips the blank that would otherwise prefix every
#   string value (e.g. ' Private' instead of 'Private').
# - na_values='?' converts the unknown marker to NaN, so DataFrame.dropna()
#   actually removes incomplete rows instead of treating '?' as a category.
income = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
            names=col_name,
            skipinitialspace=True,
            na_values='?')
# Bundle the frame with its target column ("y") and a short label ("name").
income = {"data":income, "y":'high_income',"name" : 'income'}

In [83]:
# Column labels for the Qualitative Bankruptcy file; 'Class' is the target.
col_name = [
    'Industrial Risk',
    'Management Risk',
    'Financial Flexibility',
    'Credibility',
    'Competitiveness',
    'Operating Risk',
    'Class',
]
# Dataset bundle: the loaded frame, the target column name and a short label.
Bankruptcy = {
    "data": pd.read_csv(
        'data/Qualitative_Bankruptcy/Qualitative_Bankruptcy.data.txt',
        sep=",",
        names=col_name,
    ),
    "y": 'Class',
    "name": 'Bankruptcy',
}

In [84]:
# Column labels for the wine file; the first column, 'class', is the target.
col_name = [
    'class',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
# Dataset bundle: the loaded frame, the target column name and a short label.
Wine = {
    "data": pd.read_csv('data/wine/wine.data', sep=",", names=col_name),
    "y": 'class',
    "name": 'Wine',
}

In [85]:
# Dataset bundle for the Titanic training file, restricted to the target
# ('Survived') and the six feature columns listed below.
titanic = {
    "data": pd.read_csv('data/titanic/train.csv').loc[
        :,
        ['Survived', 'Pclass', 'Sex', 'Age', 'Parch', 'Fare', 'Embarked'],
    ],
    "y": 'Survived',
    "name": 'titanic',
}

In [125]:
# Every dataset bundle (dict with "data", "y" and "name") to run experiments on.
data = [
    income,
    Bankruptcy,
    Wine,
    titanic,
]

In [126]:
# Sweep RandomForest `max_features` over the fractions 0.1 ... 0.9 for every
# dataset bundle in `data` and save one line chart per dataset, showing the
# weighted-average F1 score on the held-out split, to '<name>.png'.
fractions = [0.1*x for x in range(1,10)]

for datum in data:
    # Data preparation does not depend on max_features, so it is done once
    # per dataset rather than once per fraction.
    y_name = datum['y']
    df = datum['data'].dropna(axis='index')
    X = df.drop([y_name], axis = 1)
    y = df[y_name]

    # Object-typed columns are one-hot encoded; all others pass through.
    features_to_encode = X.columns[X.dtypes==object].tolist()

    # handle_unknown='ignore': a category that only occurs in the test split
    # is encoded as all zeros instead of raising during predict().
    col_trans = make_column_transformer(
                        (OneHotEncoder(handle_unknown='ignore'),features_to_encode),
                        remainder = "passthrough"
                        )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)

    # One weighted-average F1 score per entry of `fractions`, in order.
    output = []
    for fraction in fractions:
        model = RandomForestClassifier(random_state=1,max_features=fraction)
        # pipe.fit() refits the shared transformer on X_train every time.
        pipe = make_pipeline(col_trans, model)

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        report = classification_report(y_test, y_pred,output_dict=True)
        output.append(report['weighted avg']['f1-score'])

    # Explicit figure per dataset; closing it afterwards avoids leaving an
    # empty figure behind as the cell output.
    fig, ax = plt.subplots()
    ax.plot(output, color='#e35f62', marker='*', linewidth=2)
    ax.set_xticks(range(len(fractions)))
    ax.set_xticklabels([round(fraction, 2) for fraction in fractions])
    ax.set_xlabel('max_features (fraction of features)')
    ax.set_ylabel('weighted avg F1-score')
    ax.set_title(datum['name'])
    fig.savefig(f'{datum["name"]}.png')
    plt.close(fig)


<Figure size 432x288 with 0 Axes>

In [138]:
# Compare the RandomForest split criteria on every dataset bundle.
# `output` collects the weighted-average F1 score on the held-out split as a
# flat list ordered dataset-major, criterion-minor:
#   [income/gini, income/entropy, Bankruptcy/gini, Bankruptcy/entropy, ...]
data = [income, Bankruptcy, Wine, titanic]
criterion = ['gini','entropy']
output = []
for datum in data:
    # Data preparation does not depend on the criterion, so it is done once
    # per dataset rather than once per criterion.
    y_name = datum['y']
    df = datum['data'].dropna(axis='index')
    X = df.drop([y_name], axis = 1)
    y = df[y_name]

    # Object-typed columns are one-hot encoded; all others pass through.
    features_to_encode = X.columns[X.dtypes==object].tolist()

    # handle_unknown='ignore': a category that only occurs in the test split
    # is encoded as all zeros instead of raising during predict().
    col_trans = make_column_transformer(
                        (OneHotEncoder(handle_unknown='ignore'),features_to_encode),
                        remainder = "passthrough"
                        )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)

    for crit in criterion:
        model = RandomForestClassifier(random_state=1,criterion=crit)
        # pipe.fit() refits the shared transformer on X_train every time.
        pipe = make_pipeline(col_trans, model)

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        report = classification_report(y_test, y_pred,output_dict=True)
        output.append(report['weighted avg']['f1-score'])
