In [1]:
# data manipulation
import pandas as pd
import numpy as np
import itertools

# visuals
import matplotlib.pyplot as plt
import seaborn as sns

# nlp
import unicodedata
import re
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
import nltk.sentiment
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn import naive_bayes as nb
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay,recall_score

# local
import wrangle as w
import model as m

imports loaded successfully, awaiting commands...


# Wrangle

In [2]:
df = w.wrangle_complaints()

parquet file found and loaded


In [3]:
relief = ['Closed with monetary relief', 'Closed with non-monetary relief']
no_relief = ['Closed with explanation']
df = df[df['company_response_to_consumer'] != "Untimely response"]
df = df[df['company_response_to_consumer'] != "Closed"]
df['response'] = np.where(df['company_response_to_consumer'].isin(relief),'relief','')
df['response'] = np.where(df['company_response_to_consumer'].isin(no_relief),'no_relief',df['response'])
df = df.drop(columns='company_response_to_consumer')

In [4]:
train, val, test = w.split_data(df,'response')

Prepared DF: (1238536, 8)
Train: (743121, 8)
Validate: (247707, 8)
Test: (247708, 8)


# Modeling

In [5]:
def encode(df):
    '''Encode categorical columns'''
    # columns to encode
    cols = ['tags','product_bins']
    # encode the dummies
    dummy = pd.get_dummies(df[cols],prefix='',prefix_sep='',drop_first=True)
    # bring the dummies along
    return pd.concat([df,dummy],axis=1)

def process_data_modeling(train, validate, test):
    """
    The function `process_data_modeling` reads data from parquet files, performs data sampling,
    encoding, and splits the data into training, validation, and test sets.
    
    :param train: The `train` parameter is the training dataset, which is a pandas DataFrame containing
    the data for training the model
    :param validate: The `validate` parameter is a DataFrame that contains the validation data. It is
    read from a parquet file named 'validate.parquet'
    :param test: The `test` parameter is a DataFrame that contains the test data for your model. It is
    read from a parquet file named 'test.parquet'
    :return: six variables: X_train, y_train, X_val, y_val, X_test, and y_test.
    """
    random_state = 123
    response_categories = [
        'relief',
        'no_relief'
    ]

    sm_train = []
    sm_val = []
    sm_test = []

    small_train = pd.DataFrame()
    small_val = pd.DataFrame()
    small_test = pd.DataFrame()

    random_state = 123
    percent = .2

    for category in response_categories:
        sm_train.append(int(round(len(train[train.response == category]) * percent, 0)))
        sm_val.append(int(round(len(validate[validate.response == category]) * percent, 0)))
        sm_test.append(int(round(len(test[test.response == category]) * percent, 0)))
        
        small_train = small_train.append(train[train.response == category].sample(sm_train[-1], random_state=random_state))
        small_val = small_val.append(validate[validate.response == category].sample(sm_val[-1], random_state=random_state))
        small_test = small_test.append(test[test.response == category].sample(sm_test[-1], random_state=random_state))

    small_train.reset_index(drop=True, inplace=True)
    small_val.reset_index(drop=True, inplace=True)
    small_test.reset_index(drop=True, inplace=True)

    X_train = encode(small_train)
    X_train = X_train.drop(columns=['date_received','clean','state','company_name','tags','product_bins', 'response'])
    y_train = small_train['response']
    X_val = encode(small_val)
    X_val = X_val.drop(columns=['date_received','clean','state','company_name','tags','product_bins', 'response'])
    y_val = small_val['response']
    X_test = encode(small_test)
    X_test = X_test.drop(columns=['date_received', 'clean','state','company_name','tags','product_bins', 'response'])
    y_test = small_test['response']

    return X_train, y_train, X_val, y_val, X_test, y_test

In [6]:
# X_train, etc...
X_train, y_train, X_val, y_val, X_test, y_test = process_data_modeling(train, val, test)

### vectorizing

In [7]:
def make_mbt_tfidf(Xtr,Xv,Xt):
    """
    The function `make_tfidf` takes in three sets of data (train, validation, and test) and applies the
    TF-IDF vectorization technique to convert the text data into numerical features, using n-grams up to
    trigrams and keeping single characters. It then returns the transformed data as pandas DataFrames.
    
    :param Xtr: Xtr is the training data, which is a dataframe containing the text data that you want to
    transform into TF-IDF features. The "lemmatized" column in the dataframe contains the preprocessed
    text data
    :param Xv: Xv is the validation dataset, which is used to evaluate the performance of the model
    during training
    :param Xt: Xt is the input data for the test set. It is a dataframe containing the text data that
    needs to be transformed into TF-IDF representation
    :return: three dataframes: Xtr_tfidf, Xv_tfidf, and Xt_tfidf.
    """
    #make my bag of words up to trigrams tfidf and keep single characters
    tfidf = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b',lowercase=False, max_features=2900, ngram_range=(1,3))
    # fit and transform train
    Xtr_bow_tfidf = tfidf.fit_transform(Xtr.lemon.astype(str))
    # transform val and test
    Xv_bow_tfidf = tfidf.transform(Xv.lemon.astype(str))
    Xt_bow_tfidf = tfidf.transform(Xt.lemon.astype(str))
    # make dfs
    Xtr_tfidf = pd.DataFrame(Xtr_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=Xtr.index)
    Xv_tfidf = pd.DataFrame(Xv_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=Xv.index)
    Xt_tfidf = pd.DataFrame(Xt_bow_tfidf.todense(),columns=tfidf.get_feature_names_out(),index=Xt.index)
    return Xtr_tfidf,Xv_tfidf,Xt_tfidf

def process_vector_merge(X_train,X_val,X_test):
    X_train_tf, X_val_tf, X_test_tf = make_mbt_tfidf(X_train[['lemon']], X_val[['lemon']], X_test[['lemon']])
    
    encoded_train = X_train.iloc[:, 1:]
    encoded_val = X_val.iloc[:, 1:]
    encoded_test = X_test.iloc[:, 1:]
    
    X_train_tfe = encoded_train.merge(X_train_tf, left_index=True, right_index=True)
    X_val_tfe = encoded_val.merge(X_val_tf, left_index=True, right_index=True)
    X_test_tfe = encoded_test.merge(X_test_tf, left_index=True, right_index=True)
    
    # Visualization of train data
    return X_train_tfe, X_val_tfe, X_test_tfe

In [8]:
# vectorize
X_train_tfe, X_val_tfe, X_test_tfe = process_vector_merge(X_train,X_val,X_test)
X_train_tfe

Unnamed: 0,Older American,"Older American, Servicemember",Servicemember,credit_card,credit_report,debt_collection,loans,money_service,mortgage_x,ability,...,year u,year u c,yes,yesterday,yet,yet feel,yet feel like,yet receive,yr,zero
0,0,0,1,0,0,1,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148620,0,0,0,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148621,0,0,0,0,0,0,0,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148622,0,0,0,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148623,0,0,0,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
def tree_models(Xtr,ytr,Xv,yv):
    metrics = []
    # cycle through depth,leaf,class_weight for dec tree
    for c,d,l,f in itertools.product(['gini', 'entropy', 'log_loss'],[7,9],range(7,16),["sqrt", "log2", None]):
        # decision tree
        tree = DecisionTreeClassifier(criterion=c,max_depth=d,min_samples_leaf=l,max_features=f,random_state=123)
        tree.fit(Xtr,ytr)
        # predictions
        y_pred_train = tree.predict(Xtr)
        y_pred_val = tree.predict(Xv)
        # recall score
        tr_rec = recall_score(ytr, y_pred_train, pos_label='no_relief')
        v_rec = recall_score(yv, y_pred_val, pos_label='no_relief')
        # accuracies
        ytr_acc = tree.score(Xtr,ytr)
        yv_acc = tree.score(Xv,yv)
        # table-ize
        output ={
                'model':'Decision Tree',
                'params':f"criterion={c},max_depth={d},min_samples_leaf={l},max_features={f},random_state=123",
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
                'tr_rec':tr_rec,
                'v_rec':v_rec,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)

def forest_models(Xtr,ytr,Xv,yv):
    metrics = []
    # cycle through depth,leaf,class_weight for random forest
    for c,d,l,f in itertools.product(['gini', 'entropy', 'log_loss'],range(10,16),range(1,6),["sqrt", "log2", None]):
        # random forest
        forest = RandomForestClassifier(criterion=c,max_depth=d, min_samples_leaf=l, max_features=f,random_state=123,n_jobs=-1)
        forest.fit(Xtr,ytr)
        # predictions
        y_pred_train = forest.predict(Xtr)
        y_pred_val = forest.predict(Xv)
        # recall score
        tr_rec = recall_score(ytr, y_pred_train, pos_label='no_relief')
        v_rec = recall_score(yv, y_pred_val, pos_label='no_relief')
        # accuracies
        ytr_acc = forest.score(Xtr,ytr)
        yv_acc = forest.score(Xv,yv)
        # table-ize
        output ={
                'model':'Random Forest',
                'params':f"criterion={c},max_depth={d},min_samples_leaf={l},max_features={f},random_state=123,n_jobs=-1",
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
                'tr_rec':tr_rec,
                'v_rec':v_rec,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)

def log_models(Xtr,ytr,Xv,yv):
    metrics = []
    # cycle through C for log reg
    for c,s in itertools.product([.01,.1,1,10,100,1000],['newton-cholesky', 'sag', 'saga']):
        # logistic regression
        lr = LogisticRegression(C=c,solver=s,random_state=123,max_iter=500,n_jobs=-1)
        lr.fit(Xtr,ytr)
        # predictions
        y_pred_train = lr.predict(Xtr)
        y_pred_val = lr.predict(Xv)
        # recall score
        tr_rec = recall_score(ytr, y_pred_train, pos_label='no_relief')
        v_rec = recall_score(yv, y_pred_val, pos_label='no_relief')
        # accuracies
        ytr_acc = lr.score(Xtr,ytr)
        yv_acc = lr.score(Xv,yv)
        # table-ize
        output ={
                'model':'LogReg',
                'params':f"C={c},solver={s},random_state=123,max_iter=500,n_jobs=-1",
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
                'tr_rec':tr_rec,
                'v_rec':v_rec,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)

def svc_models(Xtr,ytr,Xv,yv):
    metrics = []
    # cycle through C for svc
    for p,c in itertools.product(['l1', 'l2'],[.01,.1,1,10,100,1000]):
        # svc
        sv = LinearSVC(penalty=p,C=c,dual=False,random_state=123,max_iter=500)
        sv.fit(Xtr,ytr)
        # predictions
        y_pred_train = sv.predict(Xtr)
        y_pred_val = sv.predict(Xv)
        # recall score
        tr_rec = recall_score(ytr, y_pred_train, pos_label='no_relief')
        v_rec = recall_score(yv, y_pred_val, pos_label='no_relief')
        # accuracies
        ytr_acc = sv.score(Xtr,ytr)
        yv_acc = sv.score(Xv,yv)
        # table-ize
        output ={
                'model':'SVC',
                'params':f"penalty={p},C={c},dual=False,random_state=123,max_iter=500",
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
                'tr_rec':tr_rec,
                'v_rec':v_rec,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)

def sgd_models(Xtr,ytr,Xv,yv):
    metrics = []
    # cycle through C for sgd
    for l,p,es in itertools.product(['hinge', 'log_loss', 'log', 'modified_huber'],['l2','l1','elasticnet'],[True,False]):
        # sgd
        sg = SGDClassifier(loss=l,penalty=p,random_state=123,early_stopping=es,n_jobs=-1)
        sg.fit(Xtr,ytr)
        # predictions
        y_pred_train = sg.predict(Xtr)
        y_pred_val = sg.predict(Xv)
        # recall score
        tr_rec = recall_score(ytr, y_pred_train, pos_label='no_relief')
        v_rec = recall_score(yv, y_pred_val, pos_label='no_relief')
        # accuracies
        ytr_acc = sg.score(Xtr,ytr)
        yv_acc = sg.score(Xv,yv)
        # table-ize
        output ={
                'model':'SGD',
                'params':f"loss={l},penalty={p},random_state=123,early_stopping={es},n_jobs=-1",
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
                'tr_rec':tr_rec,
                'v_rec':v_rec,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)

def mlp_models(Xtr,ytr,Xv,yv):
    metrics = []
    # cycle through C for mlp
    for a,es in itertools.product(['relu', 'identity', 'logistic', 'tanh'],[True,False]):
        # mlp
        ml = MLPClassifier(activation=a,solver='adam',random_state=123,early_stopping=es)
        ml.fit(Xtr,ytr)
        # predictions
        y_pred_train = ml.predict(Xtr)
        y_pred_val = ml.predict(Xv)
        # recall score
        tr_rec = recall_score(ytr, y_pred_train, pos_label='no_relief')
        v_rec = recall_score(yv, y_pred_val, pos_label='no_relief')
        # accuracies
        ytr_acc = ml.score(Xtr,ytr)
        yv_acc = ml.score(Xv,yv)
        # table-ize
        output ={
                'model':'MLP',
                'params':f"activation={a},solver='adam',random_state=123,early_stopping={es}",
                'tr_acc':ytr_acc,
                'v_acc':yv_acc,
                'tr_rec':tr_rec,
                'v_rec':v_rec,
            }
        metrics.append(output)
    return pd.DataFrame(metrics)


In [26]:
def cmd(ytr,y_pred,labels):
    ConfusionMatrixDisplay(confusion_matrix(ytr,y_pred,labels=labels,normalize='true'),display_labels=labels).plot()
    plt.show()

In [27]:
baseline = y_train.value_counts(normalize=True)[0]
baseline

0.7931034482758621

In [28]:
# # decision tree X_train_tfe_mbt,y_train,X_test_tfe_mbt,y_test
# tree = DecisionTreeClassifier(max_depth=9,min_samples_leaf=11,random_state=123)
# tree.fit(X_train_tfe_mbt,y_train)
# ytr_pred = tree.predict(X_train_tfe_mbt)
# tree.score(X_train_tfe_mbt,y_train)

In [29]:
# cmd(y_train,ytr_pred,tree.classes_)

In [30]:
# pd.DataFrame(classification_report(y_train,ytr_pred,output_dict=True))

In [31]:
# pd.DataFrame(classification_report(y_val,tree.predict(X_val_tfe_mbt),output_dict=True))

In [32]:
log_tfe_results = log_models(X_train_tfe,y_train,X_val_tfe,y_val)
log_tfe_results

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec
0,LogReg,"C=0.01,solver=newton-cholesky,random_state=123...",0.793103,0.793125,1.0,1.0
1,LogReg,"C=0.01,solver=sag,random_state=123,max_iter=50...",0.793103,0.793125,1.0,1.0
2,LogReg,"C=0.01,solver=saga,random_state=123,max_iter=5...",0.793103,0.793125,1.0,1.0
3,LogReg,"C=0.1,solver=newton-cholesky,random_state=123,...",0.796347,0.796052,0.993552,0.992772
4,LogReg,"C=0.1,solver=sag,random_state=123,max_iter=500...",0.796347,0.796011,0.993552,0.992721
5,LogReg,"C=0.1,solver=saga,random_state=123,max_iter=50...",0.796347,0.796052,0.993552,0.992772
6,LogReg,"C=1,solver=newton-cholesky,random_state=123,ma...",0.800121,0.797323,0.98059,0.979003
7,LogReg,"C=1,solver=sag,random_state=123,max_iter=500,n...",0.800121,0.797323,0.98059,0.979003
8,LogReg,"C=1,solver=saga,random_state=123,max_iter=500,...",0.800121,0.797303,0.98059,0.978978
9,LogReg,"C=10,solver=newton-cholesky,random_state=123,m...",0.801702,0.796718,0.974168,0.971343


In [33]:
tree_tfe_results = tree_models(X_train_tfe,y_train,X_val_tfe,y_val)
tree_tfe_results

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec
0,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=7,...",0.794806,0.793004,0.995979,0.994808
1,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=7,...",0.793601,0.793186,0.999550,0.999313
2,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=7,...",0.799859,0.797122,0.993256,0.991550
3,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=8,...",0.794974,0.793064,0.996013,0.994859
4,Decision Tree,"criterion=gini,max_depth=7,min_samples_leaf=8,...",0.793554,0.792943,0.999618,0.999211
...,...,...,...,...,...,...
157,Decision Tree,"criterion=log_loss,max_depth=9,min_samples_lea...",0.793696,0.792923,0.998346,0.997964
158,Decision Tree,"criterion=log_loss,max_depth=9,min_samples_lea...",0.801783,0.796859,0.986316,0.982872
159,Decision Tree,"criterion=log_loss,max_depth=9,min_samples_lea...",0.795721,0.793145,0.994604,0.992568
160,Decision Tree,"criterion=log_loss,max_depth=9,min_samples_lea...",0.793669,0.793064,0.998431,0.998168


In [36]:
svc_tfe_results = svc_models(X_train_tfe,y_train,X_val_tfe,y_val)
svc_tfe_results

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec
0,SVC,"penalty=l1,C=0.01,dual=False,random_state=123,...",0.79305,0.793105,0.999712,0.999669
1,SVC,"penalty=l1,C=0.1,dual=False,random_state=123,m...",0.797497,0.796456,0.990897,0.990431
2,SVC,"penalty=l1,C=1,dual=False,random_state=123,max...",0.800323,0.797364,0.98372,0.98193
3,SVC,"penalty=l1,C=10,dual=False,random_state=123,ma...",0.801265,0.797323,0.982066,0.97964
4,SVC,"penalty=l1,C=100,dual=False,random_state=123,m...",0.801225,0.797202,0.981777,0.97936
5,SVC,"penalty=l1,C=1000,dual=False,random_state=123,...",0.801245,0.797223,0.981743,0.97936
6,SVC,"penalty=l2,C=0.01,dual=False,random_state=123,...",0.79531,0.795446,0.996954,0.996742
7,SVC,"penalty=l2,C=0.1,dual=False,random_state=123,m...",0.798715,0.797263,0.98797,0.987198
8,SVC,"penalty=l2,C=1,dual=False,random_state=123,max...",0.800794,0.797646,0.983262,0.98137
9,SVC,"penalty=l2,C=10,dual=False,random_state=123,ma...",0.801191,0.797485,0.982108,0.979894


In [37]:
sgd_tfe_results = sgd_models(X_train_tfe,y_train,X_val_tfe,y_val)
sgd_tfe_results

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec
0,SGD,"loss=hinge,penalty=l2,random_state=123,early_s...",0.793103,0.793105,1.0,1.0
1,SGD,"loss=hinge,penalty=l2,random_state=123,early_s...",0.793103,0.793105,1.0,1.0
2,SGD,"loss=hinge,penalty=l1,random_state=123,early_s...",0.793103,0.793105,1.0,1.0
3,SGD,"loss=hinge,penalty=l1,random_state=123,early_s...",0.793103,0.793105,1.0,1.0
4,SGD,"loss=hinge,penalty=elasticnet,random_state=123...",0.793103,0.793105,1.0,1.0
5,SGD,"loss=hinge,penalty=elasticnet,random_state=123...",0.793103,0.793105,1.0,1.0
6,SGD,"loss=log_loss,penalty=l2,random_state=123,earl...",0.795546,0.795305,0.994172,0.993332
7,SGD,"loss=log_loss,penalty=l2,random_state=123,earl...",0.794267,0.794033,0.997862,0.997506
8,SGD,"loss=log_loss,penalty=l1,random_state=123,earl...",0.794193,0.794599,0.99491,0.994935
9,SGD,"loss=log_loss,penalty=l1,random_state=123,earl...",0.793507,0.79367,0.997073,0.997073


In [38]:
mlp_tfe_results = mlp_models(X_train_tfe,y_train,X_val_tfe,y_val)
mlp_tfe_results

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec
0,MLP,"activation=relu,solver='adam',random_state=123...",0.810079,0.800048,0.972615,0.966991
1,MLP,"activation=relu,solver='adam',random_state=123...",0.960478,0.751403,0.977069,0.858241
2,MLP,"activation=identity,solver='adam',random_state...",0.800733,0.797525,0.979029,0.97712
3,MLP,"activation=identity,solver='adam',random_state...",0.801595,0.797667,0.977434,0.97516
4,MLP,"activation=logistic,solver='adam',random_state...",0.80039,0.797909,0.983245,0.981777
5,MLP,"activation=logistic,solver='adam',random_state...",0.800686,0.797303,0.97818,0.975898
6,MLP,"activation=tanh,solver='adam',random_state=123...",0.800794,0.797525,0.978545,0.976611
7,MLP,"activation=tanh,solver='adam',random_state=123...",0.89925,0.747992,0.955326,0.854449


In [39]:
results = pd.concat([
                    log_tfe_results.assign(vector='tfe'),
                    tree_tfe_results.assign(vector='tfe'),
                    svc_tfe_results.assign(vector='tfe'),
                    sgd_tfe_results.assign(vector='tfe'),
                    mlp_tfe_results.assign(vector='tfe'),
                    ])

In [40]:
new_results = results[results.v_acc>baseline].sort_values('v_acc',ascending=False).reset_index(drop=True)
new_results

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec,vector
0,MLP,"activation=relu,solver='adam',random_state=123...",0.810079,0.800048,0.972615,0.966991,tfe
1,MLP,"activation=logistic,solver='adam',random_state...",0.800390,0.797909,0.983245,0.981777,tfe
2,MLP,"activation=identity,solver='adam',random_state...",0.801595,0.797667,0.977434,0.975160,tfe
3,SVC,"penalty=l2,C=1,dual=False,random_state=123,max...",0.800794,0.797646,0.983262,0.981370,tfe
4,MLP,"activation=identity,solver='adam',random_state...",0.800733,0.797525,0.979029,0.977120,tfe
...,...,...,...,...,...,...,...
145,SGD,"loss=hinge,penalty=l2,random_state=123,early_s...",0.793103,0.793105,1.000000,1.000000,tfe
146,SGD,"loss=hinge,penalty=l2,random_state=123,early_s...",0.793103,0.793105,1.000000,1.000000,tfe
147,SGD,"loss=hinge,penalty=l1,random_state=123,early_s...",0.793103,0.793105,1.000000,1.000000,tfe
148,SGD,"loss=hinge,penalty=l1,random_state=123,early_s...",0.793103,0.793105,1.000000,1.000000,tfe


In [43]:
# MLP	activation=relu,solver='adam',random_state=123,early_stopping=True	                    0.810079058	    0.8000484437	0.9726150583	0.966990736
# SVC	penalty=l2,C=1,dual=False,random_state=123,max_iter=500	                                0.8007939445	0.7976464414	0.98326193	0   .9813702535
# Tree	criterion=log_loss,max_depth=7,min_samples_leaf=15,max_features=None,random_state=123	0.7989100084	0.7974445925	0.9954273595	0.9944008959
# Log	C=1,solver=sag,random_state=123,max_iter=500,n_jobs=-1	                                0.8001211102	0.7973234831	0.9805896076	0.9790033595
# SGD	loss=modified_huber,penalty=elasticnet,random_state=123,early_stopping=False,n_jobs=-1	0.7975912532	0.7970005248	0.9838472959	0.9836353456	tfe
pd.set_option("display.max_colwidth", 250)
new_results.head(25)

Unnamed: 0,model,params,tr_acc,v_acc,tr_rec,v_rec,vector
0,MLP,"activation=relu,solver='adam',random_state=123,early_stopping=True",0.810079,0.800048,0.972615,0.966991,tfe
1,MLP,"activation=logistic,solver='adam',random_state=123,early_stopping=True",0.80039,0.797909,0.983245,0.981777,tfe
2,MLP,"activation=identity,solver='adam',random_state=123,early_stopping=False",0.801595,0.797667,0.977434,0.97516,tfe
3,SVC,"penalty=l2,C=1,dual=False,random_state=123,max_iter=500",0.800794,0.797646,0.983262,0.98137,tfe
4,MLP,"activation=identity,solver='adam',random_state=123,early_stopping=True",0.800733,0.797525,0.979029,0.97712,tfe
5,MLP,"activation=tanh,solver='adam',random_state=123,early_stopping=True",0.800794,0.797525,0.978545,0.976611,tfe
6,SVC,"penalty=l2,C=10,dual=False,random_state=123,max_iter=500",0.801191,0.797485,0.982108,0.979894,tfe
7,Decision Tree,"criterion=log_loss,max_depth=7,min_samples_leaf=15,max_features=None,random_state=123",0.79891,0.797445,0.995427,0.994401,tfe
8,Decision Tree,"criterion=entropy,max_depth=7,min_samples_leaf=15,max_features=None,random_state=123",0.79891,0.797445,0.995427,0.994401,tfe
9,Decision Tree,"criterion=entropy,max_depth=7,min_samples_leaf=7,max_features=None,random_state=123",0.79924,0.797404,0.995885,0.994426,tfe
