In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv',
                skipinitialspace=True)
x_cols = [c for c in df.columns if c != 'income']
X = df[x_cols]
y = df['income']
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [9]:
train_mode = dict(X_train.mode().iloc[0])
X_train = X_train.fillna(train_mode)
print (train_mode)

{'age': 31.0, 'workclass': 'Private', 'fnlwgt': 121124, 'education': 'HS-grad', 'education-num': 9.0, 'marital-status': 'Married-civ-spouse', 'occupation': 'Prof-specialty', 'relationship': 'Husband', 'race': 'White', 'sex': 'Male', 'capital-gain': 0.0, 'capital-loss': 0.0, 'hours-per-week': 40.0, 'native-country': 'United-States'}


In [10]:
encoder = {}
for col in ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']:
    cat_convert = LabelEncoder()
    X_train[col] = cat_convert.fit_transform(X_train[col])
    encoder[col] = cat_convert

In [11]:
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
et = ExtraTreesClassifier(n_estimators=100).fit(X_train, y_train)

In [13]:
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoder, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)

['./extra_trees.joblib']

In [3]:
data = pd.read_csv('Medicalpremium.csv')
data.head()

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice
0,45,0,0,0,0,155,57,0,0,0,25000
1,60,1,0,0,0,180,73,0,0,0,29000
2,36,1,1,0,0,158,59,0,0,1,23000
3,52,1,1,0,1,183,93,0,0,2,28000
4,38,0,0,0,1,166,88,0,0,1,23000


In [4]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
X.shape

(986, 10)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
train_mode = dict(X_train.mode().iloc[0])
X_train = X_train.fillna(train_mode)
print (train_mode)

{'Age': 45.0, 'Diabetes': 0.0, 'BloodPressureProblems': 0.0, 'AnyTransplants': 0.0, 'AnyChronicDiseases': 0.0, 'Height': 174.0, 'Weight': 70.0, 'KnownAllergies': 0.0, 'HistoryOfCancerInFamily': 0.0, 'NumberOfMajorSurgeries': 0.0}


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

n_train = int(len(X)*0.7)
classifiers = [RandomForestClassifier(),
               SVC(kernel='linear'),
               DecisionTreeClassifier(),
               LogisticRegression(solver="liblinear", max_iter=100),
               SGDClassifier(),
              GaussianNB(),
              KNeighborsClassifier(),
              MLPClassifier()]

for model in classifiers:
    print("{} accuracy: {}".format(model.__class__.__name__, cross_val_score(model, X_train, y_train, scoring="accuracy").mean()))



RandomForestClassifier accuracy: 0.8956521739130435
SVC accuracy: 0.8028985507246377
DecisionTreeClassifier accuracy: 0.8782608695652174
LogisticRegression accuracy: 0.7333333333333334
SGDClassifier accuracy: 0.2753623188405797
GaussianNB accuracy: 0.21449275362318837
KNeighborsClassifier accuracy: 0.5840579710144927
MLPClassifier accuracy: 0.7333333333333334


In [8]:
rf = RandomForestClassifier().fit(X_train, y_train)
dt = DecisionTreeClassifier().fit(X_train, y_train)
joblib.dump(train_mode, "./pi_train_mode.joblib", compress=True)
joblib.dump(rf, "./pi_random_forest.joblib", compress=True)
joblib.dump(dt, "./pi_decision_tree.joblib", compress=True)

['./pi_decision_tree.joblib']

In [22]:
prob = rf.predict_proba(pd.DataFrame(train_mode, index=[0]))[0]
np.where(prob = np.max(prob))

TypeError: where() got an unexpected keyword argument 'prob'