# let's import libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import accuracy_score
import collections

# let's load the training data

In [2]:
# Read the training set from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

Unnamed: 0,id,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,21983,18,Private,141363,Some-college,10,Never-married,Other-service,Not-in-family,White,Female,0,0,40,United-States,0
1,10063,38,Private,179579,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,?,1
2,16273,63,Private,172740,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,0
3,2208,35,Private,47707,HS-grad,9,Never-married,Other-service,Not-in-family,Black,Female,0,0,40,United-States,1
4,15914,32,Private,48458,HS-grad,9,Never-married,Sales,Own-child,Black,Female,0,1669,45,United-States,0


# let's transform data

In [3]:
def transform_df(df):
    """Clean a raw frame and add two binary features.

    Works on a renamed copy, so the caller's frame is left untouched:

    1. strip surrounding whitespace from the column names;
    2. collect the non-numeric columns and strip whitespace from their
       string values;
    3. add ``sex_b``: 1 for 'Male', 0 for 'Female' (NaN for anything else);
    4. add ``marital-status_b``: 1 for a "single" status, 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'sex' and 'marital-status' columns (names are matched
        after stripping).

    Returns
    -------
    tuple (pandas.DataFrame, list)
        The transformed frame and the names of its non-numeric columns, in
        column order. The two added features are not part of that list.
    """
    df = df.rename(columns=lambda x: x.strip())

    # Decide by "is it numeric?" rather than "is it exactly int64?": float or
    # narrower integer columns must stay out of the text handling below,
    # otherwise `.str.strip()` would turn their values into NaN.
    c_col = [name for name in df.columns
             if not pd.api.types.is_numeric_dtype(df[name])]

    # Column-wise strip (a row-wise apply does the same work far more slowly).
    for name in c_col:
        column = df[name]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            df[name] = column.str.strip()

    # sex
    df['sex_b'] = df['sex'].map({'Male':1, 'Female':0})

    # married
    # never married, divorced, widowed, separated -> 1
    one = ['Never-married','Divorced','Widowed','Separated']
    # every other status (the married ones: 'Married-civ-spouse',
    # 'Married-spouse-absent', 'Married-AF-spouse') -> 0
    df['marital-status_b'] = df['marital-status'].isin(one).astype('int64')

    return df, c_col

In [4]:
# Clean the training frame; c_col is the list of column names that
# transform_df treated as text (they are dropped before modelling).
df, c_col = transform_df(df)

# let's try a model

In [5]:
# Baseline: hold out a validation split and cross-validate a random forest
# on the remaining training part.

# Build the feature frame without mutating `df`, so this cell can be re-run
# (an in-place drop raises KeyError the second time around).
features = df.drop(columns=c_col)

Y = features.target.values
X = features.drop(['id','target'], axis = 1).values
size = 0.2
seed = 10

X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,test_size=size,random_state=seed)

# random_state only takes effect together with shuffle=True; recent
# scikit-learn releases reject random_state when shuffle is left False.
kfold = KFold(n_splits = 10, shuffle = True, random_state = seed)

# Seed the forest as well so the reported score is reproducible.
cv_res = cross_val_score(
    RandomForestClassifier(n_estimators = 100, max_features = 3, random_state = seed),
    X_train, Y_train, cv=kfold, scoring = 'accuracy')

In [6]:
# Mean and standard deviation of the per-fold accuracy scores.
print(np.mean(cv_res), np.std(cv_res))

0.7322888165609308 0.007333994347670994


# let's try to improve

In [None]:
# Grid search over forest size and the number of features tried per split.
n_estimators = [50, 100, 200, 250]
max_features = [1, 3, 5]

grid_p = dict(n_estimators = n_estimators, max_features = max_features)
# Seeded estimator: otherwise two runs of the same grid give different scores.
md = RandomForestClassifier(random_state = seed)
# random_state only takes effect together with shuffle=True; recent
# scikit-learn releases reject random_state when shuffle is left False.
kfold = KFold(10, shuffle = True, random_state = seed)

grid_cv = GridSearchCV(estimator = md, param_grid = grid_p, scoring = 'accuracy', cv = kfold)
grid_res = grid_cv.fit(X_train, Y_train)

# Per-combination results, printed by the reporting loop in the next cell.
means = grid_res.cv_results_['mean_test_score']
stds = grid_res.cv_results_['std_test_score']
params = grid_res.cv_results_['params']

In [7]:
# One line per parameter combination: settings, mean CV accuracy, its std.
# Needs `means`, `stds` and `params` from the grid-search cell.
for score_mean, score_std, combo in zip(means, stds, params):
    print(combo, score_mean, score_std)

NameError: name 'means' is not defined

In [8]:
# Fit the final forest on the training split and report accuracy (in percent)
# on the held-out validation split. random_state makes the score and the
# later test predictions reproducible between runs.
md = RandomForestClassifier(n_estimators=250, max_features=5, random_state=seed)
md.fit(X_train, Y_train)
prd = md.predict(X_validation)

print(100 * accuracy_score(Y_validation, prd))

72.26970560303894


# let's predict on the test data

In [9]:
# Read the test set from the working directory and preview its first five rows.
df_t = pd.read_csv(filepath_or_buffer='test.csv')
df_t.head(n=5)

Unnamed: 0,id,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,18738,22,Private,110684,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States
1,14083,26,Local-gov,242464,HS-grad,9,Married-civ-spouse,Protective-serv,Husband,White,Male,3103,0,40,United-States
2,6172,47,Private,170850,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,4064,0,60,United-States
3,2821,32,Federal-gov,454508,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States
4,11725,37,Federal-gov,194630,Masters,14,Never-married,Exec-managerial,Own-child,White,Female,0,0,40,United-States


In [10]:
# Apply the same cleaning to the test frame; c_col_t is the list of column
# names that transform_df treated as text.
df_t, c_col_t = transform_df(df_t)

In [11]:
# Select the model inputs without mutating `df_t`, so the cell can be re-run.
# 'target' and 'Index' are excluded too: the submission cell may have added
# them to `df_t`, and they must never be fed to the model as features.
excluded_t = set(c_col_t) | {'id', 'target', 'Index'}
feature_cols_t = [name for name in df_t.columns if name not in excluded_t]

X_t = df_t[feature_cols_t].values

prd_t = md.predict(X_t)

In [12]:
# Count how many times each distinct predicted value occurs in prd_t.
collections.Counter(prd_t)

Counter({0: 6418, 1: 351})

In [13]:
# Write the submission file with columns Index (the test ids) and target
# (the predictions). A separate frame is used so `df_t` is not polluted with
# prediction columns that a re-run of the prediction cell would pick up as
# features.
submission = pd.DataFrame({'Index': df_t['id'].values, 'target': prd_t})
submission.to_csv("pkd.csv", index=False)