In [378]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [379]:
data = pd.read_csv('cmc.data')

In [380]:
data.head()

Unnamed: 0,24,2,3,3.1,1,1.1,2.1,3.2,0,1.2
0,45,1,3,10,1,1,3,4,0,1
1,43,2,3,7,1,1,3,4,0,1
2,42,3,2,9,1,1,3,3,0,1
3,36,3,3,8,1,1,3,2,0,1
4,19,4,4,0,1,1,3,3,0,1


In [381]:
df = data 

In [382]:
df.columns=['age','wife_education','husband_education','no_children',
            'religion', 'working', 'husband_occupation','std_index','media_exposure',
           'contraceptive_method']

In [383]:
df.head()

Unnamed: 0,age,wife_education,husband_education,no_children,religion,working,husband_occupation,std_index,media_exposure,contraceptive_method
0,45,1,3,10,1,1,3,4,0,1
1,43,2,3,7,1,1,3,4,0,1
2,42,3,2,9,1,1,3,3,0,1
3,36,3,3,8,1,1,3,2,0,1
4,19,4,4,0,1,1,3,3,0,1


In [384]:
df.dtypes

age                     int64
wife_education          int64
husband_education       int64
no_children             int64
religion                int64
working                 int64
husband_occupation      int64
std_index               int64
media_exposure          int64
contraceptive_method    int64
dtype: object

In [385]:
wife_education_mapping ={1:"low", 2:"medium-low", 3:"medium-high", 4:"high"}
husband_education_mapping = {1:"low", 2:"medium-low", 3:"medium-high", 4:"high"}
religion_mapping ={0:"Non-Islam", 1:"Islam"}
working_mapping={0:"Yes", 1:"No"}
husband_occupation_mapping ={1:"low", 2:"medium-low", 3:"medium-high", 4:"high"}
std_index_mapping ={1:"low", 2:"medium-low", 3:"medium-high", 4:"high"}
media_exposure_mapping = {0:"Good", 1:"Not good"}
#contraceptive_method_mapping ={1:"No use", 2:"Long term", 3:"Short term"}

In [386]:
df['wife_education_label'] = df['wife_education'].map(wife_education_mapping)
df['husband_education_label'] = df['husband_education'].map(husband_education_mapping)
df['religion_label'] = df['religion'].map(religion_mapping)
df['working_label'] = df['working'].map(working_mapping)
df['husband_occupation_label'] = df['husband_occupation'].map(husband_occupation_mapping)
df['std_index_label'] = df['std_index'].map(std_index_mapping)
df['media_exposure_label'] = df['media_exposure'].map(media_exposure_mapping)
#df['contraceptive_method_label'] = df['contraceptive_method'].map(contraceptive_method_mapping)


In [387]:
df.head()

Unnamed: 0,age,wife_education,husband_education,no_children,religion,working,husband_occupation,std_index,media_exposure,contraceptive_method,wife_education_label,husband_education_label,religion_label,working_label,husband_occupation_label,std_index_label,media_exposure_label
0,45,1,3,10,1,1,3,4,0,1,low,medium-high,Islam,No,medium-high,high,Good
1,43,2,3,7,1,1,3,4,0,1,medium-low,medium-high,Islam,No,medium-high,high,Good
2,42,3,2,9,1,1,3,3,0,1,medium-high,medium-low,Islam,No,medium-high,medium-high,Good
3,36,3,3,8,1,1,3,2,0,1,medium-high,medium-high,Islam,No,medium-high,medium-low,Good
4,19,4,4,0,1,1,3,3,0,1,high,high,Islam,No,medium-high,medium-high,Good


In [388]:
new_df = df[['age','wife_education_label','husband_education_label','no_children','religion_label','working_label',
             'husband_occupation_label','std_index_label','media_exposure_label','contraceptive_method']]

In [389]:
new_df['contraceptive_method_label']= new_df['contraceptive_method'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['contraceptive_method_label']= new_df['contraceptive_method'].astype('category')


In [390]:
new_df.head()

Unnamed: 0,age,wife_education_label,husband_education_label,no_children,religion_label,working_label,husband_occupation_label,std_index_label,media_exposure_label,contraceptive_method,contraceptive_method_label
0,45,low,medium-high,10,Islam,No,medium-high,high,Good,1,1
1,43,medium-low,medium-high,7,Islam,No,medium-high,high,Good,1,1
2,42,medium-high,medium-low,9,Islam,No,medium-high,medium-high,Good,1,1
3,36,medium-high,medium-high,8,Islam,No,medium-high,medium-low,Good,1,1
4,19,high,high,0,Islam,No,medium-high,medium-high,Good,1,1


In [391]:
new_df.dtypes

age                              int64
wife_education_label            object
husband_education_label         object
no_children                      int64
religion_label                  object
working_label                   object
husband_occupation_label        object
std_index_label                 object
media_exposure_label            object
contraceptive_method             int64
contraceptive_method_label    category
dtype: object

In [392]:
new_df = new_df.drop('contraceptive_method', axis=1)

In [393]:
new_df.dtypes

age                              int64
wife_education_label            object
husband_education_label         object
no_children                      int64
religion_label                  object
working_label                   object
husband_occupation_label        object
std_index_label                 object
media_exposure_label            object
contraceptive_method_label    category
dtype: object

In [394]:
new_df['wife_education_label'].unique()

array(['low', 'medium-low', 'medium-high', 'high'], dtype=object)

In [395]:
##OneHotEncode new_df
transformer = make_column_transformer((OneHotEncoder(),['wife_education_label','husband_education_label','religion_label','working_label',
                      'husband_occupation_label','std_index_label','media_exposure_label']), remainder = 'passthrough')

In [396]:
transformed = transformer.fit_transform(new_df)

In [397]:
transformed_new_df = pd.DataFrame(transformed, columns = transformer.get_feature_names())

In [398]:
print(transformed_new_df.head())

   onehotencoder__x0_high  onehotencoder__x0_low  \
0                     0.0                    1.0   
1                     0.0                    0.0   
2                     0.0                    0.0   
3                     0.0                    0.0   
4                     1.0                    0.0   

   onehotencoder__x0_medium-high  onehotencoder__x0_medium-low  \
0                            0.0                           0.0   
1                            0.0                           1.0   
2                            1.0                           0.0   
3                            1.0                           0.0   
4                            0.0                           0.0   

   onehotencoder__x1_high  onehotencoder__x1_low  \
0                     0.0                    0.0   
1                     0.0                    0.0   
2                     0.0                    0.0   
3                     0.0                    0.0   
4                     1.0     

In [399]:
transformed_new_df.dtypes

onehotencoder__x0_high           float64
onehotencoder__x0_low            float64
onehotencoder__x0_medium-high    float64
onehotencoder__x0_medium-low     float64
onehotencoder__x1_high           float64
onehotencoder__x1_low            float64
onehotencoder__x1_medium-high    float64
onehotencoder__x1_medium-low     float64
onehotencoder__x2_Islam          float64
onehotencoder__x2_Non-Islam      float64
onehotencoder__x3_No             float64
onehotencoder__x3_Yes            float64
onehotencoder__x4_high           float64
onehotencoder__x4_low            float64
onehotencoder__x4_medium-high    float64
onehotencoder__x4_medium-low     float64
onehotencoder__x5_high           float64
onehotencoder__x5_low            float64
onehotencoder__x5_medium-high    float64
onehotencoder__x5_medium-low     float64
onehotencoder__x6_Good           float64
onehotencoder__x6_Not good       float64
age                              float64
no_children                      float64
contraceptive_me

In [400]:
#transformed_new_df['contraceptive_method_label2']= transformed_new_df['contraceptive_method_label'].astype('category')
transformed_new_df[['age_2','no_children_2']]= transformed_new_df[['age','no_children']].astype('int')

In [401]:
transformed_new_df.dtypes

onehotencoder__x0_high           float64
onehotencoder__x0_low            float64
onehotencoder__x0_medium-high    float64
onehotencoder__x0_medium-low     float64
onehotencoder__x1_high           float64
onehotencoder__x1_low            float64
onehotencoder__x1_medium-high    float64
onehotencoder__x1_medium-low     float64
onehotencoder__x2_Islam          float64
onehotencoder__x2_Non-Islam      float64
onehotencoder__x3_No             float64
onehotencoder__x3_Yes            float64
onehotencoder__x4_high           float64
onehotencoder__x4_low            float64
onehotencoder__x4_medium-high    float64
onehotencoder__x4_medium-low     float64
onehotencoder__x5_high           float64
onehotencoder__x5_low            float64
onehotencoder__x5_medium-high    float64
onehotencoder__x5_medium-low     float64
onehotencoder__x6_Good           float64
onehotencoder__x6_Not good       float64
age                              float64
no_children                      float64
contraceptive_me

In [402]:
transformed_new_df['contraceptive_method_label2']= transformed_new_df['contraceptive_method_label'].astype('category')

In [403]:
transformed_new_df = transformed_new_df.drop(['age','no_children','contraceptive_method_label'], axis=1)

In [404]:
transformed_new_df.dtypes

onehotencoder__x0_high            float64
onehotencoder__x0_low             float64
onehotencoder__x0_medium-high     float64
onehotencoder__x0_medium-low      float64
onehotencoder__x1_high            float64
onehotencoder__x1_low             float64
onehotencoder__x1_medium-high     float64
onehotencoder__x1_medium-low      float64
onehotencoder__x2_Islam           float64
onehotencoder__x2_Non-Islam       float64
onehotencoder__x3_No              float64
onehotencoder__x3_Yes             float64
onehotencoder__x4_high            float64
onehotencoder__x4_low             float64
onehotencoder__x4_medium-high     float64
onehotencoder__x4_medium-low      float64
onehotencoder__x5_high            float64
onehotencoder__x5_low             float64
onehotencoder__x5_medium-high     float64
onehotencoder__x5_medium-low      float64
onehotencoder__x6_Good            float64
onehotencoder__x6_Not good        float64
age_2                               int32
no_children_2                     

In [405]:
transformed_new_df.head()

Unnamed: 0,onehotencoder__x0_high,onehotencoder__x0_low,onehotencoder__x0_medium-high,onehotencoder__x0_medium-low,onehotencoder__x1_high,onehotencoder__x1_low,onehotencoder__x1_medium-high,onehotencoder__x1_medium-low,onehotencoder__x2_Islam,onehotencoder__x2_Non-Islam,...,onehotencoder__x4_medium-low,onehotencoder__x5_high,onehotencoder__x5_low,onehotencoder__x5_medium-high,onehotencoder__x5_medium-low,onehotencoder__x6_Good,onehotencoder__x6_Not good,age_2,no_children_2,contraceptive_method_label2
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,45,10,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,43,7,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,42,9,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,36,8,1.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,19,0,1.0


In [406]:
inputs = transformed_new_df.drop('contraceptive_method_label2', axis='columns')
target = transformed_new_df['contraceptive_method_label2']
inputs.head(5)

Unnamed: 0,onehotencoder__x0_high,onehotencoder__x0_low,onehotencoder__x0_medium-high,onehotencoder__x0_medium-low,onehotencoder__x1_high,onehotencoder__x1_low,onehotencoder__x1_medium-high,onehotencoder__x1_medium-low,onehotencoder__x2_Islam,onehotencoder__x2_Non-Islam,...,onehotencoder__x4_medium-high,onehotencoder__x4_medium-low,onehotencoder__x5_high,onehotencoder__x5_low,onehotencoder__x5_medium-high,onehotencoder__x5_medium-low,onehotencoder__x6_Good,onehotencoder__x6_Not good,age_2,no_children_2
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,45,10
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,43,7
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,42,9
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,36,8
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,19,0


In [407]:
from sklearn.model_selection import train_test_split
X = inputs
y = target
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33)

In [408]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [409]:
clf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [410]:
predictions = clf.predict(X_test)
predictions

array([1., 1., 2., 2., 3., 2., 1., 2., 3., 3., 1., 1., 1., 2., 1., 2., 1.,
       3., 2., 2., 1., 1., 3., 3., 2., 1., 1., 1., 1., 1., 2., 2., 1., 3.,
       2., 3., 2., 1., 2., 3., 2., 1., 3., 1., 2., 1., 3., 1., 1., 3., 2.,
       1., 3., 2., 1., 1., 3., 1., 3., 2., 1., 3., 3., 3., 2., 1., 2., 3.,
       1., 1., 2., 1., 1., 1., 2., 3., 1., 1., 3., 3., 1., 1., 3., 1., 1.,
       3., 3., 1., 1., 3., 2., 1., 2., 2., 3., 1., 3., 1., 2., 2., 1., 3.,
       3., 1., 3., 2., 3., 1., 1., 2., 2., 1., 3., 3., 1., 1., 3., 1., 1.,
       3., 1., 2., 1., 2., 3., 1., 1., 2., 3., 3., 1., 1., 2., 1., 3., 1.,
       1., 1., 3., 3., 3., 3., 1., 3., 3., 3., 1., 3., 2., 1., 2., 1., 3.,
       3., 2., 2., 3., 2., 1., 2., 1., 1., 2., 3., 3., 3., 2., 1., 1., 3.,
       1., 3., 3., 1., 3., 1., 3., 1., 1., 2., 1., 1., 2., 1., 1., 3., 3.,
       3., 1., 2., 3., 2., 2., 3., 3., 3., 3., 1., 1., 3., 1., 2., 1., 2.,
       3., 2., 2., 1., 1., 2., 1., 1., 3., 1., 2., 1., 3., 1., 1., 2., 2.,
       1., 1., 3., 3., 1.

In [411]:
clf.predict_proba(X_test)

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [412]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.4835390946502058

In [413]:
clf.score(inputs,target)

0.8118206521739131

In [414]:
new_df.head(5)

Unnamed: 0,age,wife_education_label,husband_education_label,no_children,religion_label,working_label,husband_occupation_label,std_index_label,media_exposure_label,contraceptive_method_label
0,45,low,medium-high,10,Islam,No,medium-high,high,Good,1
1,43,medium-low,medium-high,7,Islam,No,medium-high,high,Good,1
2,42,medium-high,medium-low,9,Islam,No,medium-high,medium-high,Good,1
3,36,medium-high,medium-high,8,Islam,No,medium-high,medium-low,Good,1
4,19,high,high,0,Islam,No,medium-high,medium-high,Good,1


In [415]:
inputs_2 = new_df.drop('contraceptive_method_label', axis='columns')
target_2 = new_df['contraceptive_method_label']

In [416]:
from sklearn.preprocessing import LabelEncoder

In [417]:
wife_education_label1 = LabelEncoder()
husband_education_label1 = LabelEncoder()
religion_label1 = LabelEncoder()
working_label1 = LabelEncoder()
husband_occupation_label1 = LabelEncoder()
std_index_label1 = LabelEncoder()
media_exposure_label1 = LabelEncoder()

In [418]:
inputs_2['wife_education_label1']= wife_education_label1.fit_transform(inputs_2['wife_education_label'])
inputs_2['husband_education_label1']= husband_education_label1.fit_transform(inputs_2['husband_education_label'])
inputs_2['religion_label1']= religion_label1.fit_transform(inputs_2['religion_label'])
inputs_2['working_label1']= working_label1.fit_transform(inputs_2['working_label'])
inputs_2['husband_occupation_label1']= husband_occupation_label1.fit_transform(inputs_2['husband_occupation_label'])
inputs_2['std_index_label1']=std_index_label1.fit_transform(inputs_2['std_index_label'])
inputs_2['media_exposure_label1']= media_exposure_label1.fit_transform(inputs_2['media_exposure_label'])
inputs_2.head(5)

Unnamed: 0,age,wife_education_label,husband_education_label,no_children,religion_label,working_label,husband_occupation_label,std_index_label,media_exposure_label,wife_education_label1,husband_education_label1,religion_label1,working_label1,husband_occupation_label1,std_index_label1,media_exposure_label1
0,45,low,medium-high,10,Islam,No,medium-high,high,Good,1,2,0,0,2,0,0
1,43,medium-low,medium-high,7,Islam,No,medium-high,high,Good,3,2,0,0,2,0,0
2,42,medium-high,medium-low,9,Islam,No,medium-high,medium-high,Good,2,3,0,0,2,2,0
3,36,medium-high,medium-high,8,Islam,No,medium-high,medium-low,Good,2,2,0,0,2,3,0
4,19,high,high,0,Islam,No,medium-high,medium-high,Good,0,0,0,0,2,2,0


In [425]:
inputs_n = inputs_2.drop(['wife_education_label','husband_education_label','religion_label',
                         'working_label','husband_occupation_label','std_index_label',
                         'media_exposure_label'],axis='columns')
inputs_n.head(5)

Unnamed: 0,age,no_children,wife_education_label1,husband_education_label1,religion_label1,working_label1,husband_occupation_label1,std_index_label1,media_exposure_label1
0,45,10,1,2,0,0,2,0,0
1,43,7,3,2,0,0,2,0,0
2,42,9,2,3,0,0,2,2,0
3,36,8,2,2,0,0,2,3,0
4,19,0,0,0,0,0,2,2,0


In [426]:
inputs_n.dtypes

age                          int64
no_children                  int64
wife_education_label1        int32
husband_education_label1     int32
religion_label1              int32
working_label1               int32
husband_occupation_label1    int32
std_index_label1             int32
media_exposure_label1        int32
dtype: object

In [420]:
from sklearn.model_selection import train_test_split
X = inputs_n
y = target_2
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33)

In [421]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [422]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.36213991769547327

In [423]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.36213991769547327

In [424]:
clf.score(inputs_n,target_2)

0.8009510869565217