# Decision Tree Algorithm:

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

### Importing data:

In [2]:
# Load the salary dataset (path is relative to the working directory) and show it.
df = pd.read_csv('salaries.csv')
df

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0
5,google,computer programmer,masters,1
6,abc pharma,sales executive,masters,0
7,abc pharma,computer programmer,bachelors,0
8,abc pharma,business manager,bachelors,0
9,abc pharma,business manager,masters,1


## Predicting whether an employee's salary is more than 100K or not:

### Preparing data:

In [3]:
# Target vector: the 0/1 flag column the classifier is trained to predict.
targets = df['salary_more_then_100k']
targets

0     0
1     0
2     1
3     1
4     0
5     1
6     0
7     0
8     0
9     1
10    1
11    1
12    1
13    1
14    1
15    1
Name: salary_more_then_100k, dtype: int64

In [4]:
# Feature frame: everything except the target column.
# drop() returns a new frame, so `df` itself is left untouched.
inputs = df.drop(columns=['salary_more_then_100k'])
inputs

Unnamed: 0,company,job,degree
0,google,sales executive,bachelors
1,google,sales executive,masters
2,google,business manager,bachelors
3,google,business manager,masters
4,google,computer programmer,bachelors
5,google,computer programmer,masters
6,abc pharma,sales executive,masters
7,abc pharma,computer programmer,bachelors
8,abc pharma,business manager,bachelors
9,abc pharma,business manager,masters


### Applying Label Encoder for all the non-numeric attributes:

In [5]:
# One independent encoder per categorical column (company, job, degree),
# so each one keeps its own label-to-integer mapping.
le_c, le_j, le_d = LabelEncoder(), LabelEncoder(), LabelEncoder()

In [6]:
# Fit each encoder on its text column and store the resulting integer codes
# in a new "<name>_n" column next to the original text.
for source_col, encoded_col, encoder in (
    ('company', 'company_n', le_c),
    ('job', 'job_n', le_j),
    ('degree', 'degree_n', le_d),
):
    inputs[encoded_col] = encoder.fit_transform(inputs[source_col])

In [7]:
# Display the frame with the encoded *_n columns appended to the original text columns.
inputs

Unnamed: 0,company,job,degree,company_n,job_n,degree_n
0,google,sales executive,bachelors,2,2,0
1,google,sales executive,masters,2,2,1
2,google,business manager,bachelors,2,0,0
3,google,business manager,masters,2,0,1
4,google,computer programmer,bachelors,2,1,0
5,google,computer programmer,masters,2,1,1
6,abc pharma,sales executive,masters,0,2,1
7,abc pharma,computer programmer,bachelors,0,1,0
8,abc pharma,business manager,bachelors,0,0,0
9,abc pharma,business manager,masters,0,0,1


In [8]:
# Keep only the numeric (label-encoded) columns; these are the model features.
inputs_n = inputs.drop(columns=['company', 'job', 'degree'])
inputs_n

Unnamed: 0,company_n,job_n,degree_n
0,2,2,0
1,2,2,1
2,2,0,0
3,2,0,1
4,2,1,0
5,2,1,1
6,0,2,1
7,0,1,0
8,0,0,0
9,0,0,1


### Building reference dictionaries for each non-numeric attribute:

In [9]:
# Lookup table: encoded company id -> original company name.
# drop_duplicates keeps the first row for every distinct code, so the mapping
# preserves first-appearance order without materialising a Series per row
# the way a positional `iloc` loop does.
company_codes = inputs[['company_n', 'company']].drop_duplicates(subset='company_n')
ref_c = dict(zip(company_codes['company_n'], company_codes['company']))
ref_c

{2: 'google', 0: 'abc pharma', 1: 'facebook'}

In [10]:
# Lookup table: encoded job id -> original job title.
# drop_duplicates keeps the first row for every distinct code, so the mapping
# preserves first-appearance order without materialising a Series per row
# the way a positional `iloc` loop does.
job_codes = inputs[['job_n', 'job']].drop_duplicates(subset='job_n')
ref_j = dict(zip(job_codes['job_n'], job_codes['job']))
ref_j

{2: 'sales executive', 0: 'business manager', 1: 'computer programmer'}

In [11]:
# Lookup table: encoded degree id -> original degree name.
# drop_duplicates keeps the first row for every distinct code, so the mapping
# preserves first-appearance order without materialising a Series per row
# the way a positional `iloc` loop does.
degree_codes = inputs[['degree_n', 'degree']].drop_duplicates(subset='degree_n')
ref_d = dict(zip(degree_codes['degree_n'], degree_codes['degree']))
ref_d

{0: 'bachelors', 1: 'masters'}

## Building Prediction Model:

In [12]:
# Seed the estimator so the fitted tree is reproducible across runs:
# scikit-learn permutes candidate features at every split, so ties between
# equally good splits may otherwise be broken differently from run to run.
model = DecisionTreeClassifier(random_state=0)

In [13]:
# Train the tree on the encoded features against the salary flag.
model.fit(X=inputs_n, y=targets)

DecisionTreeClassifier()

In [14]:
# Accuracy on the same rows the model was fitted on: this is a training
# score, not an estimate of performance on unseen data.
model.score(X=inputs_n, y=targets)

1.0

In [15]:
# Query rows are given as [company_n, job_n, degree_n].
# The model was fitted on a DataFrame, so pass the queries as a DataFrame with
# the same column names; a bare nested list makes scikit-learn warn that
# "X does not have valid feature names".
model.predict(pd.DataFrame([[2, 0, 1], [1, 0, 0]], columns=inputs_n.columns))



array([1, 1], dtype=int64)

In [16]:
# Query row is given as [company_n, job_n, degree_n].
# The model was fitted on a DataFrame, so pass the query as a DataFrame with
# the same column names; a bare nested list makes scikit-learn warn that
# "X does not have valid feature names".
model.predict(pd.DataFrame([[0, 0, 0]], columns=inputs_n.columns))



array([0], dtype=int64)