In [23]:
import pandas as pd

# Load the salaries dataset from a CSV file in the working directory
# and preview its first rows.
df = pd.read_csv("salaries.csv")
df.head()

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [24]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(16, 4)

In [25]:
# Several columns are text and must be converted to numbers before modelling.
# company, job are nominal variables  ------  one-hot encoding
# degree is an ordinal variable       ------  label (ordinal) encoding


## Feature Engineering : Label Encoding

In [27]:
# List the distinct degree levels so the ordinal mapping can cover all of them.
df["degree"].unique()

array(['bachelors', 'masters'], dtype=object)

In [28]:
# Ordinal encoding of the degree level: a higher number means a higher degree.
DEGREE_RANK = {'bachelors': 1, 'masters': 2}
df['degree_number'] = df['degree'].map(DEGREE_RANK)

# Series.map leaves NaN for any value that is not a key of the mapping, so fail
# loudly here instead of letting missing values flow silently into the model.
assert df['degree_number'].notna().all(), "unmapped or missing value in 'degree'"
df.head()

Unnamed: 0,company,job,degree,salary_more_then_100k,degree_number
0,google,sales executive,bachelors,0,1
1,google,sales executive,masters,0,2
2,google,business manager,bachelors,1,1
3,google,business manager,masters,1,2
4,google,computer programmer,bachelors,0,1


In [29]:
# Remove the raw text column; `degree_number` now carries the same information.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to run more than once: a second run is simply a no-op.
df = df.drop(columns='degree', errors='ignore')
df.head()

Unnamed: 0,company,job,salary_more_then_100k,degree_number
0,google,sales executive,0,1
1,google,sales executive,0,2
2,google,business manager,1,1
3,google,business manager,1,2
4,google,computer programmer,0,1


# Feature Engineering : One Hot Encoding

In [31]:
# Distinct company names; each one becomes a candidate dummy column.
df["company"].unique()

array(['google', 'abc pharma', 'facebook'], dtype=object)

In [32]:
# Distinct job titles; each one becomes a candidate dummy column.
df["job"].unique()

array(['sales executive', 'business manager', 'computer programmer'],
      dtype=object)

In [33]:
# One-hot encode the nominal columns. drop_first=True omits one level per
# column (here 'abc pharma' and 'business manager'), which then acts as the
# implicit baseline when all of that column's dummies are False.
nominal_cols = ["company", "job"]
df = pd.get_dummies(data=df, columns=nominal_cols, drop_first=True)
df.head()

Unnamed: 0,salary_more_then_100k,degree_number,company_facebook,company_google,job_computer programmer,job_sales executive
0,0,1,False,True,False,True
1,0,2,False,True,False,True
2,1,1,False,True,False,False
3,1,2,False,True,False,False
4,0,1,False,True,True,False


In [None]:
# Decision trees split on one feature threshold at a time, so the exact numeric
# values chosen for an encoding (e.g. 1/2 instead of 0/1) do not matter much.

# Model Training with Gini by default

In [47]:
# Features are every column except the target; the target is the binary
# "salary above 100k" flag.
X = df.drop(columns="salary_more_then_100k")
y = df["salary_more_then_100k"]

# No train_test_split here: the dataset has only 16 rows, so the model is
# fitted on all of them.
from sklearn.tree import DecisionTreeClassifier

# Gini impurity is the default split criterion. random_state is pinned because
# scikit-learn permutes the candidate features at every split, so equally good
# splits could otherwise produce a different tree on each run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)

In [54]:
# Predict on the same rows the model was fitted on (in-sample predictions).
y_pred = model.predict(X)

# Model Evaluation of Gini

In [57]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions, compared against the
# same labels the model was fitted on.
cr = classification_report(y_true=y, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10

    accuracy                           1.00        16
   macro avg       1.00      1.00      1.00        16
weighted avg       1.00      1.00      1.00        16



# Decision Tree with Gini

In [60]:
from sklearn.tree import export_text

# Render the fitted tree as indented if/else rules, labelled with the
# training column names.
rules_text = export_text(model, feature_names=X.columns.tolist())
print(rules_text)

|--- company_facebook <= 0.50
|   |--- job_sales executive <= 0.50
|   |   |--- degree_number <= 1.50
|   |   |   |--- company_google <= 0.50
|   |   |   |   |--- class: 0
|   |   |   |--- company_google >  0.50
|   |   |   |   |--- job_computer programmer <= 0.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- job_computer programmer >  0.50
|   |   |   |   |   |--- class: 0
|   |   |--- degree_number >  1.50
|   |   |   |--- class: 1
|   |--- job_sales executive >  0.50
|   |   |--- class: 0
|--- company_facebook >  0.50
|   |--- class: 1



# Model Training with entropy

In [64]:
from sklearn.tree import DecisionTreeClassifier
# Same classifier, but splits are scored with entropy (information gain)
# instead of Gini impurity. random_state is pinned so that ties between equally
# good splits resolve the same way on every run.
# NOTE: this rebinds `model`; any `y_pred` computed earlier still belongs to
# the previously fitted model and must be recomputed before evaluation.
model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.fit(X, y)

# Model Evaluation with Entropy

In [70]:
from sklearn.metrics import classification_report

# Recompute the predictions with the model currently bound to `model` (the
# entropy tree). Reusing the earlier `y_pred` would score the previously
# fitted model's predictions instead of this one's.
y_pred = model.predict(X)

cr = classification_report(y, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10

    accuracy                           1.00        16
   macro avg       1.00      1.00      1.00        16
weighted avg       1.00      1.00      1.00        16



# Decision Tree with Entropy

In [73]:
from sklearn.tree import export_text

# Render the currently fitted tree as indented if/else rules, labelled with
# the training column names.
entropy_rules = export_text(model, feature_names=X.columns.tolist())
print(entropy_rules)

|--- company_facebook <= 0.50
|   |--- job_sales executive <= 0.50
|   |   |--- degree_number <= 1.50
|   |   |   |--- company_google <= 0.50
|   |   |   |   |--- class: 0
|   |   |   |--- company_google >  0.50
|   |   |   |   |--- job_computer programmer <= 0.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- job_computer programmer >  0.50
|   |   |   |   |   |--- class: 0
|   |   |--- degree_number >  1.50
|   |   |   |--- class: 1
|   |--- job_sales executive >  0.50
|   |   |--- class: 0
|--- company_facebook >  0.50
|   |--- class: 1



# Model Prediction with unseen data

In [76]:
# Wrap the new rows in a DataFrame that reuses the training column names: the
# model was fitted on a DataFrame, and scikit-learn warns when predict() is
# given unnamed arrays. Values follow the column order of X:
# degree_number, company_facebook, company_google,
# job_computer programmer, job_sales executive.
unseen = pd.DataFrame(
    [
        [1, 1, 0, 1, 0],  # bachelors, facebook, computer programmer
        [1, 0, 1, 0, 1],  # bachelors, google, sales executive
    ],
    columns=X.columns,
)
model.predict(unseen)



array([1, 0], dtype=int64)

In [None]:
# Row 1 -> 1: salary > 100k  (bachelors, facebook, computer programmer)
# Row 2 -> 0: salary <= 100k (bachelors, google, sales executive)