In [36]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


data = pd.read_csv("salaries.csv")

# (a) 
X = data[['company', 'job', 'degree']]  # Independent variables
Y = data['salary_more_then_100k']  # Target variable

# (b) 
label_encoder = LabelEncoder()
X['company'] = label_encoder.fit_transform(X['company'])
X['job'] = label_encoder.fit_transform(X['job'])
X['degree'] = label_encoder.fit_transform(X['degree'])

# (c) 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# (d) 
model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

# (e) 
Y_pred = model.predict(X_test)

# (f) 
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy of the model:", accuracy)

# (g)
# i. Random state = 0
model_rs = DecisionTreeClassifier(random_state=0)
model_rs.fit(X_train, Y_train)
Y_pred_rs = model_rs.predict(X_test)
accuracy_rs = accuracy_score(Y_test, Y_pred_rs)
print("Accuracy of the model with random state = 0:", accuracy_rs)

# ii. Maximum depth = 3
model_md = DecisionTreeClassifier(max_depth=3)
model_md.fit(X_train, Y_train)
Y_pred_md = model_md.predict(X_test)
accuracy_md = accuracy_score(Y_test, Y_pred_md)
print("Accuracy of the model with maximum depth = 3:", accuracy_md)

# iii. Information criterion is entropy
model_entropy = DecisionTreeClassifier(criterion='entropy')
model_entropy.fit(X_train, Y_train)
Y_pred_entropy = model_entropy.predict(X_test)
accuracy_entropy = accuracy_score(Y_test, Y_pred_entropy)
print("Accuracy of the model with information criterion as entropy:", accuracy_entropy)

# (h) 

employee_data = {'company': ['google'], 'job': ['business manager'], 'degree': ['bachelors']}
employee_df = pd.DataFrame(employee_data)


employee_df['company'] = label_encoder.transform(employee_df['company'])
employee_df['job'] = label_encoder.transform(employee_df['job'])
employee_df['degree'] = label_encoder.transform(employee_df['degree'])


prediction = model.predict(employee_df)


if prediction[0] == 1:
    print("The predicted salary for the given employee is more than 100k.")
else:
    print("The predicted salary for the given employee is not more than 100k.")





Accuracy of the model: 0.75
Accuracy of the model with random state = 0: 0.75
Accuracy of the model with maximum depth = 3: 0.75
Accuracy of the model with information criterion as entropy: 0.75


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['company'] = label_encoder.fit_transform(X['company'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['job'] = label_encoder.fit_transform(X['job'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['degree'] = label_encoder.fit_transform(X['degree'])
