In [1]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import tree
from sklearn.impute import KNNImputer
from imblearn.combine import SMOTEENN

In [2]:
df = pd.read_csv("salaries.csv")
df.head()

Unnamed: 0,company,job,degree,salary_more_then_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [3]:
df['salary_more_then_100k'].value_counts()

1    10
0     6
Name: salary_more_then_100k, dtype: int64

In [None]:
df.isna().sum()

company                  0
job                      0
degree                   0
salary_more_then_100k    0
dtype: int64

In [5]:
df.company.value_counts()

google        6
facebook      6
abc pharma    4
Name: company, dtype: int64

In [6]:
df.job.value_counts()

business manager       6
sales executive        5
computer programmer    5
Name: job, dtype: int64

In [7]:
df.degree.value_counts()

bachelors    8
masters      8
Name: degree, dtype: int64

In [8]:
X = df.drop('salary_more_then_100k',axis='columns')
y = df['salary_more_then_100k']

In [9]:
# Label encoding of categorical values
encoder_company = LabelEncoder()
encoder_job = LabelEncoder()
encoder_degree = LabelEncoder()
X['company_encoded'] = encoder_company.fit_transform(X['company'])
X['company_job '] = encoder_job.fit_transform(X['job'])
X['company_degree'] = encoder_degree.fit_transform(X['degree'])

In [10]:
X = X.drop(['company','job','degree'],axis='columns')
X.head()

Unnamed: 0,company_encoded,company_job,company_degree
0,2,2,0
1,2,2,1
2,2,0,0
3,2,0,1
4,2,1,0


In [11]:
model = tree.DecisionTreeClassifier()
model.fit(X,y)
model.score(X,y)

1.0

In [12]:
# Is salary of Google, Computer Engineer, Bachelors degree > 100 k ?
model.predict([[2,1,0]])



array([0], dtype=int64)

In [13]:
# Is salary of Google, Computer Engineer, Masters degree > 100 k ?
model.predict([[2,1,1]])



array([1], dtype=int64)

Titanic dataset

In [None]:
df = pd.read_csv('titanic.csv')
print(df.shape)
df.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [15]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [16]:
df.Survived.value_counts(normalize=True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [17]:
df = df[['Pclass','Age','Sex','Fare','Survived']]

In [18]:
df.describe()

Unnamed: 0,Pclass,Age,Fare,Survived
count,891.0,714.0,891.0,891.0
mean,2.308642,29.699118,32.204208,0.383838
std,0.836071,14.526497,49.693429,0.486592
min,1.0,0.42,0.0,0.0
25%,2.0,20.125,7.9104,0.0
50%,3.0,28.0,14.4542,0.0
75%,3.0,38.0,31.0,1.0
max,3.0,80.0,512.3292,1.0


In [19]:
# handle missing values in age column
imputer = KNNImputer(n_neighbors=5)
df[['Age']] = imputer.fit_transform(df[['Age']])

In [20]:
# one hot encoding of categorical values
ohe = OneHotEncoder(drop='first')
categorical_columns = ['Sex']
one_hot_encoded = ohe.fit_transform(df[categorical_columns])
one_hot_df = pd.DataFrame(one_hot_encoded.toarray(), columns=ohe.get_feature_names_out(categorical_columns))
df_encoded = pd.concat([one_hot_df, df.drop(categorical_columns,axis=1)],axis=1)

In [21]:
X= df_encoded.drop('Survived', axis=1)
y= df_encoded[['Survived']]

In [22]:
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)



In [23]:
print(X.shape, X_resampled.shape)

(891, 4) (534, 4)


In [24]:
model = tree.DecisionTreeClassifier()
model.fit(X,y)
model.score(X,y)

0.9797979797979798

In [26]:
model_resmapled = tree.DecisionTreeClassifier()
model_resmapled.fit(X_resampled,y_resampled)
model.score(X_resampled,y_resampled)

1.0