In [2]:
# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize


In [3]:
df= pd.read_csv(r"C:\Users\adishree.b\Downloads\is_promoted.csv")
df.head(2)

Unnamed: 0,employee_id,department,education,gender,no_of_trainings,age,previous_year_rating,length_of_service,is_promoted
0,65438,Sales & Marketing,Master's & above,f,1,35,5.0,8,0
1,65141,Operations,Bachelor's,m,1,30,5.0,4,0


In [4]:
df.drop(['employee_id'],axis=1,inplace=True)

In [5]:
df.dtypes

department               object
education                object
gender                   object
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
is_promoted               int64
dtype: object

In [6]:
df.isnull().sum()

department                 0
education               2409
gender                     0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
is_promoted                0
dtype: int64

In [7]:
df.shape

(54808, 8)

In [8]:
df['previous_year_rating'].fillna(df['previous_year_rating'].mean(),inplace=True)

In [9]:
df.isnull().sum()

department                 0
education               2409
gender                     0
no_of_trainings            0
age                        0
previous_year_rating       0
length_of_service          0
is_promoted                0
dtype: int64

In [10]:
df.dropna(inplace=True)

In [11]:
df.shape

(52399, 8)

In [12]:
df['education'].value_counts()

Bachelor's          36669
Master's & above    14925
Below Secondary       805
Name: education, dtype: int64

In [13]:
variables = df[['department','education','gender']]
df_dum = pd.get_dummies(variables, drop_first=False)

In [14]:
df_dum.columns

Index(['department_Analytics', 'department_Finance', 'department_HR',
       'department_Legal', 'department_Operations', 'department_Procurement',
       'department_R&D', 'department_Sales & Marketing',
       'department_Technology', 'education_Bachelor's',
       'education_Below Secondary', 'education_Master's & above', 'gender_f',
       'gender_m'],
      dtype='object')

In [15]:
df_dum.head()

Unnamed: 0,department_Analytics,department_Finance,department_HR,department_Legal,department_Operations,department_Procurement,department_R&D,department_Sales & Marketing,department_Technology,education_Bachelor's,education_Below Secondary,education_Master's & above,gender_f,gender_m
0,0,0,0,0,0,0,0,1,0,0,0,1,1,0
1,0,0,0,0,1,0,0,0,0,1,0,0,0,1
2,0,0,0,0,0,0,0,1,0,1,0,0,0,1
3,0,0,0,0,0,0,0,1,0,1,0,0,0,1
4,0,0,0,0,0,0,0,0,1,1,0,0,0,1


In [16]:
data = df.join(df_dum)

# can do the same by pd.merge(df,df_dum,left_index=True, right_index=True)

In [17]:
data.columns

Index(['department', 'education', 'gender', 'no_of_trainings', 'age',
       'previous_year_rating', 'length_of_service', 'is_promoted',
       'department_Analytics', 'department_Finance', 'department_HR',
       'department_Legal', 'department_Operations', 'department_Procurement',
       'department_R&D', 'department_Sales & Marketing',
       'department_Technology', 'education_Bachelor's',
       'education_Below Secondary', 'education_Master's & above', 'gender_f',
       'gender_m'],
      dtype='object')

In [18]:
data.drop(variables,axis=1,inplace=True)

In [19]:
data = data.astype(float)

In [30]:
data.columns

Index(['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'is_promoted', 'department_Analytics', 'department_Finance',
       'department_HR', 'department_Legal', 'department_Operations',
       'department_Procurement', 'department_R&D',
       'department_Sales & Marketing', 'department_Technology',
       'education_Bachelor's', 'education_Below Secondary',
       'education_Master's & above', 'gender_f', 'gender_m'],
      dtype='object')

In [23]:
melb_target.value_counts()

0.0    47853
1.0     4546
Name: is_promoted, dtype: int64

In [24]:
# matplotlib and mpl_toolkits are not necessary. We employ them for the sole purpose of visualizing the results.  
import matplotlib.pyplot as plt
import tensorflow as tf

In [32]:
#data['age']=data['age']/data['age'].max()
#data['no_of_trainings']=data['no_of_trainings']/data['no_of_trainings'].max()
#data['previous_year_rating']=data['previous_year_rating']/data['previous_year_rating'].max()

from sklearn import preprocessing

x = data[['no_of_trainings', 'age', 'previous_year_rating']] #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_x = pd.DataFrame(x_scaled)

In [37]:
data[['no_of_trainings', 'age', 'previous_year_rating']]=df_x

In [40]:
generated_targets = data['is_promoted']

generated_inputs = data.drop(['is_promoted'], axis=1)

# save into an npz file called "TF_intro"
np.savez('TF_intro', inputs=generated_inputs, targets=generated_targets)

In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(generated_inputs,generated_targets,
                                                    train_size=0.7, 
                                                    test_size=0.3, 
                                                    random_state=2)

print("X_train : " + str(X_train.shape))
print("X_test : " + str(X_test.shape))
print("y_train : " + str(y_train.shape))
print("y_test : " + str(y_test.shape))

X_train : (36679, 18)
X_test : (15720, 18)
y_train : (36679,)
y_test : (15720,)


In [42]:
input_size = 18
output_size = 2
# Use same hidden layer size for both hidden layers. Not a necessity.
hidden_layer_size = 10
    
# define how the model will look like
model = tf.keras.Sequential([
    
    # tf.keras.layers.Dense is basically implementing: output = activation(dot(input, weight) + bias)
    # it takes several arguments, but the most important ones for us are the hidden_layer_size and the activation function
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), # 1st hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='tanh'), # 2nd hidden layer
    
    # the final layer is no different, we just make sure to activate it with softmax
    tf.keras.layers.Dense(output_size, activation='softmax') # output layer
])

In [43]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [45]:
# determine the maximum number of epochs
NUM_EPOCHS = 10

# we fit the model, specifying the
# training data
# the total number of epochs
# and the validation data we just created ourselves in the format: (inputs,targets)
model.fit(X_train, epochs=5, verbose=2)

ValueError: Please provide as model inputs either a single array or a list of arrays. You passed: x=       no_of_trainings    age  previous_year_rating  length_of_service  \
15259         0.000000  0.700                  0.50                1.0   
34280         0.000000  0.400                  0.75                7.0   
1675          0.000000  0.425                  0.75               28.0   
44803         0.111111  0.675                  0.75                3.0   
38883         0.111111  0.250                  0.00                1.0   
...                ...    ...                   ...                ...   
46629         0.000000  0.400                  0.50                5.0   
31726         0.000000  0.925                  0.75                7.0   
6975          0.222222  0.175                  0.75               17.0   
37010         0.000000  0.350                  0.75                8.0   
24845         0.111111  0.350                  1.00                8.0   

       department_Analytics  department_Finance  department_HR  \
15259                   0.0                 0.0            0.0   
34280                   0.0                 0.0            0.0   
1675                    0.0                 0.0            0.0   
44803                   0.0                 0.0            0.0   
38883                   0.0                 0.0            1.0   
...                     ...                 ...            ...   
46629                   0.0                 0.0            0.0   
31726                   0.0                 0.0            0.0   
6975                    0.0                 0.0            0.0   
37010                   0.0                 0.0            0.0   
24845                   0.0                 0.0            0.0   

       department_Legal  department_Operations  department_Procurement  \
15259               0.0                    0.0                     0.0   
34280               0.0                    1.0                     0.0   
1675                0.0                    0.0                     0.0   
44803               0.0                    1.0                     0.0   
38883               0.0                    0.0                     0.0   
...                 ...                    ...                     ...   
46629               0.0                    1.0                     0.0   
31726               0.0                    0.0                     0.0   
6975                0.0                    0.0                     1.0   
37010               0.0                    1.0                     0.0   
24845               0.0                    0.0                     0.0   

       department_R&D  department_Sales & Marketing  department_Technology  \
15259             0.0                           0.0                    1.0   
34280             0.0                           0.0                    0.0   
1675              0.0                           1.0                    0.0   
44803             0.0                           0.0                    0.0   
38883             0.0                           0.0                    0.0   
...               ...                           ...                    ...   
46629             0.0                           0.0                    0.0   
31726             0.0                           0.0                    1.0   
6975              0.0                           0.0                    0.0   
37010             0.0                           0.0                    0.0   
24845             0.0                           1.0                    0.0   

       education_Bachelor's  education_Below Secondary  \
15259                   1.0                        0.0   
34280                   0.0                        0.0   
1675                    0.0                        0.0   
44803                   0.0                        0.0   
38883                   0.0                        1.0   
...                     ...                        ...   
46629                   0.0                        0.0   
31726                   1.0                        0.0   
6975                    1.0                        0.0   
37010                   0.0                        0.0   
24845                   1.0                        0.0   

       education_Master's & above  gender_f  gender_m  
15259                         0.0       0.0       1.0  
34280                         1.0       0.0       1.0  
1675                          1.0       0.0       1.0  
44803                         1.0       0.0       1.0  
38883                         0.0       0.0       1.0  
...                           ...       ...       ...  
46629                         1.0       0.0       1.0  
31726                         0.0       0.0       1.0  
6975                          0.0       0.0       1.0  
37010                         1.0       0.0       1.0  
24845                         0.0       0.0       1.0  

[36679 rows x 18 columns]