## **Import required Libraries** 

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## **Load the Data** 

In [26]:
df = pd.read_csv('mldata.csv')
# df.head()
print("hello")

hello


In [27]:
print('The shape of our training set: %s professionals and %s features'%(df.shape[0],df.shape[1]))

The shape of our training set: 6901 professionals and 20 features


## **Feature Engineering** 

### (1) Binary Encoding for Categorical Variables

In [28]:
cols = df[["self-learning capability?", "Extra-courses did","Taken inputs from seniors or elders", "worked in teams ever?", "Introvert"]]
for i in cols:
    cleanup_nums = {i: {"yes": 1, "no": 0}}
    df = df.replace(cleanup_nums)

In [29]:
print("\n\nList of Categorical features: \n" , df.select_dtypes(include=['object']).columns.tolist())



List of Categorical features: 
 ['certifications', 'workshops', 'reading and writing skills', 'memory capability score', 'Interested subjects', 'interested career area ', 'Type of company want to settle in?', 'Interested Type of Books', 'Management or Technical', 'hard/smart worker', 'Suggested Job Role']


### (2) Number Encoding for Categorical 

In [30]:
mycol = df[["reading and writing skills", "memory capability score"]]
for i in mycol:
    cleanup_nums = {i: {"poor": 0, "medium": 1, "excellent": 2}}
    df = df.replace(cleanup_nums)

category_cols = df[['certifications', 'workshops', 'Interested subjects', 'interested career area ', 'Type of company want to settle in?', 
                    'Interested Type of Books']]
dic={}
for i in category_cols:
    df[i] = df[i].astype('category')
    df[i + "_code"] = df[i].cat.codes
    dic[i]=dict(enumerate(df[i].cat.categories))
    # print(dic[i])

print("\n\nList of Categorical features: \n" , df.select_dtypes(include=['object']).columns.tolist())



List of Categorical features: 
 ['Management or Technical', 'hard/smart worker', 'Suggested Job Role']


### (3) Dummy Variable Encoding

In [31]:
print(df['Management or Technical'].unique())
print(df['hard/smart worker'].unique())

['Management' 'Technical']
['smart worker' 'hard worker']


In [34]:
df = pd.get_dummies(df, columns=["Management or Technical", "hard/smart worker"], prefix=["A", "B"])
df.head()

Unnamed: 0,Logical quotient rating,hackathons,coding skills rating,public speaking points,self-learning capability?,Extra-courses did,certifications,workshops,reading and writing skills,memory capability score,...,certifications_code,workshops_code,Interested subjects_code,interested career area _code,Type of company want to settle in?_code,Interested Type of Books_code,A_Management,A_Technical,B_hard worker,B_smart worker
0,5,0,6,2,1,0,information security,testing,0,0,...,4,6,9,5,0,28,1,0,0,1
1,7,6,4,3,0,1,shell programming,testing,2,1,...,8,6,2,4,1,3,0,1,1,0
2,2,3,9,1,0,1,information security,testing,2,0,...,4,6,5,0,9,29,0,1,0,1
3,2,6,3,5,0,1,r programming,database security,2,0,...,7,2,7,5,7,13,1,0,0,1
4,2,0,3,4,1,0,distro making,game development,2,1,...,1,3,3,4,0,14,0,1,1,0


In [32]:
print("List of Numerical features: \n" , df.select_dtypes(include=np.number).columns.tolist())

List of Numerical features: 
 ['Logical quotient rating', 'hackathons', 'coding skills rating', 'public speaking points', 'self-learning capability?', 'Extra-courses did', 'reading and writing skills', 'memory capability score', 'Taken inputs from seniors or elders', 'worked in teams ever?', 'Introvert', 'certifications_code', 'workshops_code', 'Interested subjects_code', 'interested career area _code', 'Type of company want to settle in?_code', 'Interested Type of Books_code']


## **Building Machine Learning Model**

In [35]:
feed = df[['Logical quotient rating', 'coding skills rating', 'hackathons', 'public speaking points', 'self-learning capability?','Extra-courses did', 
           'Taken inputs from seniors or elders', 'worked in teams ever?', 'Introvert', 'reading and writing skills', 'memory capability score',  
           'B_hard worker', 'B_smart worker', 'A_Management', 'A_Technical', 'Interested subjects_code', 'Interested Type of Books_code', 'certifications_code', 
           'workshops_code', 'Type of company want to settle in?_code',  'interested career area _code',
             'Suggested Job Role']]

# Taking all independent variable columns
df_train_x = feed.drop('Suggested Job Role',axis = 1)

# Target variable column
df_train_y = feed['Suggested Job Role']

x_train, x_test, y_train, y_test = train_test_split(df_train_x, df_train_y, test_size=0.20, random_state=42)


### **(1) Decision Tree Classifier**

In [49]:
dtree = DecisionTreeClassifier(random_state=1)
dtree = dtree.fit(x_train, y_train)

y_pred = dtree.predict(x_test)
cm = confusion_matrix(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("confusion matrics=",cm)
print("  ")
print("accuracy=",accuracy*10)

confusion matrics= [[11 11 11  8 13 15 12 10  7 10 14 10]
 [10  4  6  8  5  7 11 13 17  7  8  9]
 [13  9 11  8 12 10 11 12  5 10  8 14]
 [ 5 10 12  7  5  8  6 10  4  7 14 12]
 [ 8 11  9 12 14 10  6  8  7  9 16 11]
 [12 13  8  9 13 12  4  7 11 10 11  3]
 [ 8 15 12  9  9  7 15  7  7 11 11  5]
 [10  9  7  3 14 11 18 11 15  8  6  6]
 [11  7 10 13  7 12 12  8  7  6  7 14]
 [ 9 10 10  8 18  8  7  6  9  9 11  6]
 [ 7 14  5 13  9 10 11  9  9 10 12 10]
 [ 9 10  9 15 10  9  7  5 14  7  6  8]]
  
accuracy= 0.8761766835626359


### **Predicting class for one instance**

In [57]:
userdata = [['7','6','6','8','3','5','4', '4', '7', '3', '3', '6','8', 
                    '7','5','7','4','0','1','0','1']]
ynewclass = dtree.predict(userdata)
ynew = dtree.predict_proba(userdata)
print(ynewclass)
print("Probabilities of all classes: ", ynew)
print("Probability of Predicted class : ", np.max(ynew))

['Web Developer']
Probabilities of all classes:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
Probability of Predicted class :  1.0




### **(2) Supprt Vector Machine Classifier**

In [37]:
svm = svm.SVC()
svm.fit(x_train, y_train)
svm_y_pred = svm.predict(x_test)
svm_cm = confusion_matrix(y_test,svm_y_pred)
svm_accuracy = accuracy_score(y_test,svm_y_pred)
print("confusion matrics=",svm_cm)
print("  ")
print("accuracy=",svm_accuracy*100)

confusion matrics= [[ 1  5  0  6 41 20 18  6 15 12  6  2]
 [ 1  7  0  3 44  5 15 13  3  9  3  2]
 [ 0  5  2  5 44 14 19  9  6 10  6  3]
 [ 2  7  1  3 41 13  9  6  3 10  2  3]
 [ 2  7  3  4 40 17 18  9  4 13  1  3]
 [ 1 10  0  4 44 14 12 13  4  8  2  1]
 [ 1  6  3  2 40  7 13 17 12  8  4  3]
 [ 0  7  1  6 37 12 16 13 11 11  4  0]
 [ 0  9  0  3 37 13 14 13  6 12  2  5]
 [ 0  8  0  3 43 13 15  8  5 11  3  2]
 [ 1  8  1  3 40 16 18 12  4 10  4  2]
 [ 2  9  6  2 33 13 10 12 10  9  2  1]]
  
accuracy= 8.32729905865315


In [34]:
ynewclass = svm.predict(userdata)
ynew = svm.decision_function(userdata)
print(ynewclass)
print("Probabilities of all classes: ", ynew)
print("Probability of Predicted class : ", np.max(ynew))

['Software Engineer']
Probabilities of all classes:  [[ 0.72317165  8.25769355 -0.2843185   7.17416306  3.83458056  4.87344345
  10.27976749  9.2644453   1.73122697  7.04310678 10.27669569  2.77294512]]
Probability of Predicted class :  10.279767490035844




### **(3) Random Forest Classifier**

In [36]:
rf = RandomForestClassifier(random_state = 10)
rf.fit(x_train, y_train)
rfc_y_pred = rf.predict(x_test)
rfc_cm = confusion_matrix(y_test,rfc_y_pred)
rfc_accuracy = accuracy_score(y_test,rfc_y_pred)
print("confusion matrics=",rfc_cm)
print("  ")
print("accuracy=",rfc_accuracy*10)

confusion matrics= [[ 9 13 16  3 10 14 13 13 12  9 12  8]
 [ 8 13  9  7 19  8  5  7  4 10  8  7]
 [11 10  6 12 15  9 12 12 11  8  8  9]
 [ 9  8  5 13 25  4  6  6  7  6  4  7]
 [11 13 12  8 17  9  8  4 10  8 12  9]
 [ 6  8 12  6 18  6  4 12  9  7 14 11]
 [ 8  8 10 10 21  6 13  9 11  7  7  6]
 [11  8  9  8 21 11 13  8  8  7  6  8]
 [ 7 12 10  5 16 18 12  8 10  5  5  6]
 [ 6 10 14  6  9 12 12  9  9 10  5  9]
 [10 12 10  9 12  9 20  7  6 11  9  4]
 [10  9 11  8 14 11  9  8  7  7  8  7]]
  
accuracy= 0.8761766835626359


In [36]:

ynewclass = rf.predict(userdata)
ynew = rf.predict_proba(userdata)
print(ynewclass)
print("Probabilities of all classes: ", ynew)
print("Probability of Predicted class : ", np.max(ynew))

['Web Developer']
Probabilities of all classes:  [[0.02 0.12 0.09 0.03 0.12 0.07 0.12 0.06 0.07 0.1  0.07 0.13]]
Probability of Predicted class :  0.13




### **Prediction for one instance**

In [58]:
userdata = [['7','6','6','8','3','5','4', '4', '7', '3', '3', '6','8', 
                    '7','5','7','4','0','1','1','0']]
ynewclass = dtree.predict(userdata)
ynew = dtree.predict_proba(userdata)
print(ynewclass)
print("Probabilities of all classes: ", ynew)
print("Probability of Predicted class : ", np.max(ynew))

['Web Developer']
Probabilities of all classes:  [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
Probability of Predicted class :  1.0




In [40]:
import pickle 
pickle.dump(dtree,open('weights.pkl','wb'))

In [170]:
data=['5','4','3','0','yes','no','information security','testing','poor','medium','data engineering','testing','BPA','yes','Travel','Technical','smart worker','yes','no']


In [171]:
# print(data)
data=json=sys.argv[1]

['5', '4', '3', '0', 'yes', 'no', 'information security', 'testing', 'poor', 'medium', 'data engineering', 'testing', 'BPA', 'yes', 'Travel', 'Technical', 'smart worker', 'yes', 'no']


## **User data**

In [172]:
test_data=[]

test_data.append([data[0],data[1],data[2],data[3]])

In [173]:
#self learning and extra course
if(data[4]=='no'):
    test_data[0].append('0')
else:
    test_data[0].append('1')

if(data[5]=='no'):
    test_data[0].append('0')
else:
    test_data[0].append('1')        

In [15]:
category_cols = df[['certifications', 'workshops', 'Interested subjects', 'interested career area ', 'Type of company want to settle in?', 
                    'Interested Type of Books']]
                    
dict1=dic['certifications']
dict2=dic['workshops']
dict3=dic['Interested subjects']
dict4=dic['interested career area ']
dict5=dic['Type of company want to settle in?']
dict6=dic['Interested Type of Books'] 

key = list(filter(lambda x: dict1[x] == data[6], dict1))
test_data[0].append(str(key[0]))

key = list(filter(lambda x: dict2[x] == data[7], dict2))
test_data[0].append(str(key[0]))


NameError: name 'dic' is not defined

In [175]:
# for reading and memory
x=data[8]
if x=='poor':
    test_data[0].append('0')
elif x=='medium':
    test_data[0].append('1')
else:
    test_data[0].append('2')   

x=data[9]
if x=='poor':
    test_data[0].append('0')
elif x=='medium':
    test_data[0].append('1')
else:
    test_data[0].append('2')


In [176]:
# for subjects, career and company
key = list(filter(lambda x: dict3[x] == data[10], dict3))
test_data[0].append(str(key[0]))


key = list(filter(lambda x: dict4[x] == data[11], dict4))
test_data[0].append(str(key[0]))


key = list(filter(lambda x: dict5[x] == data[12], dict5))
test_data[0].append(str(key[0]))


In [177]:
# for input from seniors

if data[13]=='no':
    test_data[0].append('0')
else:
    test_data[0].append('1')


In [178]:
#for books
key = list(filter(lambda x: dict6[x] == data[14], dict6))
test_data[0].append(str(key[0]))

In [179]:
# for management and hard work
# manage tech 
# hard smart

if data[15]=='Management':
    test_data[0].append('1')
    test_data[0].append('0')
else:
    test_data[0].append('0')
    test_data[0].append('1')    


if data[16]=='Hard worker':
    test_data[0].append('1')
    test_data[0].append('0')
else:
    test_data[0].append('0')
    test_data[0].append('1')    

In [180]:
#worked in team and introvert
if data[17]=='no':
    test_data[0].append('0')
else:
    test_data[0].append('1')

if data[18]=='no':
    test_data[0].append('0')
else:
    test_data[0].append('1')        

In [192]:
ans=rf.predict(test_data)
print(ans)

['Software Quality Assurance (QA) / Testing']


