# Import important library

In [13]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# Load dataset

In [14]:
job_title = pd.read_csv('Job titles and industries.csv')

print(job_title.shape)
job_title.head()

(8586, 2)


Unnamed: 0,job title,industry
0,technical support and helpdesk supervisor - co...,IT
1,senior technical support engineer,IT
2,head of it services,IT
3,js front end engineer,IT
4,network and telephony controller,IT


In [15]:
job_title['industry'].value_counts()

IT             4746
Marketing      2031
Education      1435
Accountancy     374
Name: industry, dtype: int64

In [16]:
job_title['industry'].value_counts()

IT             4746
Marketing      2031
Education      1435
Accountancy     374
Name: industry, dtype: int64

In [17]:
job_title.isna().sum()

job title    0
industry     0
dtype: int64

In [18]:
job_title.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8586 entries, 0 to 8585
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   job title  8586 non-null   object
 1   industry   8586 non-null   object
dtypes: object(2)
memory usage: 134.3+ KB


In [20]:
job_title['job title'] = job_title['job title'].astype("string")
job_title['industry'] = job_title['industry'].astype("string")
job_title.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8586 entries, 0 to 8585
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   job title  8586 non-null   string
 1   industry   8586 non-null   string
dtypes: string(2)
memory usage: 134.3 KB


# split dataset into train-test 

In [21]:
data_randomized = job_title.sample(frac=1, random_state=1)

# Calculate index for split
training_test_index = round(len(data_randomized) * 0.8)

# Split into training and test sets
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(6869, 2)
(1717, 2)


In [22]:
training_set['industry'].value_counts(normalize=True) #show class counts in train set

IT             0.554375
Marketing      0.235551
Education      0.166400
Accountancy    0.043674
Name: industry, dtype: float64

In [23]:
test_set['industry'].value_counts(normalize=True) #show class counts in train set

IT             0.546302
Marketing      0.240536
Education      0.170064
Accountancy    0.043098
Name: industry, dtype: float64

In [24]:
test_set['industry'].unique()

<StringArray>
['IT', 'Marketing', 'Education', 'Accountancy']
Length: 4, dtype: string

# preprocesssing 

In [25]:
training_set['job title'] = training_set['job title'].str.replace('\W', ' ') # Removes punctuation
training_set.head()

Unnamed: 0,job title,industry
0,it support,IT
1,junior net developer,IT
2,graduate software engineer,IT
3,sports coach in coventry,Education
4,1 1 sen ta asd ta lsa needed in ealing a...,Education


In [26]:
test_set['job title'] = test_set['job title'].str.replace('\W', ' ') # Removes punctuation
test_set.head()

Unnamed: 0,job title,industry
0,technical support and helpdesk supervisor co...,IT
1,senior php developer,Marketing
2,website owner,Marketing
3,research administrator,Marketing
4,client onboarding support,Marketing


In [27]:
# convert sentences into bag of words 
count_vect = CountVectorizer(analyzer='word', 
                              token_pattern=r'\b[a-zA-Z]{3,}\b',  
                              ngram_range=(1, 1)) #unigram only

tfidf_transformer = TfidfTransformer()

#convert sentence into bag of words using count vectorizer 
X_train_counts = count_vect.fit_transform(training_set['job title'])

#convert count matrix to a normalized term frequency 
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

X_train_tfidf.shape

(6869, 1671)

# Models

#### Naive bayes

In [28]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tfidf, training_set['industry'])

In [29]:
 from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(analyzer='word', token_pattern=r'\b[a-zA-Z]{3,}\b',  ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB(alpha=0.01))])
text_clf = text_clf.fit(training_set['job title'], training_set['industry'])

In [30]:
predicted = text_clf.predict(test_set['job title'])
print("Accuracy",np.mean(predicted == test_set['industry']))

Accuracy 0.918462434478742


### SVM

In [31]:
text_svm = Pipeline([('vect', CountVectorizer(analyzer='word', token_pattern=r'\b[a-zA-Z]{3,}\b',  ngram_range=(1, 1))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SVC(C=10,kernel='rbf',class_weight='balanced'))])

text_svm = text_svm.fit(training_set['job title'], training_set['industry'])

In [32]:
target_names = ['IT', 'Marketing', 'Education', 'Accountancy']

predicted_svm = text_svm.predict(test_set['job title'])
print("Accuracy",accuracy_score(test_set['industry'],predicted_svm))
print(confusion_matrix(test_set['industry'],predicted_svm,labels=target_names))
print(classification_report(test_set['industry'],predicted_svm,target_names=target_names))

Accuracy 0.9411764705882353
[[911  24   2   1]
 [ 28 375  10   0]
 [  7  14 270   1]
 [  5   7   2  60]]
              precision    recall  f1-score   support

          IT       0.97      0.81      0.88        74
   Marketing       0.95      0.92      0.94       292
   Education       0.96      0.97      0.96       938
 Accountancy       0.89      0.91      0.90       413

    accuracy                           0.94      1717
   macro avg       0.94      0.90      0.92      1717
weighted avg       0.94      0.94      0.94      1717



In [33]:
# grid search 
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
                   'clf__C' :[10,20,40,50]}
gs_clf_svm = GridSearchCV(text_svm, parameters_svm, n_jobs=-1)
gs_clf_svm.fit(training_set['job title'], training_set['industry'])

GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(token_pattern='\\b[a-zA-Z]{3,}\\b')),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        SVC(C=10, class_weight='balanced'))]),
             n_jobs=-1,
             param_grid={'clf__C': [10, 20, 40, 50],
                         'vect__ngram_range': [(1, 1), (1, 2)]})

In [34]:
gs_clf_svm.best_score_
gs_clf_svm.best_params_

{'clf__C': 10, 'vect__ngram_range': (1, 1)}

In [36]:
text_svm.predict(['php developer'])

array(['IT'], dtype=object)

In [37]:
!pip install joblib



# Save model

In [38]:
import joblib

joblib.dump(text_svm,'svm_model.pkl')

['svm_model.pkl']

In [39]:
filePath = 'svm_model.pkl'
#open file
file = open(filePath, "rb")
#load the trained model
trained_model = joblib.load(file)

In [42]:
prediction = trained_model.predict(['.net developer'])
print(prediction)

['IT']


# deploy model using flask

In [62]:
from flask import Flask,render_template
from flask_restful import Api, Resource, reqparse
import requests
from joblib import dump, load
app = Flask(__name__)
api = Api(app)

# Only one Endpoint to API supported: {GET}
class Model(Resource):
    def get(self, job_title):
        model = load('svm_model.pkl')
        pred_industry = model.predict([job_title])
        return pred_industry[0], 200

# Adding routes to the Application and Endpoints to App.
api.add_resource(Model, "/model/api/<string:job_title>")
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [20/Sep/2021 16:57:54] "[33mGET / HTTP/1.1[0m" 404 -
127.0.0.1 - - [20/Sep/2021 16:58:18] "[33mGET / HTTP/1.1[0m" 404 -


In [65]:
from flask import Flask, render_template

app = Flask(__name__)

@app.route('/')
def home():
    return render_template('home.html')
if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


 * Restarting with windowsapi reloader


SystemExit: 1