**Importing Libraries**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

**Importing Dataset**

In [None]:
# Read the raw CSV (path is relative to the notebook's working directory)
# into the DataFrame used by every later cell.
dataset = pd.read_csv(filepath_or_buffer='updated2.csv')

**Cleaning and Preprocessing Dataset**

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# The stemmer and the stop-word collection are the same for every row, so
# they are built once here instead of once per row (and the set once per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is removed from the stop words so that it survives as a token.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)

corpus = []
# Walk every row of the 'combined' column rather than a hard-coded row count:
# the labels are later taken from all rows of `dataset`, so `corpus` must have
# exactly one entry per row or the features and labels fall out of step.
for text in dataset['combined']:
  # Replace everything that is not an ASCII letter with a space.
  resume = re.sub('[^a-zA-Z]', ' ', text)
  resume = resume.lower()
  resume = resume.split()
  # Drop stop words, then reduce each remaining word to its Porter stem.
  resume = [ps.stem(word) for word in resume if word not in stopword_set]
  resume = ' '.join(resume)
  corpus.append(resume)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Show only the first few cleaned documents: printing the entire corpus
# exceeds the notebook's IOPub data-rate limit and produces no usable output.
print(corpus[:5])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



**Applying Count Vectorizer on Dataset**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Target: the last column of the dataset, as a NumPy array.
y = dataset.iloc[:, -1].values

# Features: token counts per document, converted from the sparse result to a
# dense array.
cv = CountVectorizer()
X = cv.fit_transform(raw_documents=corpus).toarray()

**Applying TF-IDF Vectorizer**

In [None]:
#from sklearn.feature_extraction.text import TfidfVectorizer
#tv = TfidfVectorizer()
#X = tv.fit_transform(corpus).toarray()
#y = dataset.iloc[:, -1].values

**Encoding Categorical Data**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct labels, then replace each label in y with its
# integer code. `le` is kept so the codes can be decoded again later.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [None]:
# Display the label array after it has been integer-encoded by `le`.
y

array([ 8,  8, 18, ..., 41, 19, 19])

**Splitting the Dataset into Training set and Test set**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

**Applying Naive Bayes Classification Model**

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, trained on the training split.
# The fit call is the last expression so the fitted estimator is displayed.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

**Applying Random Forest Classification Model**

In [None]:
#from sklearn.ensemble import RandomForestClassifier
#classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
#classifier.fit(X_train, y_train)

**Applying Support Vector Classification with Linear Kernel**

In [None]:
#from sklearn.svm import SVC
#classifier = SVC(kernel = 'linear', random_state = 0)
#classifier.fit(X_train, y_train)

**Applying Support Vector Classification with RBF Kernel**

In [None]:
#from sklearn.svm import SVC
#classifier = SVC(kernel = 'rbf', random_state = 0)
#classifier.fit(X_train, y_train)

**Applying Support Vector Classification with Polynomial Kernel**

In [None]:
#from sklearn.svm import SVC
#classifier = SVC(kernel = 'poly', random_state = 0)
#classifier.fit(X_train, y_train)

**Applying Support Vector Classification with Sigmoid Kernel**

In [None]:
#from sklearn.svm import SVC
#classifier = SVC(kernel = 'sigmoid', random_state = 0)
#classifier.fit(X_train, y_train)

**Decoding Categorical Data**

In [None]:
# Map the integer codes of the test labels back to their original strings.
le.inverse_transform(y=y_test)

array(['NYC HOUSING AUTHORITY', 'DEPT OF ENVIRONMENT PROTECTION',
       'DEPT OF ENVIRONMENT PROTECTION', 'DEPARTMENT OF TRANSPORTATION',
       'DEPARTMENT OF TRANSPORTATION', 'DEPT OF HEALTH/MENTAL HYGIENE',
       'DEPARTMENT OF TRANSPORTATION', 'DEPARTMENT OF SANITATION',
       'NYC HOUSING AUTHORITY', 'DEPARTMENT OF BUSINESS SERV.',
       'DEPT OF ENVIRONMENT PROTECTION', 'DEPT OF HEALTH/MENTAL HYGIENE',
       "ADMIN FOR CHILDREN'S SVCS", 'DEPT OF ENVIRONMENT PROTECTION',
       'LAW DEPARTMENT', "ADMIN FOR CHILDREN'S SVCS",
       'DEPT OF HEALTH/MENTAL HYGIENE', 'DEPT OF HEALTH/MENTAL HYGIENE',
       'DEPT OF ENVIRONMENT PROTECTION', 'TAXI & LIMOUSINE COMMISSION',
       "ADMIN FOR CHILDREN'S SVCS", 'DEPT OF HEALTH/MENTAL HYGIENE',
       'DEPT OF ENVIRONMENT PROTECTION', 'DEPT OF HEALTH/MENTAL HYGIENE',
       'DEPT OF HEALTH/MENTAL HYGIENE', 'DEPT OF INFO TECH & TELECOMM',
       'DEPT OF ENVIRONMENT PROTECTION', 'OFFICE OF EMERGENCY MANAGEMENT',
       'HOUSING PRESERVAT

**Predicting using the Classifier**

In [None]:
# Predict encoded labels for the held-out rows.
y_pred = classifier.predict(X_test)

# Print a two-column table of decoded labels: predicted (left) next to
# actual (right), one row per test sample.
print(
    np.column_stack(
        (
            le.inverse_transform(y_pred),
            le.inverse_transform(y_test),
        )
    )
)

[['NYC HOUSING AUTHORITY' 'NYC HOUSING AUTHORITY']
 ['DEPT OF ENVIRONMENT PROTECTION' 'DEPT OF ENVIRONMENT PROTECTION']
 ['DEPT OF ENVIRONMENT PROTECTION' 'DEPT OF ENVIRONMENT PROTECTION']
 ...
 ['HOUSING PRESERVATION & DVLPMNT' 'HOUSING PRESERVATION & DVLPMNT']
 ['DEPT OF ENVIRONMENT PROTECTION' 'DEPT OF ENVIRONMENT PROTECTION']
 ['HUMAN RIGHTS COMMISSION' 'HUMAN RIGHTS COMMISSION']]


**Making the Confusion Matrix and measuring Accuracy**

In [None]:
# All metric functions used by this cell and the metric cells below.
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    log_loss,
    roc_auc_score,
)

# Confusion matrix computed on the decoded (string) labels.
cm = confusion_matrix(
    y_true=le.inverse_transform(y_test),
    y_pred=le.inverse_transform(y_pred),
)
print(cm)

# Fraction of test samples whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[26  0  0 ...  0  0  0]
 [ 0  9  0 ...  0  0  0]
 [ 0  0  1 ...  0  0  0]
 ...
 [ 0  0  0 ...  5  0  0]
 [ 0  0  0 ...  0  5  0]
 [ 0  0  0 ...  0  0 11]]


0.9166666666666666

**Measuring Precision**

In [None]:
# Weighted-average precision over all classes. zero_division=0 states
# explicitly that a class with an undefined precision (no predicted samples)
# counts as 0; this is the value the default setting uses as well, but without
# emitting the warning seen in this cell's saved output.
precision_score(y_test, y_pred, average='weighted', zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


0.9239436337838665

**Measuring Recall**

In [None]:
# Weighted-average recall over all classes.
recall_score(y_true=y_test, y_pred=y_pred, average='weighted')

0.9166666666666666

**Measuring F1 Score**

In [None]:
# Weighted-average F1 score over all classes.
f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

0.9127584295689078

In [None]:
#pred = classifier.predict_proba(X_test)
#log_loss(y_test, pred, labels=classifier.classes_)

In [None]:
#roc_auc_score(y_test, classifier.predict_proba(X_test), average = 'weighted', multi_class= 'ovr')

**Measuring Hamming Loss**

In [None]:
from sklearn.metrics import hamming_loss

# Fraction of test samples whose predicted label differs from the true label.
hamming_loss(y_true=y_test, y_pred=y_pred)

0.08333333333333333