### Document Classification

In [1]:
import string
from warnings import simplefilter

# Silence FutureWarning before the third-party imports so it covers them too.
simplefilter(action='ignore', category=FutureWarning)

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Read test.csv, drop rows containing missing values, and lower-case the
# 'text' column in a single chained expression.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# not run on another machine without editing it.
df_test = (
    pd.read_csv(r"C:\Users\SONY\Desktop\projects of Machine Learning\Document classification\test.csv")
    .dropna()
    .assign(text=lambda frame: frame['text'].str.lower())
)
print(df_test.head())

                                                text  output
0  ref ref note open position ref ref ref closed ...       3
1  page ga temozolomide mg ct capsule ea g temozo...       2
2  moeas southwest grilla r moeas revolves honest...       3
3  medium contact catherine butler hour analyst m...       7
4  mm mm ref rd shot component nd shot component ...       3


In [3]:
# Read train.csv, drop rows containing missing values, and lower-case the
# 'text' column in a single chained expression.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# not run on another machine without editing it.
df_train = (
    pd.read_csv(r"C:\Users\SONY\Desktop\projects of Machine Learning\Document classification\train.csv")
    .dropna()
    .assign(text=lambda frame: frame.text.str.lower())
)
print(df_train.head())

                                                text  output
0  form rev september department treasury interna...       5
1  form w purpose complete form w employer withho...       5
2  invoice afeel corp dba huntington brass dana c...       2
3  format ion technology telecommunication admini...       3
4  heather burtch burtchhm gmail com ellicott cit...       4


In [4]:
# (rows, columns) of the training frame after dropna().
df_train.shape

(1003, 2)

In [5]:
# (rows, columns) of the test frame after dropna().
df_test.shape

(329, 2)

In [6]:
# Number of training rows per value of the 'output' label column.
df_train['output'].value_counts()

3    222
2    141
7    139
4    127
5    110
6    100
0     84
1     80
Name: output, dtype: int64

In [7]:
# Number of test rows per value of the 'output' label column.
df_test['output'].value_counts()

7    85
3    76
4    51
2    34
1    30
5    23
0    16
6    14
Name: output, dtype: int64

In [8]:
# Build the English stop-word set used by Text_cleaning.
# `stopwords` is imported in the notebook's import cell.  On a fresh
# environment the corpus may not be installed yet, in which case NLTK raises
# LookupError; download it once and retry so Restart & Run All succeeds.
try:
    stop = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop = set(stopwords.words('english'))
print(stop)

{'were', 'against', "shouldn't", 'hadn', 'so', 'him', 'mightn', 'no', "you'd", 'himself', 'having', 'our', 'has', 'y', 'have', 'why', 'than', 'couldn', "won't", 'each', 'd', 'won', 'same', 'if', 'that', 'but', "doesn't", 'ours', 'very', "weren't", 'after', 'here', 'about', 'under', "needn't", 'weren', "mightn't", 'haven', 'did', 'their', 'again', 'her', 'only', 'm', 'from', 've', 'yours', 'further', 'or', 'most', 'am', "shan't", 'between', 'until', "that'll", 'them', 'yourselves', 'myself', 'doing', 'a', 'shan', 'mustn', 'some', 're', 'my', 'your', "it's", 'these', 'herself', "aren't", 'down', 'an', 'too', 'isn', 'nor', 'they', 'to', 'i', 'wasn', 'those', 'above', 'all', 'while', 'other', 'now', 'don', 'when', 'are', 'me', 'below', 'just', 'wouldn', 'by', 't', 'his', "you're", 'which', "haven't", 'will', 'with', 'on', 'hers', "hasn't", 'its', 'then', 'before', 'aren', 'during', "you'll", "don't", 'she', 'didn', 'theirs', 'themselves', 'this', 'hasn', 'the', 'it', 'off', 'and', 'few', '

In [9]:
def Text_cleaning(data):
    """Strip punctuation from ``data`` and return its non-stop-word tokens.

    Every character that appears in ``string.punctuation`` is dropped, the
    remaining text is split on whitespace, and any token whose lowercase form
    is in the notebook-level ``stop`` collection is discarded.  Surviving
    tokens are returned as a list, in their original order and casing.
    """
    stripped = ''.join(ch for ch in data if ch not in string.punctuation)
    tokens = []
    for token in stripped.split():
        # Compare case-insensitively, but keep the token as written.
        if token.lower() in stop:
            continue
        tokens.append(token)
    return tokens

In [10]:
# Preview the tokenisation: apply Text_cleaning to the first column
# (position 0) of df_train.  The result is only printed, not stored.
print(df_train.iloc[:,0].apply(Text_cleaning))

0       [form, rev, september, department, treasury, i...
1       [form, w, purpose, complete, form, w, employer...
2       [invoice, afeel, corp, dba, huntington, brass,...
3       [format, ion, technology, telecommunication, a...
4       [heather, burtch, burtchhm, gmail, com, ellico...
                              ...                        
1001    [company, report, generated, parkeon, transit,...
1002    [cxtec, north, american, office, south, bay, r...
1003    [gifford, drive, ac, spring, hill, fl, ac, ac,...
1004    [blake, w, price, hamptona, hillsa, blvd, cant...
1005    [form, w, rev, december, department, treasury,...
Name: text, Length: 1003, dtype: object


### Feature Extraction (Bag of Words and TF-IDF)

In [11]:
# Creating the Bag of Words model.
# CountVectorizer is already imported in the notebook's import cell.
# Passing Text_cleaning as `analyzer` makes the vectorizer use that function
# to turn each document into its list of tokens; the vocabulary is learned
# from the training text only.
cv = CountVectorizer(analyzer = Text_cleaning).fit(df_train['text'])

In [12]:
# Token -> column-index mapping learned by the fitted CountVectorizer.
# NOTE(review): this displays the entire mapping, which is very long.
cv.vocabulary_

{'form': 22021,
 'rev': 49389,
 'september': 51860,
 'department': 15201,
 'treasury': 58213,
 'internal': 28815,
 'revenue': 49406,
 'service': 51915,
 'application': 4245,
 'voluntary': 61830,
 'correction': 12996,
 'program': 46093,
 'vcp': 61186,
 'employee': 18624,
 'plan': 44748,
 'compliance': 12069,
 'resolution': 49131,
 'epcrs': 19177,
 'ap': 4081,
 'information': 28202,
 'instruction': 28579,
 'url': 60439,
 'omb': 41502,
 'irs': 29379,
 'use': 60525,
 'review': 49428,
 'attached': 5565,
 'procedural': 45966,
 'requirement': 49013,
 'checklist': 10394,
 'mailing': 35029,
 'submission': 55016,
 'sponsor': 53833,
 'employer': 18632,
 'single': 52687,
 'b': 6341,
 'address': 1081,
 'p': 43136,
 'box': 8082,
 'c': 8952,
 'city': 10865,
 'town': 57899,
 'state': 54402,
 'e': 17332,
 'zip': 65111,
 'code': 11480,
 'f': 20402,
 'foreign': 21979,
 'country': 13152,
 'g': 22727,
 'province': 46383,
 'county': 13158,
 'h': 24369,
 'postal': 45298,
 'identification': 26810,
 'number': 

In [13]:
# Encode the training documents as a sparse document-term count matrix using
# the fitted vocabulary; printing lists (row, column) count entries.
text_transform = cv.transform(df_train['text'])
print(text_transform)

  (0, 417)	2
  (0, 501)	2
  (0, 509)	26
  (0, 624)	2
  (0, 782)	4
  (0, 901)	4
  (0, 1050)	1
  (0, 1058)	8
  (0, 1081)	4
  (0, 1249)	4
  (0, 1297)	2
  (0, 1302)	4
  (0, 1887)	8
  (0, 1909)	2
  (0, 2164)	2
  (0, 3107)	1
  (0, 3196)	2
  (0, 3198)	17
  (0, 3772)	2
  (0, 3773)	2
  (0, 3788)	4
  (0, 3828)	10
  (0, 4081)	5
  (0, 4082)	1
  (0, 4234)	16
  :	:
  (1002, 60280)	1
  (1002, 60439)	4
  (1002, 60525)	13
  (1002, 60538)	2
  (1002, 60606)	1
  (1002, 61001)	2
  (1002, 61475)	2
  (1002, 61581)	1
  (1002, 61634)	1
  (1002, 61638)	1
  (1002, 62059)	35
  (1002, 62060)	1
  (1002, 62120)	2
  (1002, 62166)	1
  (1002, 62515)	3
  (1002, 62939)	1
  (1002, 62940)	1
  (1002, 63074)	1
  (1002, 63099)	2
  (1002, 63101)	42
  (1002, 63554)	1
  (1002, 63562)	1
  (1002, 63795)	1
  (1002, 64404)	3
  (1002, 65111)	2


In [14]:
# Materialise the sparse count matrix as a dense NumPy array.
# NOTE(review): `x` is not referenced again in the visible cells, and a dense
# copy of a ~1000-row matrix with tens of thousands of columns is large.
x = text_transform.toarray()

In [15]:
# Display the dense count array.
x

array([[ 0,  0,  0, ...,  0,  0,  0],
       [22,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [26,  0,  0, ...,  0,  0,  0],
       [10,  0,  0, ...,  0,  0,  0]], dtype=int64)

In [16]:
# TF-IDF (term frequency-inverse document frequency): learn IDF weights from
# the training count matrix so that terms occurring in many documents are
# down-weighted relative to distinctive ones.
tfidf_transformer = TfidfTransformer().fit(text_transform)
print(tfidf_transformer)

# Apply the fitted weighting to the same training count matrix.
text_tfidf = tfidf_transformer.transform(text_transform)
print(text_tfidf)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)
  (0, 65111)	0.013406378357315212
  (0, 64456)	0.006374856676267749
  (0, 64404)	0.015429579014783713
  (0, 64223)	0.010868045914487737
  (0, 63562)	0.025134060163814604
  (0, 63089)	0.009558856761305575
  (0, 63074)	0.00798009036385934
  (0, 62948)	0.010868045914487737
  (0, 62394)	0.006040794390459821
  (0, 62126)	0.017310425236208564
  (0, 62060)	0.01610965089770148
  (0, 62059)	0.002442805448469604
  (0, 61830)	0.030418913647833574
  (0, 61368)	0.01170131021642944
  (0, 61186)	0.4818837483930926
  (0, 61032)	0.005409387873986971
  (0, 60538)	0.007500890400287169
  (0, 60525)	0.014309630161393682
  (0, 60439)	0.007335775336573994
  (0, 60323)	0.004003431820507266
  (0, 60054)	0.00966659072941332
  (0, 59669)	0.006025523956419643
  (0, 58811)	0.02355245683725512
  (0, 58453)	0.006374856676267749
  (0, 58428)	0.00696257751338526
  :	:
  (1002, 1286)	0.008864688185080986
  (1002, 1246)	0.008960391891656448
 

### MultinomialNB 

### Model Training

In [17]:
# Baseline: multinomial Naive Bayes with default hyperparameters, fitted on the
# training TF-IDF features and the 'output' labels.  scikit-learn's fit()
# returns the estimator itself, so `model` and `classifier` are the same object.
classifier = MultinomialNB()
model = classifier.fit(text_tfidf ,df_train['output'])

In [18]:
# NOTE(review): predictions are made on the same TF-IDF matrix the model was
# fitted on, so the metrics that follow describe training fit, not
# generalisation.  df_test is loaded earlier but is not transformed or scored
# in the visible cells.
predictions_MNB = classifier.predict(text_tfidf)

In [19]:
# Confusion matrix for the baseline model on the training data
# (rows: true label, columns: predicted label).  confusion_matrix is imported
# in the notebook's import cell.
print(confusion_matrix(df_train['output'],predictions_MNB))

[[ 69   0   0  12   0   2   1   0]
 [  0  52   0  28   0   0   0   0]
 [  0   0 128  13   0   0   0   0]
 [  0   0   0 221   1   0   0   0]
 [  0   0   0   2 125   0   0   0]
 [  0   0   0   2   0 108   0   0]
 [  0   0   0   6   0   1  93   0]
 [  0   0   0  15   2   0   0 122]]


In [20]:
# Accuracy of the baseline predictions against the training labels, stored as
# a (label, value) tuple and displayed.
acc_MNB = ('accuracy score',accuracy_score(df_train['output'],predictions_MNB))
acc_MNB

('accuracy score', 0.9152542372881356)

### Model Training Using GridSearchCV 

In [49]:
# Search space for MultinomialNB: five smoothing strengths, each tried with
# and without learned class priors (10 combinations in total).
parameters = [
    dict(
        alpha=[1.0, 2.0, 3.0, 4.0, 5.0],
        fit_prior=[True, False],
    )
]

In [50]:
# Exhaustive search over `parameters` for the MultinomialNB estimator, scored
# by accuracy and run on all available cores (n_jobs=-1).  No `cv` argument is
# given, so the library's default cross-validation splitting is used.
grid_search = GridSearchCV(estimator = classifier , 
                           param_grid = parameters,
                           n_jobs=-1,
                           scoring= 'accuracy')

In [51]:
# List the hyperparameter names the estimator accepts (the valid keys for the
# parameter grid).  `estimator` is just another name for `classifier`.
estimator = classifier 
estimator.get_params().keys()

dict_keys(['alpha', 'class_prior', 'fit_prior'])

In [52]:
# Run the grid search on the training TF-IDF features and labels.
# NOTE(review): the TF-IDF weights were fitted on the full training set before
# cross-validation, so each validation fold has influenced the features.
gs_model = grid_search.fit(text_tfidf ,df_train['output'])

In [53]:
# Predict on the training features with the fitted search object.
# NOTE(review): `predicted` is not used again in the visible cells.
predicted = gs_model.predict(text_tfidf)

In [56]:
# Best mean cross-validated score found by the search (scoring='accuracy').
accuracy = gs_model.best_score_

In [57]:
# Display the best cross-validated accuracy.
accuracy

0.8743768693918246

In [58]:
# Hyperparameter combination that achieved the best cross-validated score.
grid_search.best_params_

{'alpha': 1.0, 'fit_prior': False}

In [61]:
# Refit a standalone MultinomialNB with the hyperparameters selected by the
# grid search.  Reading them from grid_search.best_params_ (instead of
# re-typing the values shown in an earlier cell's output) keeps this cell
# correct if the grid, the data, or the CV splitting ever changes.
classifier = MultinomialNB(**grid_search.best_params_)
model = classifier.fit(text_tfidf ,df_train['output'])

In [62]:
# Predict with the tuned classifier.  As with the baseline, this is the same
# matrix the model was fitted on, so the metrics that follow are training
# metrics.  This overwrites the baseline `predictions_MNB`.
predictions_MNB = classifier.predict(text_tfidf)

In [63]:
# Confusion matrix for the tuned model on the training data
# (rows: true label, columns: predicted label).  confusion_matrix is imported
# in the notebook's import cell.
print(confusion_matrix(df_train['output'],predictions_MNB))

[[ 76   0   0   4   0   3   1   0]
 [  0  70   0   8   0   0   2   0]
 [  1   0 129  10   0   1   0   0]
 [  1   0   0 219   1   1   0   0]
 [  0   0   0   1 126   0   0   0]
 [  0   0   0   1   0 109   0   0]
 [  0   0   0   0   0   2  98   0]
 [  0   0   0   5   4   0   0 130]]


In [64]:
# Accuracy of the tuned model's predictions against the training labels,
# stored as a (label, value) tuple and displayed.  Overwrites the baseline
# `acc_MNB`.
acc_MNB = ('accuracy score',accuracy_score(df_train['output'],predictions_MNB))
acc_MNB

('accuracy score', 0.9541375872382851)