<h2>Text Classification Model</h2>

<h4>Preprocessing</h4>

In [1]:
# packages used
import pdftotext
import json
import os
import glob
import pandas as pd
import numpy as np

In [2]:
old_pdfs = glob.glob('docs1/raw_data/*.pdf')
new_pdfs = glob.glob('docs1/new/*.pdf')

In [5]:
print(len(old_pdfs), len(new_pdfs))

276 100


In [3]:
cat = [] # categories
filenames = [] # files
for pdf in old_pdfs+new_pdfs:
    filenames.append(pdf)
    if 'Account Statement' in pdf:
        cat.append(1)
    elif 'Distribution Notice' in pdf:
        cat.append(2)
    elif 'Call Notice' in pdf:
        cat.append(3)
    else:
        cat.append(0)

In [10]:
print(len(cat), len(filenames))

376 376


In [4]:
text = [] # text extracted from files
for file in filenames:
    content = ''
    with open(file, 'rb') as f:
        pages = pdftotext.PDF(f)
        for page in pages:
            content += page
    text.append(content)

In [5]:
with open('500 Account Statements Extracted Data.json', 'r') as f:
    extracted_data = f.read()
    
extracted_data = json.loads(extracted_data)
for file in extracted_data:
    if file['subCategory'] == 'Account Statement' in pdf:
        cat.append(1)
    elif file['subCategory'] == 'Distribution Notice' in pdf:
        cat.append(2)
    elif file['subCategory'] == 'Call Notice' in pdf:
        cat.append(3)
    else:
        cat.append(0)

In [6]:
for file in extracted_data:
    content = ''
    with open('docs/' + file['id'] + '.pdf', 'rb') as f:
        pages = pdftotext.PDF(f)
        for page in pages:
            content += page
    text.append(content)

In [15]:
print(len(cat), len(text))

876 876


In [7]:
d = {'category': cat, 'text': text}
df = pd.DataFrame(data=d)

<h4>Train and Test</h4>

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.metrics import f1_score

In [9]:
text_list = df['text'].tolist()
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(text_list)

In [10]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

In [12]:
varietal_list = df['category'].tolist()
train_x, test_x, train_y, test_y = train_test_split(x_train_tfidf, varietal_list, test_size=0.3)

#### SVM

In [13]:
clf = SVC(kernel='linear', class_weight='balanced').fit(train_x, train_y)
y_score = clf.predict(test_x)

In [14]:
np.mean(y_score == test_y)

0.8859315589353612

In [18]:
# training error
np.mean(clf.predict(train_x)==train_y)

0.9119086460032626

In [24]:
f1_score(test_y, y_score, average="weighted")

0.8758627981375543

#### Naive Bayes

In [16]:
clf_NB = MultinomialNB().fit(train_x, train_y)
y_NB = clf_NB.predict(test_x)

In [17]:
np.mean(y_NB == test_y)

0.8212927756653993

In [19]:
# training error
np.mean(clf_NB.predict(train_x)==train_y)

0.8646003262642741

In [32]:
f1_score(test_y, y_NB, average="weighted")

  'precision', 'predicted', average, warn_for)


0.7918003746854526

#### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
clf_RF = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=0).fit(train_x, train_y)
y_RF = clf_RF.predict(test_x)

In [24]:
clf_RF

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=None, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [22]:
np.mean(y_RF == test_y)

0.8365019011406845

In [23]:
# training error
np.mean(clf_RF.predict(train_x)==train_y)

0.9510603588907015

In [33]:
f1_score(test_y, y_RF, average="weighted")

0.8321877725539273

#### KNN

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
neigh = KNeighborsClassifier(n_neighbors=5).fit(train_x, train_y)
y_KNN = neigh.predict(test_x)

In [29]:
np.mean(y_KNN == test_y)

0.8479087452471483

In [31]:
# training error
np.mean(neigh.predict(train_x)==train_y)

0.9200652528548124

In [34]:
f1_score(test_y, y_KNN, average="weighted")

0.8454150721774074