In [2]:
import pandas as pd
import os

# Accuracy of each classifier evaluated in this notebook, keyed by classifier name
classifier_scores = dict()

# Input data file and the directory that receives the results
data_file = './labeled_data/qbj_2020_strata.csv'
working_directory = './text_classification'

# Make sure the working directory exists; a failure is reported but does not stop the notebook
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {working_directory}: {error}")

# Read the data into a pandas dataframe:
#   on_bad_lines='warn' -> malformed lines produce a warning instead of an error
#   dtype='str'         -> every column is read as text, including the numeric-looking ones
df = pd.read_csv(
    data_file,
    on_bad_lines='warn',
    dtype='str',
)


In [3]:
# Show the dataframe's dimensions as a (rows, columns) tuple
df.shape

(100, 7)

In [4]:
# Lowercase the whole dataframe: every value is first cast to a string so the
# .str accessor is available on every column, then each column is lowercased
df = df.astype(str).apply(lambda column: column.str.lower())

In [5]:
# Independent variables: every column except the 'Disposition' label.
# One randomly chosen row is displayed as a sanity check.
x = df.drop(columns='Disposition')
x.sample(n=1)

Unnamed: 0,RECORD_ID,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,DEVICE_REPORT_PRODUCT_CODE,Repeating terms
19,1389952,it was reported that transmitter failed error ...,1435,no device output,qbj,transmitter failed error\n


In [6]:
# Dependent variable: the 'Disposition' label, kept as a one-column dataframe.
# One randomly chosen row is displayed as a sanity check.
y = df.loc[:, ['Disposition']]
y.sample(n=1)

Unnamed: 0,Disposition
86,component failure


In [7]:
# Split the data into training and test sets.
# No test_size is given, so scikit-learn's default of holding out 25% of the rows applies.
# random_state pins the shuffle so that "Restart Kernel -> Run All" reproduces the
# same split, and therefore the same scores in every cell below, on every run.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [8]:
# Row counts of the training and test feature sets, as a (train, test) tuple
x_train.shape[0], x_test.shape[0]

(75, 25)

## Use the Dummy Classifier with Strategies

[Documentation for sklearn.dummy.DummyClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)

`strategy : {"most_frequent", "prior", "stratified", "uniform", "constant"}, default="prior"`

**“most_frequent”**: the predict method always returns the most frequent class label in the observed y argument passed to fit. The predict_proba method returns the matching one-hot encoded vector.

**“prior”**: the predict method always returns the most frequent class label in the observed y argument passed to fit (like “most_frequent”). predict_proba always returns the empirical class distribution of y also known as the empirical class prior distribution.

**“stratified”**: the predict_proba method randomly samples one-hot vectors from a multinomial distribution parametrized by the empirical class prior probabilities. The predict method returns the class label which got probability one in the one-hot vector of predict_proba. Each sampled row of both methods is therefore independent and identically distributed.

**“uniform”**: generates predictions uniformly at random from the list of unique classes observed in y, i.e. each class has equal probability.

**“constant”**: always predicts a constant label that is provided by the user. This is useful for metrics that evaluate a non-majority class.

In [9]:
# Baseline: fit a DummyClassifier once per strategy and record how each one scores
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

strategies = ['most_frequent', 'prior', 'stratified', 'uniform', 'constant']

for strategy in strategies:
    print(f"Dummy Classifier using strategy = {strategy.upper()}\n")

    # Only the 'constant' strategy has to be told which label to predict
    extra_options = {'constant': 'component failure'} if strategy == 'constant' else {}
    classifier = DummyClassifier(strategy=strategy, random_state=0, **extra_options)

    # fit() returns the fitted estimator, so training and prediction can be chained
    y_pred = classifier.fit(x_train, y_train).predict(x_test)

    # Store the accuracy for the final comparison table, then print the
    # score and the per-class report for this classifier
    strategy_accuracy = accuracy_score(y_test, y_pred)
    classifier_scores[strategy] = strategy_accuracy
    print(f"Accuracy score = {strategy_accuracy}\n")

    print(classification_report(y_test, y_pred, zero_division=0))

    print('-' * 80, '\n')

Dummy Classifier using strategy = MOST_FREQUENT

Accuracy score = 0.64

                      precision    recall  f1-score   support

   component failure       0.64      1.00      0.78        16
failure inconclusive       0.00      0.00      0.00         9

            accuracy                           0.64        25
           macro avg       0.32      0.50      0.39        25
        weighted avg       0.41      0.64      0.50        25

-------------------------------------------------------------------------------- 

Dummy Classifier using strategy = PRIOR

Accuracy score = 0.64

                      precision    recall  f1-score   support

   component failure       0.64      1.00      0.78        16
failure inconclusive       0.00      0.00      0.00         9

            accuracy                           0.64        25
           macro avg       0.32      0.50      0.39        25
        weighted avg       0.41      0.64      0.50        25

-------------------------------

## Use the MultinomialNB Classifier with Vectorized Input for 'FOI_TEXT'

[Documentation for sklearn.naive_bayes.MultinomialNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

> "The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work."



In [10]:
# Build a TF-IDF vectorizer and learn its vocabulary from the training text only
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(decode_error='ignore', stop_words='english')

train_texts = x_train['FOI_TEXT'].tolist()
vectorizer.fit(train_texts)

TfidfVectorizer(decode_error='ignore', stop_words='english')

In [11]:
# Train a multinomial Naive Bayes model on the vectorized training text
from sklearn.naive_bayes import MultinomialNB

train_features = vectorizer.transform(x_train['FOI_TEXT'].tolist())
train_labels = y_train['Disposition'].tolist()

multinomialnb_classifier = MultinomialNB()
multinomialnb_classifier.fit(train_features, train_labels)


MultinomialNB()

In [12]:
# Vectorize the held-out text with the already-fitted vectorizer, then predict its labels
test_features = vectorizer.transform(x_test['FOI_TEXT'].tolist())
multinomialnb_y_pred = multinomialnb_classifier.predict(test_features)

In [13]:
# Record the MultinomialNB accuracy for the comparison table, then print the
# score and the per-class report
multinomialnb_accuracy = accuracy_score(y_test, multinomialnb_y_pred)
classifier_scores['MultinomialNB'] = multinomialnb_accuracy

print("MultinomialNB Classifier\n")
print(f"Accuracy score = {multinomialnb_accuracy}\n")
print(classification_report(y_test, multinomialnb_y_pred, zero_division=0))

MultinomialNB Classifier

Accuracy score = 1.0

                      precision    recall  f1-score   support

   component failure       1.00      1.00      1.00        16
failure inconclusive       1.00      1.00      1.00         9

            accuracy                           1.00        25
           macro avg       1.00      1.00      1.00        25
        weighted avg       1.00      1.00      1.00        25



In [14]:
# Turn the collected accuracies into a table: one row per classifier, one 'Score' column
df_scores = (
    pd.DataFrame.from_dict(classifier_scores, orient='index')
    .set_axis(['Score'], axis=1)
    .rename_axis('Classifier')
)

# Display the table best-first; the sorted view is only shown, df_scores keeps its own order
df_scores.sort_values(by='Score', ascending=False)

Unnamed: 0_level_0,Score
Classifier,Unnamed: 1_level_1
MultinomialNB,1.0
most_frequent,0.64
prior,0.64
constant,0.64
uniform,0.6
stratified,0.56


In [15]:
# Save the score table as a CSV file inside the working directory
scores_csv_path = f"{working_directory}/classification_scores.csv"
df_scores.to_csv(scores_csv_path)