In [1]:
import pandas as pd
import os

classifier_scores = {}

# Identify the working directory and data files
working_directory = './text_classification'

# Create the working directory if needed
try:
    os.makedirs(working_directory, exist_ok=True)
except OSError as error:
    print(f"Error creating {working_directory}: {error}")
    
data_file = './labeled_data/qbj_2020_strata_warranty.csv'

# Read the data into a pandas dataframe
df = pd.read_csv(data_file,           # The data file being read, from the variable assignment above
                 on_bad_lines='warn', # This tells Pandas to only warn on bad lines vs causing an error
                 dtype='str')         # This tells Pandas to treat all numbers as words


In [2]:
# Show the number of rows and columns in the dataframe
df.shape

(100, 8)

In [3]:
# Convert all the text in the dataframe to lowercase
df = df.apply(lambda x: x.astype(str).str.lower())

In [4]:
# Create the independent matrix and show a sample line
x = df.drop('Warranty', axis=1)
x.sample()

Unnamed: 0.1,Unnamed: 0,FOI_TEXT,DEVICE_PROBLEM_CODE,DEVICE_PROBLEM_TEXT,DEVICE_REPORT_PRODUCT_CODE,Repeating terms,Disposition
50,1015850,it was reported that early sensor expiration o...,1480,premature end-of-life indicator,qbj,early sensor expiration\n,component failure


In [5]:
# Create the dependent matrix and show a sample line
y = df[['Warranty']]
y.sample()

Unnamed: 0,Warranty
17,not replaced


In [6]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y)

In [7]:
# Show the number of rows for training and testing
len(x_train), len(x_test)

(75, 25)

## Use the Dummy Classifier with Strategies

[Documentation for sklearn.dummy.DummyClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)

`strategy{“most_frequent”, “prior”, “stratified”, “uniform”, “constant”}, default=”prior”`

**“most_frequent”**: the predict method always returns the most frequent class label in the observed y argument passed to fit. The predict_proba method returns the matching one-hot encoded vector.

**“prior”**: the predict method always returns the most frequent class label in the observed y argument passed to fit (like “most_frequent”). predict_proba always returns the empirical class distribution of y also known as the empirical class prior distribution.

**“stratified”**: the predict_proba method randomly samples one-hot vectors from a multinomial distribution parametrized by the empirical class prior probabilities. The predict method returns the class label which got probability one in the one-hot vector of predict_proba. Each sampled row of both methods is therefore independent and identically distributed.

**“uniform”**: generates predictions uniformly at random from the list of unique classes observed in y, i.e. each class has equal probability.

**“constant”**: always predicts a constant label that is provided by the user. This is useful for metrics that evaluate a non-majority class.

In [8]:
# Use the DummyClassifier, iterating over the different strategies
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

strategies = ['most_frequent', 'prior', 'stratified', 'uniform', 'constant']

for strategy in strategies:
    print(f"Dummy Classifier using strategy = {strategy.upper()}\n")
    
    if strategy == 'constant':
        classifier = DummyClassifier(strategy=strategy, random_state=1, constant='replaced')
    else:
        classifier = DummyClassifier(strategy=strategy, random_state=1)
    
    classifier.fit(x_train, y_train)
        
    y_pred = classifier.predict(x_test)
    
    # Print the scores and reports for each classifer
    classifier_scores[strategy] = accuracy_score(y_test, y_pred)
    print(f"Accuracy score = {classifier_scores[strategy]}\n")
    
    print(classification_report(y_test, y_pred, zero_division=0))
    
    print('-' * 80, '\n')

Dummy Classifier using strategy = MOST_FREQUENT

Accuracy score = 0.88

              precision    recall  f1-score   support

not replaced       0.88      1.00      0.94        22
    replaced       0.00      0.00      0.00         3

    accuracy                           0.88        25
   macro avg       0.44      0.50      0.47        25
weighted avg       0.77      0.88      0.82        25

-------------------------------------------------------------------------------- 

Dummy Classifier using strategy = PRIOR

Accuracy score = 0.88

              precision    recall  f1-score   support

not replaced       0.88      1.00      0.94        22
    replaced       0.00      0.00      0.00         3

    accuracy                           0.88        25
   macro avg       0.44      0.50      0.47        25
weighted avg       0.77      0.88      0.82        25

-------------------------------------------------------------------------------- 

Dummy Classifier using strategy = STRATIFIED

## Use the MultinomialNB Classifier with Vectorized Input for 'FOI_TEXT'

[Documentation for sklearn.naive_bayes.MultinomialNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

> "The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work."



In [9]:
# Create a vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', decode_error='ignore')

vectorizer.fit(x_train['FOI_TEXT'].tolist())

In [10]:
from sklearn.naive_bayes import MultinomialNB

multinomialnb_classifier_warranty = MultinomialNB()
multinomialnb_classifier_warranty.fit(vectorizer.transform(x_train['FOI_TEXT'].tolist()), y_train['Warranty'].tolist())


In [11]:
multinomialnb_y_pred = multinomialnb_classifier_warranty.predict(vectorizer.transform(x_test['FOI_TEXT'].tolist()))

In [12]:
# Print the score and report
classifier_scores['MultinomialNB'] = accuracy_score(y_test, multinomialnb_y_pred)
print(f"MultinomialNB Classifier\n")
print(f"Accuracy score = {classifier_scores['MultinomialNB']}\n")
print(classification_report(y_test, multinomialnb_y_pred, zero_division=0))

MultinomialNB Classifier

Accuracy score = 0.96

              precision    recall  f1-score   support

not replaced       1.00      0.95      0.98        22
    replaced       0.75      1.00      0.86         3

    accuracy                           0.96        25
   macro avg       0.88      0.98      0.92        25
weighted avg       0.97      0.96      0.96        25



In [13]:
# Print the scores
df_scores = pd.DataFrame.from_dict(classifier_scores, orient='index')
df_scores.index.name = 'Classifier'
df_scores.columns = ['Accuracy Score']
df_scores.sort_values(by=['Accuracy Score'], ascending=False)

Unnamed: 0_level_0,Accuracy Score
Classifier,Unnamed: 1_level_1
MultinomialNB,0.96
most_frequent,0.88
prior,0.88
stratified,0.68
uniform,0.44
constant,0.12


In [14]:
df_scores.to_csv(f"{working_directory}/classification_scores.csv")

## Save the Model Using Python's `pickle` Library
After the model has been trained, it can be saved for later use. This provides for more efficient analysis by not having to train the model each time a model is needed.

Also, a model that shows optimal results can be preserved for deployment in the future.

[This video](https://www.youtube.com/watch?v=2Tw39kZIbhs&list=PLQVvvaa0QuDe8XSftW-RAxdo6OmaeL85M&t=11s) provides an example of saving a Python variable using pickle.

In [15]:
import pickle
import datetime

pickle_out = open(f"{working_directory}/multinomialnb_classifier_warranty_{datetime.datetime.now():%Y-%m-%d-%s}.pickle", "wb")

pickle.dump(multinomialnb_classifier_warranty, pickle_out)

pickle_out.close()