In [1]:
# imports and set up dataframes
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
from nltk.corpus import stopwords
import re
from pathlib import Path

df_csv = Path('hw_exercise.csv')
df = pd.read_csv(df_csv)
df.head()

Unnamed: 0,transaction_descriptor,store_number,dataset
0,DOLRTREE 2257 00022574 ROSWELL,2257,train
1,AUTOZONE #3547,3547,train
2,TGI FRIDAYS 1485 0000,1485,train
3,BUFFALO WILD WINGS 003,3,train
4,J. CREW #568 0,568,train


In [2]:
# get lemmatizer and stop words
wnl = WordNetLemmatizer()
stop = stopwords.words('english')


# Create the tokenizer function
def tokenizer(text):
    """Tokenizes text."""
    
    # Create a list of the words
    words = word_tokenize(text)

    # Convert the words to lowercase
    words = list(filter(lambda w: w.lower(), words))
    
    # Remove the punctuation
    words = list(filter(lambda t: t not in punctuation, words))
    
    # Remove the stopwords
    words = list(filter(lambda t: t.lower() not in stop, words))
    
    # Lemmatize Words into root words
    tokens = [wnl.lemmatize(word) for word in words]
    
    return tokens

In [3]:
# create a 'tokens' column with a tokenized version of the 'transaction_descriptor' column
df['tokens'] = df.transaction_descriptor.apply(tokenizer)
df.head()

Unnamed: 0,transaction_descriptor,store_number,dataset,tokens
0,DOLRTREE 2257 00022574 ROSWELL,2257,train,"[DOLRTREE, 2257, 00022574, ROSWELL]"
1,AUTOZONE #3547,3547,train,"[AUTOZONE, 3547]"
2,TGI FRIDAYS 1485 0000,1485,train,"[TGI, FRIDAYS, 1485, 0000]"
3,BUFFALO WILD WINGS 003,3,train,"[BUFFALO, WILD, WINGS, 003]"
4,J. CREW #568 0,568,train,"[J, CREW, 568, 0]"


In [61]:
# setting x and y
y = df["store_number"]
X = df['tokens']

In [62]:
# filtering for numbers
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

filtered_tokens = []
for i, tokens in enumerate(df["tokens"]):
    filtered_token = []
    for word in tokens:
        if hasNumbers(word):
            filtered_token.append(word)
    filtered_tokens.append(filtered_token)
df["tokens"] = filtered_tokens

In [63]:
# making everything in 'tokens' column either a string or int
labels = []
for i, tokens in enumerate(df["tokens"]):
    label = []
    for word in tokens:
        try: 
            label.append(int(word) == int(df["store_number"][i])) # cast to integer if possible
        except:
            label.append(str(word) == str(df["store_number"][i])) # otherwise keep as a string
    labels.append(label)

In [8]:
print(labels)

[[True, False], [True], [True, False], [True], [True, False], [True], [True, False], [True], [True], [True], [True], [True], [True], [True], [True], [True], [True], [True], [True], [True], [True], [True, False], [True], [True], [True], [True, False], [True], [True], [True], [True], [True], [True], [True, False], [True], [True], [True], [False], [True], [True], [True], [True], [True], [True], [False], [True], [True, False], [True], [True], [False], [True], [True], [True], [True], [True, False], [True], [True], [True, False], [True, False], [True, False], [True], [True], [True], [True, False], [True, False], [True], [False], [True], [True], [True], [False], [False], [True], [True], [True, False], [True], [True], [False], [True], [True], [True], [True], [True], [True], [True], [True], [True], [True], [True], [True], [True, False], [False, True], [False], [True], [True], [True], [True], [True], [False, True], [True], [False, False], [True], [False], [True], [True, False], [True], [True], [

In [64]:
# creating the labels column
df['labels'] = labels

filtered_tokens = []
for i, tokens in enumerate(df["tokens"]):
    filtered_token = []
    for word in tokens:
        filtered_token.append(re.sub('[^0-9]','', word))
    filtered_tokens.append(filtered_token)
df["tokens"] = filtered_tokens

In [10]:
df

Unnamed: 0,transaction_descriptor,store_number,dataset,tokens,labels
0,DOLRTREE 2257 00022574 ROSWELL,2257,train,"[2257, 00022574]","[True, False]"
1,AUTOZONE #3547,3547,train,[3547],[True]
2,TGI FRIDAYS 1485 0000,1485,train,"[1485, 0000]","[True, False]"
3,BUFFALO WILD WINGS 003,3,train,[003],[True]
4,J. CREW #568 0,568,train,"[568, 0]","[True, False]"
...,...,...,...,...,...
295,MCDONALD'S F2151,F2151,test,[2151],[True]
296,NST BEST BUY #1403 332411,1403,test,"[1403, 332411]","[True, False]"
297,CVS/PHARMACY #06689,6689,test,[06689],[True]
298,BANANA REPUBLIC #8109,8109,test,[8109],[True]


In [65]:
#training and testing for ML model
import numpy as np

df_train = df[df['dataset']=='train']
df_test = df[df['dataset']=='test']
df_validate = df[df['dataset']=='validate']

def flatten_labels(df):
  df = df.reset_index()
  tokens1d = []
  labels1d = []
  for i in range(len(df)):
    for token in df["tokens"][i]:
      tokens1d.append(int(token))
    for label in df["labels"][i]:
      labels1d.append(label)
  tokens1d = np.array(tokens1d).reshape(-1, 1)
  labels1d = np.array(labels1d)
  return tokens1d,labels1d

X_train,y_train = flatten_labels(df_train)
X_test,y_test = flatten_labels(df_test)

In [66]:
y_train.shape

(119,)

In [67]:
# Using a random forest classifier
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Execute LR predictions
predictions = classifier.predict(X_train)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_train}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,True,True
1,False,False
2,True,True
3,True,True
4,False,False


In [68]:
# accuracy score of y_train
from sklearn.metrics import classification_report
target_names = ["incorrect", "correct"]
print(classification_report(y_train, predictions, target_names=target_names))

              precision    recall  f1-score   support

   incorrect       1.00      1.00      1.00        29
     correct       1.00      1.00      1.00        90

    accuracy                           1.00       119
   macro avg       1.00      1.00      1.00       119
weighted avg       1.00      1.00      1.00       119



In [69]:
# accuracy score of y_test
test_pred = classifier.predict(X_test)

target_names = ["incorrect", "correct"]
print(classification_report(y_test, test_pred, target_names=target_names))

              precision    recall  f1-score   support

   incorrect       0.36      0.26      0.30        39
     correct       0.71      0.80      0.75        89

    accuracy                           0.63       128
   macro avg       0.53      0.53      0.52       128
weighted avg       0.60      0.63      0.61       128



In [70]:
# predicting what is in the row and using regex
def predict_row(df):
  X = []
  for token in df['tokens']:
    if hasNumbers(token):
      X.append(re.sub('[^0-9]','',token))
  preds = classifier.predict_proba(np.array(X).reshape(-1,1))
  highest = np.argmax(preds[:,1])
  print(X, highest, preds)
  return X[highest]

In [53]:
predict_row(df_test.iloc[10])

['2923'] 0 [[0.35 0.65]]


'2923'

In [54]:
df_test.iloc[10]

transaction_descriptor    WM SUPERCENTER #2923
store_number                              2923
dataset                                   test
tokens                                  [2923]
labels                                  [True]
Name: 210, dtype: object

In [71]:
# The prediction and the models confidence in it
predictions = []
for i in range(len(df_test)):
  predictions.append(predict_row(df_test.iloc[i]))
true_vals = df_test["store_number"]

['242'] 0 [[0. 1.]]
['9442088'] 0 [[0. 1.]]
['1419'] 0 [[0. 1.]]
['1019'] 0 [[0.65 0.35]]
['38'] 0 [[0. 1.]]
['0673', '06'] 1 [[0.22 0.78]
 [0.03 0.97]]
['629'] 0 [[0.09 0.91]]
['4249', '1475'] 1 [[0.03 0.97]
 [0.   1.  ]]
['2505', '3454'] 1 [[0.73 0.27]
 [0.01 0.99]]
['825'] 0 [[0.22 0.78]]
['2923'] 0 [[0.34 0.66]]
['058'] 0 [[0.6 0.4]]
['2039'] 0 [[0.68 0.32]]
['382'] 0 [[0. 1.]]
['012260'] 0 [[0.01 0.99]]
['864'] 0 [[0.17 0.83]]
['0338'] 0 [[0. 1.]]
['011'] 0 [[0.03 0.97]]
['70360265'] 0 [[0. 1.]]
['4393', '0'] 0 [[0.06 0.94]
 [0.94 0.06]]
['62'] 0 [[0.6 0.4]]
['7212'] 0 [[0. 1.]]
['8644346', '30', '96'] 0 [[0.   1.  ]
 [0.01 0.99]
 [0.14 0.86]]
['130571'] 0 [[0.24 0.76]]
['14178'] 0 [[0.67 0.33]]
['3220', '282163'] 1 [[0.22 0.78]
 [0.07 0.93]]
['407'] 0 [[0. 1.]]
['14'] 0 [[0.66 0.34]]
['2454', '1033'] 1 [[0.64 0.36]
 [0.25 0.75]]
['17871401'] 0 [[0. 1.]]
['3192'] 0 [[0.22 0.78]]
['862751'] 0 [[0.31 0.69]]
['1'] 0 [[0.94 0.06]]
['050161', '0003'] 1 [[0.12 0.88]
 [0.03 0.97]]
['3889