In [1]:
import os
import re
import nltk
import numpy as np
import pandas as pd
from string import digits

# get data online
import requests
from bs4 import BeautifulSoup

# data normalization dependences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer

# models
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier,ExtraTreesClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline,Pipeline

# model evaluation metrics

from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score,precision_score,f1_score

# visualization dependences
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

# print the input directory available files
#print(os.listdir("../input/mbti-type"))

import warnings
warnings.filterwarnings("ignore")

In [7]:
df = pd.read_csv('BankStatmentLabelling.csv')
df.head()

Unnamed: 0,Transaction Date,Posting Date,Description,Debits,Credits,Category,Unnamed: 6,CATEGORIES,Unnamed: 8
0,2020/02/10,2020/02/10,MTN DATA BUNDLES,499.0,,COMMUNICATION,,HOME,
1,2020/02/08,2020/02/09,APPLE.COM/BILL CORK IE,74.99,,ENTERTAINMENT & EAT OUT,,FOOD,
2,2020/02/07,2020/02/09,PARKHURST HARDWARE RE PARKHURST ZA,75.0,,HOME,,TRANSPORT,
3,2020/02/07,2020/02/08,SEATTLE RENNIES HOUSE JOHANNESBURG ZA,31.5,,ENTERTAINMENT & EAT OUT,,LIFE AND HEALTH,
4,2020/02/07,2020/02/08,FLM RENNIE HOUSE EATER BRAAMFONTEIN ZA,29.98,,ENTERTAINMENT & EAT OUT,,COMMUNICATION,


In [9]:
df.rename(columns = {'Transaction Date':'Transaction_Date','Posting Date':'Posting_Date'}, inplace = True)
df.head()

Unnamed: 0,Transaction_Date,Posting_Date,Description,Debits,Credits,Category,Unnamed: 6,CATEGORIES,Unnamed: 8
0,2020/02/10,2020/02/10,MTN DATA BUNDLES,499.0,,COMMUNICATION,,HOME,
1,2020/02/08,2020/02/09,APPLE.COM/BILL CORK IE,74.99,,ENTERTAINMENT & EAT OUT,,FOOD,
2,2020/02/07,2020/02/09,PARKHURST HARDWARE RE PARKHURST ZA,75.0,,HOME,,TRANSPORT,
3,2020/02/07,2020/02/08,SEATTLE RENNIES HOUSE JOHANNESBURG ZA,31.5,,ENTERTAINMENT & EAT OUT,,LIFE AND HEALTH,
4,2020/02/07,2020/02/08,FLM RENNIE HOUSE EATER BRAAMFONTEIN ZA,29.98,,ENTERTAINMENT & EAT OUT,,COMMUNICATION,


In [13]:
def expand_text(df):
    df=df.copy()
    def convert(text):
        text=text.lower()
#         for key,val in contractions.items():
#             if key in text:
#                 text = text.replace(key,val)
        return text
    df['Description']=df['Description'].apply(convert)
    df['Category']=df['Category'].apply(convert)
#     df['CATEGORIES']=df['CATEGORIES'].apply(convert)
    return df
df = expand_text(df)
df.head()

Unnamed: 0,Transaction_Date,Posting_Date,Description,Debits,Credits,Category,Unnamed: 6,CATEGORIES,Unnamed: 8
0,2020/02/10,2020/02/10,mtn data bundles,499.0,,communication,,HOME,
1,2020/02/08,2020/02/09,apple.com/bill cork ie,74.99,,entertainment & eat out,,FOOD,
2,2020/02/07,2020/02/09,parkhurst hardware re parkhurst za,75.0,,home,,TRANSPORT,
3,2020/02/07,2020/02/08,seattle rennies house johannesburg za,31.5,,entertainment & eat out,,LIFE AND HEALTH,
4,2020/02/07,2020/02/08,flm rennie house eater braamfontein za,29.98,,entertainment & eat out,,COMMUNICATION,


In [15]:
# Remove specific punctuation python
table = str.maketrans("~!)\/><`^%$#@+=&*:;_-.,'[{?]|}(",31*" ")
df['Description']=df['Description'].apply(lambda x: x.translate(table))
df['Category']=df['Category'].apply(lambda x: x.translate(table))
# remove digits 
remove_digits = str.maketrans('', '', digits)
df.Category = df.Category.apply(lambda x: x.translate(remove_digits))
df.Description = df.Description.apply(lambda x: x.translate(remove_digits))
df.head()

Unnamed: 0,Transaction_Date,Posting_Date,Description,Debits,Credits,Category,Unnamed: 6,CATEGORIES,Unnamed: 8
0,2020/02/10,2020/02/10,mtn data bundles,499.0,,communication,,HOME,
1,2020/02/08,2020/02/09,apple com bill cork ie,74.99,,entertainment eat out,,FOOD,
2,2020/02/07,2020/02/09,parkhurst hardware re parkhurst za,75.0,,home,,TRANSPORT,
3,2020/02/07,2020/02/08,seattle rennies house johannesburg za,31.5,,entertainment eat out,,LIFE AND HEALTH,
4,2020/02/07,2020/02/08,flm rennie house eater braamfontein za,29.98,,entertainment eat out,,COMMUNICATION,


In [17]:
import itertools
df['Description']=df['Description'].apply(lambda x: ''.join(''.join(s)[:2] for _, s in itertools.groupby(x)))
df.head()

Unnamed: 0,Transaction_Date,Posting_Date,Description,Debits,Credits,Category,Unnamed: 6,CATEGORIES,Unnamed: 8
0,2020/02/10,2020/02/10,mtn data bundles,499.0,,communication,,HOME,
1,2020/02/08,2020/02/09,apple com bill cork ie,74.99,,entertainment eat out,,FOOD,
2,2020/02/07,2020/02/09,parkhurst hardware re parkhurst za,75.0,,home,,TRANSPORT,
3,2020/02/07,2020/02/08,seattle rennies house johannesburg za,31.5,,entertainment eat out,,LIFE AND HEALTH,
4,2020/02/07,2020/02/08,flm rennie house eater braamfontein za,29.98,,entertainment eat out,,COMMUNICATION,


In [18]:
# most common words in the df
freq_df = pd.Series(' '.join(df['Description']).split()).value_counts()[:10]
print('train data: \n',freq_df)

train data: 
 uber     166
sa       160
m        152
hel      137
fee      109
c         86
nov       81
t         70
intl      65
trans     65
dtype: int64


In [20]:
df['Description']=df['Description'].apply(lambda x: " ".join(x.split()))
df.head()

Unnamed: 0,Transaction_Date,Posting_Date,Description,Debits,Credits,Category,Unnamed: 6,CATEGORIES,Unnamed: 8
0,2020/02/10,2020/02/10,mtn data bundles,499.0,,communication,,HOME,
1,2020/02/08,2020/02/09,apple com bill cork ie,74.99,,entertainment eat out,,FOOD,
2,2020/02/07,2020/02/09,parkhurst hardware re parkhurst za,75.0,,home,,TRANSPORT,
3,2020/02/07,2020/02/08,seattle rennies house johannesburg za,31.5,,entertainment eat out,,LIFE AND HEALTH,
4,2020/02/07,2020/02/08,flm rennie house eater braamfontein za,29.98,,entertainment eat out,,COMMUNICATION,


In [21]:
wordnet_lemmatizer = WordNetLemmatizer()
def my_tokenizer(tokens):
    tokens = nltk.tokenize.word_tokenize(tokens)
    tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens if len(token)>2]
    tokens = " ".join(tokens)
    return tokens
df.Description = df.Description.apply(lambda x: my_tokenizer(x))
df.head()

Unnamed: 0,Transaction_Date,Posting_Date,Description,Debits,Credits,Category,Unnamed: 6,CATEGORIES,Unnamed: 8
0,2020/02/10,2020/02/10,mtn data bundle,499.0,,communication,,HOME,
1,2020/02/08,2020/02/09,apple com bill cork,74.99,,entertainment eat out,,FOOD,
2,2020/02/07,2020/02/09,parkhurst hardware parkhurst,75.0,,home,,TRANSPORT,
3,2020/02/07,2020/02/08,seattle rennies house johannesburg,31.5,,entertainment eat out,,LIFE AND HEALTH,
4,2020/02/07,2020/02/08,flm rennie house eater braamfontein,29.98,,entertainment eat out,,COMMUNICATION,


In [22]:
df = df.drop(['Unnamed: 6','Unnamed: 8','CATEGORIES','Transaction_Date'], axis=1)
df.head()

Unnamed: 0,Posting_Date,Description,Debits,Credits,Category
0,2020/02/10,mtn data bundle,499.0,,communication
1,2020/02/09,apple com bill cork,74.99,,entertainment eat out
2,2020/02/09,parkhurst hardware parkhurst,75.0,,home
3,2020/02/08,seattle rennies house johannesburg,31.5,,entertainment eat out
4,2020/02/08,flm rennie house eater braamfontein,29.98,,entertainment eat out


In [24]:
y = df.Category.values
X = df.drop(['Category'], axis=1)

In [25]:
X_transformed = pd.get_dummies(X, sparse=False, drop_first=True)
X_transformed.head()

Unnamed: 0,Posting_Date_01-Oct-19,Posting_Date_01/01/2020,Posting_Date_02-Dec-19,Posting_Date_02-Jan-20,Posting_Date_02-Nov-19,Posting_Date_02-Oct-19,Posting_Date_02-Sep-19,Posting_Date_02/01/2020,Posting_Date_03-Aug-19,Posting_Date_03-Dec-19,...,Debits_866.02,Debits_900,Debits_9387.93,Credits_250,Credits_300,Credits_4 950.00,Credits_4950,Credits_500,Credits_5000,Credits_6500
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Split the training data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.20, random_state = 47)

In [79]:
from sklearn.linear_model import LogisticRegression
#Simple Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
lm_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_test, y_test) * 100, 2)
print(acc_log)
cr_log = classification_report(y_test, lm_pred)
print(cr_log)

71.96
                         precision    recall  f1-score   support

          communication       1.00      0.86      0.92        14
              education       0.00      0.00      0.00         1
entertainment   eat out       0.53      0.84      0.65        31
                   fees       0.93      0.86      0.89        44
                   food       1.00      0.33      0.50         9
      gifts   donations       0.00      0.00      0.00         2
                   home       1.00      0.08      0.15        12
               interest       0.00      0.00      0.00         1
        life and health       0.00      0.00      0.00         3
                  other       1.00      0.14      0.25         7
  saving and investment       0.00      0.00      0.00         2
      short term credit       1.00      0.43      0.60         7
              transport       0.66      0.93      0.77        56

               accuracy                           0.72       189
              mac

In [78]:
from sklearn.svm import SVC
svc = SVC(gamma='auto')
#Support Vector Machine
svc.fit(X_train, y_train)
svm_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_test, y_test) * 100, 2)
print(acc_svc)
cr_svc = classification_report(y_test, svm_pred)
print(cr_svc)

29.63
                         precision    recall  f1-score   support

          communication       0.00      0.00      0.00        14
              education       0.00      0.00      0.00         1
entertainment   eat out       0.00      0.00      0.00        31
                   fees       0.00      0.00      0.00        44
                   food       0.00      0.00      0.00         9
      gifts   donations       0.00      0.00      0.00         2
                   home       0.00      0.00      0.00        12
               interest       0.00      0.00      0.00         1
        life and health       0.00      0.00      0.00         3
                  other       0.00      0.00      0.00         7
  saving and investment       0.00      0.00      0.00         2
      short term credit       0.00      0.00      0.00         7
              transport       0.30      1.00      0.46        56

               accuracy                           0.30       189
              mac

In [77]:
from sklearn.neighbors import KNeighborsClassifier
#K Nearest Neighbours Classifier

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_test, y_test) * 100, 2)
print(acc_knn)
cr_knn = classification_report(y_test, knn_pred)
print(cr_knn)

69.84
                         precision    recall  f1-score   support

          communication       0.61      1.00      0.76        14
              education       0.00      0.00      0.00         1
entertainment   eat out       0.47      0.68      0.55        31
                   fees       0.78      0.91      0.84        44
                   food       0.80      0.44      0.57         9
      gifts   donations       0.50      0.50      0.50         2
                   home       0.75      0.25      0.38        12
               interest       0.00      0.00      0.00         1
        life and health       0.00      0.00      0.00         3
                  other       0.25      0.14      0.18         7
  saving and investment       0.00      0.00      0.00         2
      short term credit       1.00      0.43      0.60         7
              transport       0.90      0.80      0.85        56

               accuracy                           0.70       189
              mac

In [76]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
gnb_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_test, y_test) * 100, 2)
print(acc_gaussian)
cr_gaussian = classification_report(y_test, gnb_pred)
print(cr_gaussian)

70.37
                         precision    recall  f1-score   support

          communication       0.71      0.86      0.77        14
              education       0.00      0.00      0.00         1
entertainment   eat out       0.68      0.55      0.61        31
                   fees       0.81      0.86      0.84        44
                   food       0.60      0.33      0.43         9
      gifts   donations       0.25      1.00      0.40         2
                   home       0.78      0.58      0.67        12
               interest       0.00      0.00      0.00         1
        life and health       0.50      0.33      0.40         3
                  other       0.30      0.43      0.35         7
  saving and investment       0.17      0.50      0.25         2
      short term credit       0.57      0.57      0.57         7
              transport       0.92      0.80      0.86        56

               accuracy                           0.70       189
              mac

In [73]:
from sklearn.linear_model import Perceptron
# Perceptron

perceptron = Perceptron(penalty='elasticnet', alpha=0.0001)
perceptron.fit(X_train, y_train)
ptn_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_test, y_test) * 100, 2)
print(acc_perceptron)
cr_perceptron = classification_report(y_test, ptn_pred)
print(cr_perceptron)

79.37
                         precision    recall  f1-score   support

          communication       0.87      0.93      0.90        14
              education       0.00      0.00      0.00         1
entertainment   eat out       0.54      1.00      0.70        31
                   fees       0.97      0.86      0.92        44
                   food       0.80      0.44      0.57         9
      gifts   donations       0.67      1.00      0.80         2
    gifts and donations       0.00      0.00      0.00         0
                   home       0.88      0.58      0.70        12
               interest       0.00      0.00      0.00         1
        life and health       0.00      0.00      0.00         3
                  other       0.50      0.43      0.46         7
  saving and investment       0.00      0.00      0.00         2
      short term credit       1.00      0.57      0.73         7
              transport       0.96      0.86      0.91        56

               ac

In [72]:
from sklearn.svm import SVC, LinearSVC
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
lsvc_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_test, y_test) * 100, 2)
print(acc_linear_svc)
cr_linear_svc = classification_report(y_test, lsvc_pred)
print(cr_linear_svc)

84.13
                         precision    recall  f1-score   support

          communication       0.93      1.00      0.97        14
              education       0.00      0.00      0.00         1
entertainment   eat out       0.67      0.90      0.77        31
                   fees       0.95      0.89      0.92        44
                   food       0.83      0.56      0.67         9
      gifts   donations       1.00      1.00      1.00         2
    gifts and donations       0.00      0.00      0.00         0
                   home       0.90      0.75      0.82        12
               interest       0.00      0.00      0.00         1
        life and health       1.00      0.67      0.80         3
                  other       0.67      0.57      0.62         7
  saving and investment       0.00      0.00      0.00         2
      short term credit       1.00      0.57      0.73         7
              transport       0.88      0.93      0.90        56

               ac

In [80]:
from sklearn.linear_model import SGDClassifier
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
sgd_pred = sgd.predict(X_test)
acc_sgd = round(decision_tree.score(X_test, y_test) * 100, 2)
print(acc_sgd)
cr_sgd = classification_report(y_test, sgd_pred)
print(cr_sgd)

74.6
                         precision    recall  f1-score   support

          communication       0.93      1.00      0.97        14
              education       0.00      0.00      0.00         1
entertainment   eat out       0.57      0.97      0.71        31
                   fees       1.00      0.89      0.94        44
                   food       0.80      0.44      0.57         9
      gifts   donations       1.00      1.00      1.00         2
                   home       1.00      0.67      0.80        12
               interest       0.00      0.00      0.00         1
        life and health       1.00      0.33      0.50         3
                  other       0.50      0.86      0.63         7
  saving and investment       0.00      0.00      0.00         2
      short term credit       1.00      0.57      0.73         7
              transport       1.00      0.89      0.94        56

               accuracy                           0.84       189
              macr

In [81]:
from sklearn.tree import DecisionTreeClassifier
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
tree_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_test, y_test) * 100, 2)
print(acc_decision_tree)
cr_decision_tree = classification_report(y_test, tree_pred)
print(cr_decision_tree)

75.66
                         precision    recall  f1-score   support

          communication       1.00      0.79      0.88        14
              education       0.00      0.00      0.00         1
entertainment   eat out       0.49      0.97      0.65        31
                   fees       1.00      0.91      0.95        44
                   food       0.75      0.33      0.46         9
      gifts   donations       0.67      1.00      0.80         2
    gifts and donations       0.00      0.00      0.00         0
                   home       0.83      0.42      0.56        12
               interest       0.00      0.00      0.00         1
        life and health       0.00      0.00      0.00         3
                  other       0.29      0.29      0.29         7
  saving and investment       0.00      0.00      0.00         2
      short term credit       1.00      0.57      0.73         7
              transport       0.98      0.82      0.89        56

               ac

In [82]:
from sklearn.ensemble import RandomForestClassifier
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
rf_pred = random_forest.predict(X_test)
random_forest.score(X_test, y_test)
acc_random_forest = round(random_forest.score(X_test, y_test) * 100, 2)
print(acc_random_forest)
cr_random_forest = classification_report(y_test, rf_pred)
print(cr_random_forest)

77.78
                         precision    recall  f1-score   support

          communication       1.00      0.93      0.96        14
              education       0.00      0.00      0.00         1
entertainment   eat out       0.46      1.00      0.63        31
                   fees       1.00      0.86      0.93        44
                   food       1.00      0.33      0.50         9
      gifts   donations       1.00      0.50      0.67         2
                   home       1.00      0.42      0.59        12
               interest       0.00      0.00      0.00         1
        life and health       0.00      0.00      0.00         3
                  other       0.40      0.29      0.33         7
  saving and investment       0.00      0.00      0.00         2
      short term credit       1.00      0.57      0.73         7
              transport       0.94      0.89      0.92        56

               accuracy                           0.78       189
              mac

In [70]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
7,Linear SVC,84.13
5,Perceptron,79.37
3,Random Forest,74.6
6,Stochastic Gradient Decent,74.6
8,Decision Tree,74.6
2,Logistic Regression,71.96
4,Naive Bayes,70.37
1,KNN,69.84
0,Support Vector Machines,29.63


Perhaps we need to work on getting better accuracy for the smaller class which is not often caught. I also think a blended model where we include SVM which was able to correctly identify 'interest', 'education' and 'life and health needs to be considered'

In [94]:
df.head()

Unnamed: 0,Posting_Date,Description,Debits,Credits,Category
0,2020/02/10,mtn data bundle,499.0,,communication
1,2020/02/09,apple com bill cork,74.99,,entertainment eat out
2,2020/02/09,parkhurst hardware parkhurst,75.0,,home
3,2020/02/08,seattle rennies house johannesburg,31.5,,entertainment eat out
4,2020/02/08,flm rennie house eater braamfontein,29.98,,entertainment eat out


In [98]:
df['amounts'] = df[df.columns[3:5]].apply(
    lambda x: ','.join(x.dropna().astype(int)),
    axis=1
)
df

TypeError: ('sequence item 0: expected str instance, int found', 'occurred at index 19')