In [3]:
import csv
import pandas as pd
import sklearn.svm as svm
import numpy as np
import nltk.tokenize as tk
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer # allows us to encode text data for ML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split # used to split data in to test and validation sets
from sklearn.metrics import accuracy_score # calculates accuracy of our model
from nltk.stem import WordNetLemmatizer # word net of english language used for lemmatization


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

stop_words = ['PHOENIX', 'AZ', 'AT', 'AS', 'SCOTTSDALE', 'CITUT', 'NY']


def import_data():
    df = pd.read_csv('disc_transactions.CSV')
    return df

"""
Clean data by removing any non Alpha characters and
Only keeping transactions that are greater than 0. These
represent purchases. Transactions less than 0 represent
payments to the credit card company
"""
def clean_data(df):
    df_clean = df.loc[df['Amount'] > 0]
    df_clean.loc[:,'Description'] = df_clean.Description.str.replace('[^a-zA-Z]', ' ')
    df_clean['Description'] = df_clean['Description'].str.lower()
    return df_clean

"""
Encode description data using bag of words approach
Each word in the text will now become a feature, increasing our feature count significantly
"""
def encode_data(X):
    count_vec = CountVectorizer(stop_words=stop_words)
    x_encoded = count_vec.fit_transform(X.Description).toarray()
    encode_df = pd.DataFrame(x_encoded)
    return encode_df

def encode_data_tfid(X):
    tfid = TfidfVectorizer(stop_words='english')
    x_encoded = tfid.fit_transform(X.Description).toarray()
    encoded_df = pd.DataFrame(x_encoded)
    return encoded_df

""" Split data into predictor variables and target variables """
def split_input_output(data):
    X = data[['Description', 'Amount']]
    y = data['Category']
    return X, y


"""
Lemmatize the words in the description
"""
def lemm_x(X):
    lemm = WordNetLemmatizer()
    tokenize = tk.WhitespaceTokenizer()
    for i, r in X.iterrows():
        desc = r['Description']
        r['Description'] = [lemm.lemmatize(w) for w in tokenize.tokenize(desc)]
    return X

df = import_data()

df_clean = clean_data(df)
df_clean = df_clean.reset_index()
X, y = split_input_output(df_clean)
y = y.reset_index()
X_encoded = encode_data_tfid(X)

"""
Split the data into train and test sets
We are using a 30/70 split.
33.33% of data will be training
67% will be testing data
"""

x_train, x_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.33)

# create the model
transaction_classifier = svm.SVC(kernel='rbf', decision_function_shape='ovr', C=5, gamma='scale')
# fit the model
transaction_classifier.fit(x_train, y_train['Category'])

# run standard prediction
transaction_prediction = transaction_classifier.predict(x_test)
display(transaction_prediction)
# print the accuracy of the model
accuracy_score(y_test['Category'], transaction_prediction)



  df_clean.loc[:,'Description'] = df_clean.Description.str.replace('[^a-zA-Z]', ' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Description'] = df_clean['Description'].str.lower()


array(['Merchandise', 'Restaurants', 'Restaurants', 'Merchandise',
       'Restaurants', 'Restaurants', 'Restaurants', 'Services',
       'Restaurants', 'Restaurants', 'Supermarkets', 'Restaurants',
       'Merchandise', 'Restaurants', 'Restaurants', 'Education',
       'Restaurants', 'Gasoline', 'Supermarkets', 'Restaurants',
       'Gasoline', 'Restaurants', 'Supermarkets', 'Gasoline', 'Education',
       'Restaurants', 'Supermarkets', 'Restaurants', 'Gasoline',
       'Restaurants', 'Restaurants', 'Travel/ Entertainment',
       'Restaurants', 'Restaurants', 'Department Stores', 'Gasoline',
       'Supermarkets', 'Restaurants', 'Merchandise', 'Restaurants',
       'Restaurants', 'Restaurants', 'Restaurants', 'Restaurants',
       'Merchandise', 'Supermarkets', 'Gasoline', 'Supermarkets',
       'Restaurants', 'Restaurants', 'Restaurants', 'Merchandise',
       'Gasoline', 'Merchandise', 'Travel/ Entertainment', 'Supermarkets',
       'Merchandise', 'Department Stores', 'Travel/ Ente

0.9117647058823529