# Build User Data Analysis Models
### Expense Categorisation (code template)

#### Import necessary packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### 1. Data preprocessing

In [None]:
# Set data file path
# NOTE(review): '......data.csv' looks like a template placeholder — presumably
# it must be replaced with the real CSV location before running; confirm.
file_path = '......data.csv'

In [None]:
# Read the CSV at file_path into the working DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Remove unnecessary columns
# NOTE(review): 'unnecessary_column' looks like a template stand-in; drop()
# raises KeyError when the named column does not exist.
df.drop('unnecessary_column', axis=1, inplace=True)

# Handling missing values
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[...]: the chained in-place form operates on a temporary object, emits a
# FutureWarning on pandas 2.x and does not modify df under Copy-on-Write.
df['subClass_title'] = df['subClass_title'].fillna('Unknown')

# Fill in mean, median, etc.
# The mean is computed before filling, so it only reflects the observed amounts.
df['amount'] = df['amount'].fillna(df['amount'].mean())



In [None]:
# Parse the 'date' column into pandas datetime values
df['date'] = df['date'].pipe(pd.to_datetime)


In [None]:
# Create new columns or modify existing columns
# For example: derive a 'year' column from the 'date' column. The .dt accessor
# requires 'date' to already hold datetime-like values.
df['year'] = df['date'].dt.year


In [None]:
# Text data cleaning (preparing for NLP processing)
df['description'] = df['description'].str.lower()  # Convert to lowercase
# Remove special characters: drop everything that is neither a word character
# nor whitespace. regex=True is required because Series.str.replace treats the
# pattern as literal text by default since pandas 2.0, which would leave the
# descriptions unchanged. The raw string keeps \w and \s from being read as
# (invalid) Python string escapes.
df['description'] = df['description'].str.replace(r'[^\w\s]', '', regex=True)


In [None]:
# Show the first five rows of the processed data frame as a sanity check
print(df.head(n=5))

#### 2. Preliminary classification based on subClass_title

In [None]:
# Create a new column 'category' to store the classification results.
# It starts out as a copy of the 'subClass_title' values.
df['category'] = df['subClass_title']


In [None]:
# Optionally classify 'Unknown' as 'Other' if there is an 'Unknown' or NaN value in subClass_title
# Reassign the column rather than calling replace(inplace=True) on df[...]:
# the chained in-place call acts on a temporary object and does not modify df
# under pandas Copy-on-Write. fillna() covers the NaN case named above, so
# both remaining NaN values and the 'Unknown' placeholder end up as 'Other'.
df['category'] = df['category'].fillna('Other').replace('Unknown', 'Other')

In [None]:
# df now carries a 'category' column holding the classification taken from
# subClass_title (which contains the predefined classification names).
# Print the two columns side by side for the first rows to check the result.
print(df.loc[:, ['subClass_title', 'category']].head())

#### 3. Perform natural language processing on records with missing subClass_title

In [None]:
# Split the data set into training set and test set
# training_df was never defined, so build it here: train only on rows whose
# subClass_title supplies a real label. Rows that are NaN or carry the
# 'Unknown' placeholder are the ones the model has to predict later.
has_label = df['subClass_title'].notna() & (df['subClass_title'] != 'Unknown')
# CountVectorizer cannot process NaN documents, so rows without a description
# are left out of the training data.
training_df = df[has_label].dropna(subset=['description'])

# 70/30 split; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    training_df['description'], training_df['category'], test_size=0.3, random_state=42)


In [None]:
# Learn the vocabulary from the training text only, then encode both splits
# as numeric vectors using that same fitted vocabulary.
vectorizer = CountVectorizer().fit(X_train)
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Fit a multinomial Naive Bayes classifier on the vectorized training data
# (fit() returns the estimator itself, so creation and training can be chained).
model = MultinomialNB().fit(X_train_vec, y_train)

In [None]:
# Predict labels for the held-out test split and report the accuracy
y_pred = model.predict(X_test_vec)
print(f"Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred)}")

In [None]:
# Use model to classify data with missing subClass_title
# A missing title may still be NaN or may already carry the 'Unknown'
# placeholder written during preprocessing, so match both. Testing isna()
# alone selects nothing once the placeholder has been filled in.
needs_prediction = df['subClass_title'].isna() | (df['subClass_title'] == 'Unknown')
# .copy() yields an independent frame, so columns can be added to it later
# without writing to a slice of df.
missing_subclass = df[needs_prediction].copy()
# CountVectorizer cannot process NaN documents; treat a missing description
# as empty text so the row still gets a prediction.
descriptions = missing_subclass['description'].fillna('')
descriptions_vec = vectorizer.transform(descriptions)
# predict() raises on an input with zero rows, so return an empty result
# (with the same dtype as the learned class labels) when nothing is missing.
predicted_categories = (
    model.predict(descriptions_vec)
    if descriptions_vec.shape[0] > 0
    else np.empty(0, dtype=model.classes_.dtype)
)

In [None]:
# Merge predictions back into original data frame
# Take an explicit copy first so the new column is not written to a slice
# of df (avoids SettingWithCopyWarning).
missing_subclass = missing_subclass.copy()
missing_subclass['predicted_category'] = predicted_categories

# df.update() only overwrites columns that already exist in df, so it never
# carried the new 'predicted_category' column across. Assign an index-aligned
# Series instead: predicted rows receive their label, all other rows get NaN.
df['predicted_category'] = pd.Series(
    predicted_categories, index=missing_subclass.index, dtype=object)