# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the expense records from the CSV file.
file_path = 'expenses.csv'
df = pd.read_csv(file_path)

# Parse the Date column so it can be turned into a numeric feature below.
df['Date'] = pd.to_datetime(df['Date'])

# Encode the target: the Category column becomes a pandas categorical and its
# integer codes are used as class labels.
df['Category'] = df['Category'].astype('category')
category_codes = df['Category'].cat.codes

# Build the feature matrix on an explicit copy so that rewriting 'Date' does
# not assign into a slice of df (which triggers SettingWithCopyWarning).
X = df[['Date', 'Amount']].copy()
y = category_codes
# Represent each date as its integer ordinal so the models receive a number.
X['Date'] = X['Date'].apply(lambda x: x.toordinal())

# Split BEFORE scaling: the scaler must learn its mean/variance from the
# training rows only, otherwise test-set statistics leak into training.
# The same random_state and row count yield the same train/test rows as
# splitting the scaled matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features using statistics estimated on the training split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, retained for any later code that refers to it; it is
# produced with the training-split statistics.
X_scaled = scaler.transform(X)

# Logistic Regression: fit on the training split, predict the held-out split,
# then report accuracy.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
acc_log_reg = accuracy_score(y_test, y_pred_log_reg)
print(f"Logistic Regression Accuracy: {acc_log_reg}")

# Support Vector Machine with scikit-learn's default settings.
svm = SVC().fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
acc_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {acc_svm}")

# Gradient Boosting Classifier with scikit-learn's default settings.
gbc = GradientBoostingClassifier().fit(X_train, y_train)
y_pred_gbc = gbc.predict(X_test)
acc_gbc = accuracy_score(y_test, y_pred_gbc)
print(f"Gradient Boosting Classifier Accuracy: {acc_gbc}")

# Model Evaluation
def evaluate_model(y_test, y_pred, model_name):
    """Print a classification report and plot a labelled confusion matrix.

    Args:
        y_test: True category codes for the evaluated samples.
        y_pred: Predicted category codes, aligned with ``y_test``.
        model_name: Display name used in the printed header and plot title.

    Reads the module-level ``df`` to obtain the category names that label
    the heatmap axes.
    """
    print(f"--- {model_name} ---")
    print(classification_report(y_test, y_pred))

    categories = df['Category'].cat.categories
    # Pin the matrix to one row/column per known category code. Without
    # ``labels`` the matrix only covers codes that occur in y_test or y_pred,
    # so a category missing from the split would leave the matrix smaller
    # than the tick-label list and the axis names would no longer line up.
    all_codes = list(range(len(categories)))
    cm = confusion_matrix(y_test, y_pred, labels=all_codes)

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=categories, yticklabels=categories)
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Evaluate each fitted model, in the order the models were trained.
for _model_name, _predictions in (
    ("Logistic Regression", y_pred_log_reg),
    ("SVM", y_pred_svm),
    ("Gradient Boosting Classifier", y_pred_gbc),
):
    evaluate_model(y_test, _predictions, _model_name)

# Visualize expense distribution: total amount spent per category.
# 'Category' is categorical, so ``observed`` is passed explicitly instead of
# relying on the implicit default, which newer pandas versions warn about and
# later change. observed=False keeps one entry per category, matching the
# full category list used for the bar-chart tick labels below.
expense_distribution = df.groupby('Category', observed=False)['Amount'].sum()

# Pie Chart: share of the total per category.
expense_distribution.plot(kind='pie', autopct='%1.1f%%', startangle=90)
plt.title('Expense Distribution')
plt.ylabel('')  # drop the default y-axis label that the Series plot adds
plt.show()

# Bar Chart: absolute totals per category.
expense_distribution.plot(kind='bar', color='skyblue')
plt.title('Expense Distribution by Category')
plt.xlabel('Category')
plt.ylabel('Total Amount')
category_names = df['Category'].cat.categories
plt.xticks(ticks=range(len(category_names)), labels=category_names, rotation=45)
plt.show()

# Box Plot: spread of individual expense amounts within each category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Category', y='Amount', data=df, ax=ax)
ax.set_title('Expense Amount Distribution by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Amount')
plt.xticks(rotation=45)  # acts on the current axes, i.e. the one just drawn
plt.show()

# Time Series Plot: total amount spent on each distinct date.
amount_per_date = df.groupby('Date')['Amount'].sum()
df_time_series = amount_per_date.reset_index()

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(
    df_time_series['Date'],
    df_time_series['Amount'],
    marker='o',
    linestyle='-',
    color='b',
)
ax.set_title('Total Expenses Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Amount')
plt.xticks(rotation=45)  # acts on the current axes, i.e. the one just drawn
ax.grid(True)
plt.show()