In [None]:
import pandas as pd
# Read the two raw inputs from the working directory: the transaction
# records and the per-client profile table.
bank_transaction, user_profile = (
    pd.read_csv(csv_path)
    for csv_path in ("bank_transaction.csv", "user_profile.csv")
)

In [None]:
# Preview the first five transaction rows.
bank_transaction.head(n=5)

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Hand-maintained set of common English stopwords (avoids an external NLP dependency)
stopwords_list = {
    "the", "is", "in", "it", "of", "for", "on", "with", "to", "from", "by", "at",
    "a", "an", "and", "or", "this", "that", "be", "was", "were", "has", "had",
    "have", "as", "but", "if", "then", "so", "because", "about", "into", "out",
    "over", "under", "between", "after", "before", "above", "below", "again",
    "once", "during", "while", "can", "will", "just", "don", "should", "now",
}


def clean_text(text):
    """Normalise a free-text value into a space-separated token string.

    The text is lowercased, runs of digits are deleted, every character that
    is neither a word character nor whitespace is deleted (so "a.b" becomes
    "ab"), and tokens found in ``stopwords_list`` are discarded.

    Args:
        text: Value to clean. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The surviving tokens joined by single spaces, or "" for non-string
        input.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punct = re.sub(r'[^\w\s]', '', without_digits)
    # split() with no argument already ignores leading, trailing and repeated whitespace.
    kept_tokens = (token for token in without_punct.split() if token not in stopwords_list)
    return " ".join(kept_tokens)

# Normalise every description with clean_text before vectorising.
bank_transaction["cleaned_description"] = bank_transaction["description"].apply(clean_text)

# First pass: fit TF-IDF over the full vocabulary so each term can be scored.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(bank_transaction["cleaned_description"])
feature_names = tfidf.get_feature_names_out()

# NOTE: despite the column name, "frequency" is the column-wise SUM of TF-IDF
# weights across all rows, not a count of documents containing the term.
feature_freq = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
feature_df = pd.DataFrame({'feature': feature_names, 'frequency': feature_freq})

# Keep terms whose summed weight reaches 1% of the row count. Adjust as needed.
threshold = len(bank_transaction) * 0.01
selected_features = feature_df.loc[feature_df["frequency"] >= threshold, "feature"].tolist()
if not selected_features:
    # TfidfVectorizer rejects an empty vocabulary; fail here with an actionable message.
    raise ValueError(
        "No TF-IDF term reaches the selection threshold; lower the 0.01 ratio."
    )

# Second pass: refit restricted to the selected terms, so each row is
# normalised over the retained vocabulary only.
tfidf_filtered = TfidfVectorizer(vocabulary=selected_features)
tfidf_matrix_filtered = tfidf_filtered.fit_transform(bank_transaction["cleaned_description"])
tfidf_df_filtered = pd.DataFrame(
    tfidf_matrix_filtered.toarray(),
    columns=selected_features,
    # Reuse the source index so the index-based join below pairs each
    # transaction with its own TF-IDF row even if the index is not 0..n-1.
    index=bank_transaction.index,
)

# rsuffix only takes effect when a term has the same name as an existing
# column (e.g. "amount"); without it DataFrame.join raises on the overlap.
processed_df = (
    bank_transaction
    .drop(columns=["description", "cleaned_description"])
    .join(tfidf_df_filtered, rsuffix="_tfidf")
)
processed_df.head()

In [None]:
# Preview the first five client-profile rows.
user_profile.head(n=5)

In [None]:
# Convert bool to int in user_profile
from sklearn.preprocessing import OneHotEncoder

# Lowercase the column names so they match the lowercase keys used in later cells.
user_profile.columns = user_profile.columns.str.lower()
cols = user_profile.select_dtypes(include=['bool']).columns

# Cast True/False straight to 1/0. OneHotEncoder(drop='if_binary') leaves
# single-category features intact, so a column that is constantly False came
# out as all 1s; a plain cast encodes every column correctly. It is also a
# no-op when no bool columns remain (e.g. the cell is run a second time).
new_user_profile = user_profile[cols].astype(int)
user_profile[cols] = new_user_profile
user_profile.head()

In [None]:
# Attach each client's profile columns to their transactions.
# A left join keeps every transaction, including those with no matching profile.
combined = processed_df.merge(user_profile, how='left', on='client_id')
combined.head()

In [None]:
# LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

pd.set_option('future.no_silent_downcasting', True)

# Keep only labelled rows and replace the category label with its integer code.
# Chaining .assign builds a fresh frame instead of writing into the result of
# dropna(), which can trigger pandas' chained-assignment warning.
combined = (
    combined
    .dropna(subset=["category"])
    .assign(category=lambda frame: frame["category"].astype("category").cat.codes)
)

# Identifier and date columns are excluded; every other column is a feature.
xcols = combined.drop(columns=["category", "client_id", "bank_id", "account_id", "txn_id", "txn_date"])
ycol = combined["category"]
x_train, x_test, y_train, y_test = train_test_split(xcols, ycol, test_size=0.25, random_state=42, stratify=ycol)

# The saga solver shuffles the data, so random_state is pinned to make the
# cross-validation scores and the fitted model reproducible between runs.
# NOTE(review): saga converges fastest on similarly scaled features; the
# features here are not scaled - consider adding a StandardScaler step.
model = LogisticRegression(solver="saga", max_iter=1000, tol=1e-3, n_jobs=-1, random_state=42)
cv_scores = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')

# Fit on the full training split and score on the held-out test split.
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")

In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report

pd.set_option('future.no_silent_downcasting', True)

# Keep only labelled rows and replace the category label with its integer code.
# Chaining .assign builds a fresh frame instead of writing into the result of
# dropna(), which can trigger pandas' chained-assignment warning.
combined = (
    combined
    .dropna(subset=["category"])
    .assign(category=lambda frame: frame["category"].astype("category").cat.codes)
)

# Identifier and date columns are excluded; every other column is a feature.
xcols = combined.drop(columns=["category", "client_id", "bank_id", "account_id", "txn_id", "txn_date"])
ycol = combined["category"]
x_train, x_test, y_train, y_test = train_test_split(xcols, ycol, test_size=0.25, random_state=42, stratify=ycol)

model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)  # Using 100 trees, parallel processing
cv_scores = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')

# Fit on the full training split and score on the held-out test split.
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Each metric is computed once; the original evaluated both of them twice.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")

In [None]:
# Pipeline
import pandas as pd
import re
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import csr_matrix

# Parse txn_date into pandas datetimes, overwriting the column in place.
# NOTE(review): txn_date is selected into X later in this cell, but the
# ColumnTransformer there uses remainder="drop", so the parsed dates never
# reach the classifier - confirm whether date features were intended.
bank_transaction["txn_date"] = bank_transaction["txn_date"].pipe(pd.to_datetime)

# NOTE(review): stopwords_list and clean_text duplicate the definitions from
# the text-cleaning cell earlier in this notebook; keep the two in sync or
# drop one of them.
stopwords_list = {
    "the", "is", "in", "it", "of", "for", "on", "with", "to", "from", "by", "at",
    "a", "an", "and", "or", "this", "that", "be", "was", "were", "has", "had",
    "have", "as", "but", "if", "then", "so", "because", "about", "into", "out",
    "over", "under", "between", "after", "before", "above", "below", "again",
    "once", "during", "while", "can", "will", "just", "don", "should", "now",
}


def clean_text(text):
    """Normalise a free-text value into a space-separated token string.

    The text is lowercased, runs of digits are deleted, every character that
    is neither a word character nor whitespace is deleted (so "a.b" becomes
    "ab"), and tokens found in ``stopwords_list`` are discarded.

    Args:
        text: Value to clean. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The surviving tokens joined by single spaces, or "" for non-string
        input.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punct = re.sub(r'[^\w\s]', '', without_digits)
    # split() with no argument already ignores leading, trailing and repeated whitespace.
    kept_tokens = (token for token in without_punct.split() if token not in stopwords_list)
    return " ".join(kept_tokens)

# Normalise every description with clean_text before vectorising.
bank_transaction["cleaned_description"] = bank_transaction["description"].apply(clean_text)

user_profile.columns = user_profile.columns.str.lower()  # Convert column names to lowercase
bool_cols = user_profile.select_dtypes(include=['bool']).columns

# Cast True/False straight to 1/0. OneHotEncoder(drop='if_binary') leaves
# single-category features intact, so a column that is constantly False came
# out as all 1s; a plain cast encodes every column correctly.
# NOTE: if the earlier bool-conversion cell has already run, these columns are
# integers by now, bool_cols is empty and this step is a no-op.
encoded_df = user_profile[bool_cols].astype(int)
user_profile[bool_cols] = encoded_df

# Attach profiles to transactions, keep only labelled rows, and replace the
# category label with its integer code. Chaining .assign avoids writing into
# the intermediate frame returned by dropna().
combined = (
    pd.merge(bank_transaction, user_profile, on="client_id", how="left")
    .dropna(subset=["category"])
    .assign(category=lambda frame: frame["category"].astype("category").cat.codes)
)

features = ["cleaned_description", "txn_date", "amount"] + list(bool_cols)
X = combined[features]
y = combined["category"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# NOTE(review): only cleaned_description and amount are transformed below;
# remainder="drop" discards txn_date and the bool columns even though they are
# selected into X. Confirm whether those columns were meant to be features.
preprocessor = ColumnTransformer([
    ("tfidf", TfidfVectorizer(max_features=100), "cleaned_description"),  # Reduce TF-IDF features
    ("amount", StandardScaler(), ["amount"]),  # Normalize amount
], remainder="drop")

# The preprocessing lives inside the Pipeline, so TF-IDF and the scaler are
# fitted on the training split only when model.fit is called.
model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1))  # Reduce tree count
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Model Accuracy {accuracy}")
print(f"Classfication report\n{classification_rep}")