In [None]:
# Install necessary libraries
!pip install tensorflow pandas scikit-learn openpyxl  # openpyxl is for reading Excel files




In [2]:
import pandas as pd

# Load the Excel file
file_path = '/content/drive/MyDrive/bankdataset.xlsx'  # Path to the uploaded file
df = pd.read_excel(file_path)

# Display the first few rows of the data
df.head()


Unnamed: 0,Date,Domain,Location,Value,Transaction_count
0,2022-01-01,RESTRAUNT,Bhuj,365554,1932
1,2022-01-01,INVESTMENTS,Ludhiana,847444,1721
2,2022-01-01,RETAIL,Goa,786941,1573
3,2022-01-01,INTERNATIONAL,Mathura,368610,2049
4,2022-01-01,RESTRAUNT,Madurai,615681,1519


In [3]:

# Define numerical and categorical columns
!pip install numpy
import numpy as np
from sklearn.preprocessing import StandardScaler # Import the StandardScaler

numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()  # All numerical columns
categorical_cols = df.select_dtypes(include=[object]).columns.tolist()  # All categorical columns

# Ensure all numerical columns are in the correct format (e.g., float or int)
df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric, errors='coerce')

# Check for any NaNs introduced by invalid conversions and handle them
df.dropna(inplace=True)  # Optionally handle missing values

# One-hot encode categorical columns
df_encoded = pd.get_dummies(df[categorical_cols], drop_first=True)

# Combine numerical and encoded categorical columns
df_combined = pd.concat([df[numerical_cols], df_encoded], axis=1)

# Scale only the numerical columns
scaler = StandardScaler()
df_combined[numerical_cols] = scaler.fit_transform(df_combined[numerical_cols])

# Convert back to DataFrame for readability (optional)
df_scaled = pd.DataFrame(df_combined, columns=df_combined.columns)

# Example: Dynamically choosing a target based on the prompt
prompt_target = 'Value'  # This can change based on the question
if prompt_target in df_scaled.columns:
    X = df_scaled.drop(prompt_target, axis=1)
    y = df_scaled[prompt_target]

# Convert boolean columns to integers (if applicable)
    X = X.astype(float)  # Ensure all features are floats
    y = y.astype(float)  # Ensure the target is also in float format

    # Now X and y are ready for model training with the specified target.
    print(f"Prepared data for target variable: {prompt_target}")
else:
    print(f"Target variable '{prompt_target}' not found in the DataFrame.")

Prepared data for target variable: Value


In [4]:
import tensorflow as tf

# Convert DataFrame to TensorFlow Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((X, y))

# Shuffle and batch the dataset
batch_size = 32
train_dataset = train_dataset.shuffle(buffer_size=len(X)).batch(batch_size)

# Prefetch to improve performance
train_dataset = train_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)


In [5]:
# For binary classification
# Ensure the target is binary (0 or 1)
if len(np.unique(y)) == 2:
    print("Binary Classification detected")
    loss_fn = 'binary_crossentropy'
    final_activation = 'sigmoid'

# For multi-class classification
elif len(np.unique(y)) > 2:
    print("Multi-Class Classification detected")
    from tensorflow.keras.utils import to_categorical
    y = to_categorical(y)  # One-hot encode the target for multi-class
    loss_fn = 'categorical_crossentropy'
    final_activation = 'softmax'

# For regression
else:
    print("Regression problem detected")
    loss_fn = 'mean_squared_error'
    final_activation = None  # No activation needed for regression


Multi-Class Classification detected


In [6]:
import tensorflow as tf
import numpy as np

# Adjust the model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(X.shape[1],)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(y.shape[1], activation='softmax')
])

# Compile the model with a lower learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Callbacks for early stopping and learning rate adjustment
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: 0.0001 * np.exp(-epoch / 20)  # Example: exponential decay
)




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)  # Adjust learning rate as needed
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(train_dataset, epochs=30)


Epoch 1/30


ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None,), output.shape=(None, 2)

In [10]:
print(df.columns)


Index(['Date', 'Domain', 'Location', 'Value', 'Transaction_count'], dtype='object')


In [None]:
import pandas as pd

# Summarize transactions by domain
def summarize_by_domain(df, domain):
    if domain in df['Domain'].values:
        total_value = df[df['Domain'] == domain]['Value'].sum()
        total_count = df[df['Domain'] == domain]['Transaction_count'].sum()
        return f"Total transaction value for domain '{domain}': {total_value}. Total number of transactions: {total_count}."
    else:
        return f"No transactions found for the domain: {domain}."

# Retrieve transactions over a certain amount
def transactions_above_amount(df, amount):
    large_transactions = df[df['Value'] > amount]
    if large_transactions.empty:
        return f"No transactions found above {amount}."
    else:
        return large_transactions[['Date', 'Location', 'Domain', 'Value', 'Transaction_count']].to_string(index=False)

# Summarize transactions for a date range
def summarize_by_date_range(df, start_date, end_date):
    # Ensure the date column is in datetime format
    df['Date'] = pd.to_datetime(df['Date'])
    filtered_transactions = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]

    if filtered_transactions.empty:
        return f"No transactions found between {start_date} and {end_date}."
    else:
        total_value = filtered_transactions['Value'].sum()
        total_count = filtered_transactions['Transaction_count'].sum()
        return f"Total transaction value between {start_date} and {end_date} is {total_value}, with {total_count} transactions."


In [11]:
def chatbot(df):
    print("Welcome to the Banking Transactions Chatbot!")
    print("Ask me questions about transaction data.")

    while True:
        user_input = input("\nAsk a question (or type 'exit' to stop): ").lower()

        if "exit" in user_input:
            print("Goodbye!")
            break

        # Handle queries about total transaction by date
        elif "total" in user_input and "date" in user_input:
            date = input("Which date are you asking about? (YYYY-MM-DD format): ")
            try:
                date = pd.to_datetime(date)
                result = summarize_by_date_range(df, date, date)  # Pass df argument
                print(result)
            except ValueError:
                print("Invalid date format. Please enter a valid date in YYYY-MM-DD format.")

        # Handle queries about transactions by location
        elif "transaction" in user_input and "location" in user_input:
            location = input("Which location are you asking about?: ")
            if location in df['Location'].values:
                location_summary = df[df['Location'] == location][['Date', 'Value', 'Transaction_count']].to_string(index=False)
                print(f"Transactions in {location}:\n{location_summary}")
            else:
                print(f"No transactions found for the location: {location}.")

        # Handle queries about transactions by domain
        elif "transaction" in user_input and "domain" in user_input:
            domain = input("Which domain are you asking about (e.g., International, Retail)?: ")
            result = summarize_by_domain(df, domain)
            print(result)

        # Handle requests for transactions above a certain amount
        elif "above" in user_input and "amount" in user_input:
            try:
                amount = float(input("Enter the amount threshold: "))
                result = transactions_above_amount(df, amount)
                print(result)
            except ValueError:
                print("Invalid amount. Please enter a numeric value.")

        # Handle queries about top N transactions
        elif "top" in user_input and "transactions" in user_input:
            try:
                n = int(input("How many top transactions do you want to see?: "))
                top_transactions = df.nlargest(n, 'Value')[['Date', 'Location', 'Domain', 'Value', 'Transaction_count']]
                print(f"Top {n} largest transactions:\n", top_transactions.to_string(index=False))
            except ValueError:
                print("Invalid input. Please enter a valid number.")

        # Handle queries for transactions within a date range
        elif "between" in user_input and "dates" in user_input:
            start_date = input("Enter the start date (YYYY-MM-DD format): ")
            end_date = input("Enter the end date (YYYY-MM-DD format): ")
            try:
                start_date = pd.to_datetime(start_date)
                end_date = pd.to_datetime(end_date)
                result = summarize_by_date_range(df, start_date, end_date)  # Pass df argument
                print(result)
            except ValueError:
                print("Invalid date format. Please enter dates in YYYY-MM-DD format.")

        # General help message if query is unclear
        else:
            print("I can help you with questions like:\n"
                  "- 'What is the total transaction value on a specific date?'\n"
                  "- 'How many transactions occurred in a specific location?'\n"
                  "- 'Show me the top N largest transactions.'\n"
                  "- 'Show me transactions above a certain amount.'\n"
                  "- 'What are the transactions between two dates?'")

chatbot(df)

Welcome to the Banking Transactions Chatbot!
Ask me questions about transaction data.

Ask a question (or type 'exit' to stop): Show me the top N largest transactions
How many top transactions do you want to see?: 3
Top 3 largest transactions:
       Date Location        Domain   Value  Transaction_count
2022-03-24     Pune        PUBLIC 1202271                983
2022-07-20  Vellore     RESTRAUNT 1202271               1482
2022-10-10   Kochin INTERNATIONAL 1202269               2548

Ask a question (or type 'exit' to stop): Show me transactions above a certain amount
Enter the amount threshold: 20000000


NameError: name 'transactions_above_amount' is not defined