In [5]:
import pandas as pd

In [6]:
# Load the raw transaction export and print a schema summary (column names,
# non-null counts, dtypes) so the data can be checked before any cleaning.
df = pd.read_csv(filepath_or_buffer='user_transactions.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5947 entries, 0 to 5946
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id_              5947 non-null   int64 
 1   t_date           5947 non-null   object
 2   t_type           5947 non-null   object
 3   t_amt            5947 non-null   int64 
 4   has_credit_card  5947 non-null   object
 5   account_type     5947 non-null   object
dtypes: int64(2), object(4)
memory usage: 278.9+ KB


In [7]:
# Count missing entries per column (isna() is the same check as isnull()).
null_counts = df.isna().sum()
print("Null Value Counts:", null_counts, sep="\n")

# Discard every row that has at least one missing field.
df = df.dropna(axis=0, how="any")

# Show what remains after the filter.
print("\nDataFrame after removing rows with null values:", df, sep="\n")

Null Value Counts:
id_                0
t_date             0
t_type             0
t_amt              0
has_credit_card    0
account_type       0
dtype: int64

DataFrame after removing rows with null values:
      id_      t_date   t_type  t_amt has_credit_card account_type
0     312  20-01-2020  deposit    485             Yes      Savings
1     376  03-01-2020  deposit    706              No      Current
2     188  13-01-2020  deposit    601              No      Savings
3     138  11-01-2020  deposit    520              No       Salary
4     373  18-01-2020  deposit    596              No       Salary
...   ...         ...      ...    ...             ...          ...
5942  281  06-01-2020  deposit    616             Yes      Current
5943  391  15-01-2020  deposit    219             Yes      Current
5944   45  14-01-2020  deposit    650              No      Current
5945   49  04-01-2020  deposit    432              No      Savings
5946  473  17-01-2020  deposit    657              No   

In [8]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
duplicate_count = df.duplicated(keep="first").sum()
print("Duplicate Rows Count:", duplicate_count)

# Keep only the first occurrence of each distinct row.
df = df.drop_duplicates(keep="first")

# Show the de-duplicated frame.
print("\nDataFrame after removing duplicates:", df, sep="\n")

Duplicate Rows Count: 79

DataFrame after removing duplicates:
      id_      t_date   t_type  t_amt has_credit_card account_type
0     312  20-01-2020  deposit    485             Yes      Savings
1     376  03-01-2020  deposit    706              No      Current
2     188  13-01-2020  deposit    601              No      Savings
3     138  11-01-2020  deposit    520              No       Salary
4     373  18-01-2020  deposit    596              No       Salary
...   ...         ...      ...    ...             ...          ...
5863  155  10-01-2020  deposit    712             Yes      Savings
5864  398  01-01-2020  deposit    196              No      Current
5865  255  14-01-2020  deposit    563             Yes      Savings
5866  185  29-01-2020  deposit    626             Yes      Savings
5867  309  13-01-2020  deposit    995             Yes      Savings

[5868 rows x 6 columns]


In [9]:
# Data Cleaning
# Step 1: Eliminate the columns that are not kept in the cleaned output.
# errors="ignore" keeps this cell re-runnable: `df` is reassigned below, so on a
# second execution these columns are already gone and a plain drop() would
# raise KeyError.
columns_to_remove = ["has_credit_card", "account_type"]
df = df.drop(columns=columns_to_remove, errors="ignore")

# Step 2: Rename the abbreviated columns to descriptive names.
# rename() skips mapping keys that are absent by default, so re-running is safe here too.
df = df.rename(columns={
    "id_": "consumer_id",
    "t_date": "transaction_date",
    "t_type": "transaction_type",
    "t_amt": "transaction_amount"
})

# Step 3: Save the cleaned DataFrame to a CSV file without the index column
# NOTE(review): the output name uses singular "transaction" while the input file
# is 'user_transactions.csv' — confirm this is intended.
df.to_csv("user_transaction_cleaned.csv", index=False)

# Print the cleaned DataFrame for a visual check (nothing is returned from a cell)
print("Cleaned DataFrame:")
print(df)

print("\nData has been cleaned and saved as 'user_transaction_cleaned.csv'")

Cleaned DataFrame:
      consumer_id transaction_date transaction_type  transaction_amount
0             312       20-01-2020          deposit                 485
1             376       03-01-2020          deposit                 706
2             188       13-01-2020          deposit                 601
3             138       11-01-2020          deposit                 520
4             373       18-01-2020          deposit                 596
...           ...              ...              ...                 ...
5863          155       10-01-2020          deposit                 712
5864          398       01-01-2020          deposit                 196
5865          255       14-01-2020          deposit                 563
5866          185       29-01-2020          deposit                 626
5867          309       13-01-2020          deposit                 995

[5868 rows x 4 columns]

Data has been cleaned and saved as 'user_transaction_cleaned.csv'
