In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("CreditCardData.csv")

In [3]:
df.head(20)

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1
5,#3446 698,13-Oct-20,Tuesday,20,MasterCard,Tap,£30,POS,Children,India,India,India,M,48.4,Monzo,0
6,#3652 191,13-Oct-20,Tuesday,18,Visa,CVC,£231,Online,Children,United Kingdom,United Kingdom,United Kingdom,M,39.5,Barclays,0
7,#3161 927,13-Oct-20,Tuesday,18,MasterCard,CVC,£154,Online,Services,USA,USA,United Kingdom,M,37.8,HSBC,0
8,#3025 809,13-Oct-20,Tuesday,23,MasterCard,PIN,£39,ATM,Fashion,Russia,Russia,United Kingdom,F,43.3,Metro,0
9,#3413 696,14-Oct-20,Wednesday,23,MasterCard,Tap,£17,POS,Entertainment,India,India,India,M,69.9,Barlcays,0


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Fill missing values in the 'Merchant Group' column with a placeholder
df['Merchant Group'].fillna('Unknown', inplace=True)

# Filter top 2000 records
df_top_2000 = df.head(2000)

# Filter transactions with merchant group 'Entertainment'
entertainment_transactions = df_top_2000[df_top_2000['Merchant Group'] == 'Entertainment']
df_top_2000

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,#2687 300,14-Oct-20,Wednesday,23,MasterCard,PIN,£21,ATM,Products,China,China,China,F,39.6,Halifax,0
1996,#2867 705,14-Oct-20,Wednesday,23,Visa,PIN,£22,ATM,Electronics,United Kingdom,United Kingdom,United Kingdom,M,47.1,Barlcays,0
1997,#2988 629,14-Oct-20,Wednesday,10,Visa,Tap,£9,POS,Products,United Kingdom,United Kingdom,United Kingdom,F,32.3,Barclays,0
1998,#2635 065,14-Oct-20,Wednesday,19,Visa,PIN,£22,ATM,Subscription,USA,United Kingdom,USA,F,58.6,Barclays,0


In [6]:
# Vectorize the text data
vectorizer = TfidfVectorizer()
merchant_group_vectors = vectorizer.fit_transform(df_top_2000['Merchant Group'])

# Calculate cosine similarity
cosine_sim = cosine_similarity(merchant_group_vectors, merchant_group_vectors)

# Get indices of 'Entertainment' transactions
entertainment_indices = entertainment_transactions.index

# Find similar transactions
similar_transactions = []
for idx in entertainment_indices:
    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similar_indices = [x[0] for x in similarity_scores_sorted[1:11]]  # Exclude the transaction itself
    similar_transactions.extend(similar_indices)


In [7]:
# Remove duplicate indices and keep only unique ones
similar_transactions = list(set(similar_transactions))

# Print details of similar transactions
print("10 similar transactions based on merchant group 'Entertainment':")
for idx in similar_transactions:
    print(df_top_2000.iloc[idx])


10 similar transactions based on merchant group 'Entertainment':
Transaction ID                 #2973 712
Date                           14-Oct-20
Day of Week                    Wednesday
Time                                  13
Type of Card                  MasterCard
Entry Mode                           CVC
Amount                              £296
Type of Transaction               Online
Merchant Group             Entertainment
Country of Transaction    United Kingdom
Shipping Address          United Kingdom
Country of Residence      United Kingdom
Gender                                 F
Age                                 41.8
Bank                            Barlcays
Fraud                                  0
Name: 65, dtype: object
Transaction ID                 #2640 960
Date                           13-Oct-20
Day of Week                      Tuesday
Time                                  14
Type of Card                        Visa
Entry Mode                           Tap
Amount   

In [19]:
# Fill missing values in the 'Entry Mode' column with a placeholder
df['Entry Mode'].fillna('Unknown', inplace=True)

# Filter top 2000 records
df_top_2000 = df.head(2000)

# Filter transactions with merchant group 'CVC'
cvc_transactions = df_top_2000[df_top_2000['Entry Mode'] == 'CVC']
df_top_2000

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,#2687 300,14-Oct-20,Wednesday,23,MasterCard,PIN,£21,ATM,Products,China,China,China,F,39.6,Halifax,0
1996,#2867 705,14-Oct-20,Wednesday,23,Visa,PIN,£22,ATM,Electronics,United Kingdom,United Kingdom,United Kingdom,M,47.1,Barlcays,0
1997,#2988 629,14-Oct-20,Wednesday,10,Visa,Tap,£9,POS,Products,United Kingdom,United Kingdom,United Kingdom,F,32.3,Barclays,0
1998,#2635 065,14-Oct-20,Wednesday,19,Visa,PIN,£22,ATM,Subscription,USA,United Kingdom,USA,F,58.6,Barclays,0


In [20]:
# Vectorize the text data
vectorizer = TfidfVectorizer()
entry_group_vectors = vectorizer.fit_transform(df_top_2000['Entry Mode'])

# Calculate cosine similarity
cosine_sim = cosine_similarity(entry_group_vectors, entry_group_vectors)

# Get indices of 'CVC' transactions
cvc_indices = cvc_transactions.index

# Find similar transactions
similar_transactions = []
for idx in cvc_indices:
    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similar_indices = [x[0] for x in similarity_scores_sorted[1:11]]  # Exclude the transaction itself
    similar_transactions.extend(similar_indices)

In [21]:
# Remove duplicate indices and keep only unique ones
similar_transactions = list(set(similar_transactions))

# Print details of similar transactions
print("10 similar transactions based on a credit/debit entry mode 'CVC':")
for idx in similar_transactions:
    print(df_top_2000.iloc[idx])

10 similar transactions based on a credit/debit entry mode 'CVC':
Transaction ID                 #3652 191
Date                           13-Oct-20
Day of Week                      Tuesday
Time                                  18
Type of Card                        Visa
Entry Mode                           CVC
Amount                              £231
Type of Transaction               Online
Merchant Group                  Children
Country of Transaction    United Kingdom
Shipping Address          United Kingdom
Country of Residence      United Kingdom
Gender                                 M
Age                                 39.5
Bank                            Barclays
Fraud                                  0
Name: 6, dtype: object
Transaction ID                 #3161 927
Date                           13-Oct-20
Day of Week                      Tuesday
Time                                  18
Type of Card                  MasterCard
Entry Mode                           CVC
Amount   

In [22]:
# Fill missing values in the 'Type of Transaction' column with a placeholder
df['Type of Transaction'].fillna('Unknown', inplace=True)

# Filter top 2000 records
df_top_2000 = df.head(2000)

# Filter transactions with type of transaction 'Online'
online_transactions = df_top_2000[df_top_2000['Type of Transaction'] == 'Online']
df_top_2000

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,#2687 300,14-Oct-20,Wednesday,23,MasterCard,PIN,£21,ATM,Products,China,China,China,F,39.6,Halifax,0
1996,#2867 705,14-Oct-20,Wednesday,23,Visa,PIN,£22,ATM,Electronics,United Kingdom,United Kingdom,United Kingdom,M,47.1,Barlcays,0
1997,#2988 629,14-Oct-20,Wednesday,10,Visa,Tap,£9,POS,Products,United Kingdom,United Kingdom,United Kingdom,F,32.3,Barclays,0
1998,#2635 065,14-Oct-20,Wednesday,19,Visa,PIN,£22,ATM,Subscription,USA,United Kingdom,USA,F,58.6,Barclays,0


In [36]:
# Vectorize the text data
vectorizer = TfidfVectorizer()
transaction_group_vectors = vectorizer.fit_transform(df_top_2000['Type of Transaction'])

# Calculate cosine similarity
cosine_sim = cosine_similarity(transaction_group_vectors, transaction_group_vectors)

# Get indices of 'Online' transactions
online_indices = online_transactions.index

# Find similar transactions
similar_transactions = []
for idx in online_indices:
    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similar_indices = [x[0] for x in similarity_scores_sorted[1:11]]  # Exclude the transaction itself
    similar_transactions.extend(similar_indices)

In [37]:
# Remove duplicate indices and keep only unique ones
similar_transactions = list(set(similar_transactions))

# Print details of similar transactions
print("10 similar transactions based on the type of transaction 'Online':")
for idx in similar_transactions:
    print(df_top_2000.iloc[idx])

10 similar transactions based on the type of transaction 'Online':
Transaction ID                 #3652 191
Date                           13-Oct-20
Day of Week                      Tuesday
Time                                  18
Type of Card                        Visa
Entry Mode                           CVC
Amount                              £231
Type of Transaction               Online
Merchant Group                  Children
Country of Transaction    United Kingdom
Shipping Address          United Kingdom
Country of Residence      United Kingdom
Gender                                 M
Age                                 39.5
Bank                            Barclays
Fraud                                  0
Name: 6, dtype: object
Transaction ID                 #3161 927
Date                           13-Oct-20
Day of Week                      Tuesday
Time                                  18
Type of Card                  MasterCard
Entry Mode                           CVC
Amount  

In [27]:
# Fill missing values in the 'Type of Transaction' column with a placeholder
df['Type of Card'].fillna('Unknown', inplace=True)

# Filter top 2000 records
df_top_2000 = df.head(2000)

# Filter transactions with type of transaction 'Online'
card_transactions = df_top_2000[df_top_2000['Type of Card'] == 'MasterCard']
df_top_2000

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,#2687 300,14-Oct-20,Wednesday,23,MasterCard,PIN,£21,ATM,Products,China,China,China,F,39.6,Halifax,0
1996,#2867 705,14-Oct-20,Wednesday,23,Visa,PIN,£22,ATM,Electronics,United Kingdom,United Kingdom,United Kingdom,M,47.1,Barlcays,0
1997,#2988 629,14-Oct-20,Wednesday,10,Visa,Tap,£9,POS,Products,United Kingdom,United Kingdom,United Kingdom,F,32.3,Barclays,0
1998,#2635 065,14-Oct-20,Wednesday,19,Visa,PIN,£22,ATM,Subscription,USA,United Kingdom,USA,F,58.6,Barclays,0


In [30]:
# Vectorize the text data
vectorizer = TfidfVectorizer()
card_group_vectors = vectorizer.fit_transform(df_top_2000['Type of Card'])

# Calculate cosine similarity
cosine_sim = cosine_similarity(card_group_vectors, card_group_vectors)

# Get indices of 'MasterCard' transactions
card_indices = card_transactions.index

# Find similar transactions
similar_transactions = []
for idx in card_indices:
    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similar_indices = [x[0] for x in similarity_scores_sorted[1:11]]  # Exclude the transaction itself
    similar_transactions.extend(similar_indices)

In [32]:
# Remove duplicate indices and keep only unique ones
similar_transactions = list(set(similar_transactions))

# Print details of similar transactions
print("10 similar transactions based on the type of card 'MasterCard':")
for idx in similar_transactions:
    print(df_top_2000.iloc[idx])

10 similar transactions based on the type of card 'MasterCard':
Transaction ID             #3446 698
Date                       13-Oct-20
Day of Week                  Tuesday
Time                              20
Type of Card              MasterCard
Entry Mode                       Tap
Amount                           £30
Type of Transaction              POS
Merchant Group              Children
Country of Transaction         India
Shipping Address               India
Country of Residence           India
Gender                             M
Age                             48.4
Bank                           Monzo
Fraud                              0
Name: 5, dtype: object
Transaction ID                 #3161 927
Date                           13-Oct-20
Day of Week                      Tuesday
Time                                  18
Type of Card                  MasterCard
Entry Mode                           CVC
Amount                              £154
Type of Transaction               

In [33]:
# Fill missing values in the 'Type of Transaction' column with a placeholder
df['Type of Card'].fillna('Unknown', inplace=True)

# Filter top 2000 records
df_top_2000 = df.head(2000)

# Filter transactions with type of transaction 'Online'
card_transactions = df_top_2000[df_top_2000['Type of Card'] == 'Visa']
df_top_2000

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,#2687 300,14-Oct-20,Wednesday,23,MasterCard,PIN,£21,ATM,Products,China,China,China,F,39.6,Halifax,0
1996,#2867 705,14-Oct-20,Wednesday,23,Visa,PIN,£22,ATM,Electronics,United Kingdom,United Kingdom,United Kingdom,M,47.1,Barlcays,0
1997,#2988 629,14-Oct-20,Wednesday,10,Visa,Tap,£9,POS,Products,United Kingdom,United Kingdom,United Kingdom,F,32.3,Barclays,0
1998,#2635 065,14-Oct-20,Wednesday,19,Visa,PIN,£22,ATM,Subscription,USA,United Kingdom,USA,F,58.6,Barclays,0


In [34]:
# Vectorize the text data
vectorizer = TfidfVectorizer()
card_group_vectors = vectorizer.fit_transform(df_top_2000['Type of Card'])

# Calculate cosine similarity
cosine_sim = cosine_similarity(card_group_vectors, card_group_vectors)

# Get indices of 'Visa' transactions
card_indices = card_transactions.index

# Find similar transactions
similar_transactions = []
for idx in card_indices:
    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores_sorted = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similar_indices = [x[0] for x in similarity_scores_sorted[1:11]]  # Exclude the transaction itself
    similar_transactions.extend(similar_indices)

In [35]:
# Remove duplicate indices and keep only unique ones
similar_transactions = list(set(similar_transactions))

# Print details of similar transactions
print("10 similar transactions based on the type of card 'Visa':")
for idx in similar_transactions:
    print(df_top_2000.iloc[idx])

10 similar transactions based on the type of card 'Visa':
Transaction ID             #2694 780
Date                       14-Oct-20
Day of Week                Wednesday
Time                              14
Type of Card                    Visa
Entry Mode                       Tap
Amount                            £5
Type of Transaction              POS
Merchant Group            Restaurant
Country of Transaction         India
Shipping Address               India
Country of Residence           India
Gender                             F
Age                             42.2
Bank                        Barclays
Fraud                              0
Name: 2, dtype: object
Transaction ID                 #2640 960
Date                           13-Oct-20
Day of Week                      Tuesday
Time                                  14
Type of Card                        Visa
Entry Mode                           Tap
Amount                               £28
Type of Transaction                  POS