In [4]:
import pandas as pd

# Location columns whose missing values are replaced with "Unknown".
cols_to_fill = [
    "sender_country",
    "receiver_country",
    "sender_municipality",
    "receiver_municipality",
]

# Values accepted in the transaction_type column.
valid_types = ["incoming", "outgoing"]

# All ten currencies accepted as valid for this data set.
valid_currencies = ["SEK", "USD", "EUR", "NOK", "DKK", "RMB", "ZAR", "GBP", "JPY", "ZMW"]


def clean_transactions(raw):
    """Return a cleaned copy of the raw transactions frame.

    The input frame is not modified. The steps, in order:

    * ``amount``: cast to str, strip space characters, convert to float.
    * ``transaction_id``: cast to str.
    * ``timestamp``: parse to datetime; unparsable values become NaT.
    * ``notes``: missing values become "No note".
    * columns in ``cols_to_fill``: missing values become "Unknown".
    * add boolean flag columns ``suspicious_amount`` (amount <= 0 or
      amount > 1 000 000), ``invalid_transaction_type`` and
      ``invalid_currency`` (value not in the corresponding allow-list).
    * ``sender_account`` / ``receiver_account``: cast to str and strip
      surrounding whitespace.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must contain every column named above, plus ``currency`` and
        ``transaction_type``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the three flag
        columns.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If an amount cannot be converted to float after removing spaces.
    """
    cleaned = raw.copy()

    # === 2. Type conversion ===
    # regex=False makes the literal-space replacement explicit; the default
    # of `regex` differs between pandas versions.
    cleaned["amount"] = (
        cleaned["amount"].astype(str).str.replace(" ", "", regex=False).astype(float)
    )
    cleaned["transaction_id"] = cleaned["transaction_id"].astype(str)
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], errors="coerce")

    # === 3. Missing values ===
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the in-place form operates on a column selection, is deprecated in
    # pandas 2.x, and leaves the frame unchanged under Copy-on-Write.
    cleaned["notes"] = cleaned["notes"].fillna("No note")
    for column in cols_to_fill:
        cleaned[column] = cleaned[column].fillna("Unknown")

    # === 4. Invalid values ===
    # A missing (NaN) amount compares False on both sides and is not flagged.
    cleaned["suspicious_amount"] = (cleaned["amount"] <= 0) | (cleaned["amount"] > 1_000_000)
    cleaned["invalid_transaction_type"] = ~cleaned["transaction_type"].isin(valid_types)
    cleaned["invalid_currency"] = ~cleaned["currency"].isin(valid_currencies)

    # === 5. Account formatting ===
    # NOTE(review): astype(str) turns a missing account into the literal
    # string "nan" - confirm whether the source data can contain missing
    # accounts before relying on these columns as identifiers.
    cleaned["sender_account"] = cleaned["sender_account"].astype(str).str.strip()
    cleaned["receiver_account"] = cleaned["receiver_account"].astype(str).str.strip()

    return cleaned


# The guard is always true when this runs as a notebook cell (or as a script),
# so `file_path` and `df` are still defined as globals for later cells; it only
# keeps the file I/O from running if the code is imported.
if __name__ == "__main__":
    # === 1. Read the CSV ===
    file_path = r"/Bank_transntioner/data/transactions.csv"
    df = clean_transactions(pd.read_csv(file_path))

    # === 6. Summary (optional, for display) ===
    print("Antal transaktioner:", len(df))
    print("Saknade timestamps:", df['timestamp'].isnull().sum())
    print("Misstänkta belopp:", df['suspicious_amount'].sum())
    print("Ogiltiga transaction_type:", df['invalid_transaction_type'].sum())
    print("Ogiltiga valutor:", df['invalid_currency'].sum())

    # === 7. Save to a new CSV ===
    df.to_csv("transactions_cleaned.csv", index=False)
    print(" Rensad fil sparad som 'transactions_cleaned.csv'")



Antal transaktioner: 100000
Saknade timestamps: 0
Misstänkta belopp: 0
Ogiltiga transaction_type: 0
Ogiltiga valutor: 0
 Rensad fil sparad som 'transactions_cleaned.csv'


In [5]:
import pandas as pd

# Data-quality flag columns added by the cleaning cell; these are the three
# trailing columns of transactions_cleaned.csv.
FLAG_COLUMNS = ["suspicious_amount", "invalid_transaction_type", "invalid_currency"]


def drop_flag_columns(frame):
    """Return a copy of ``frame`` without the data-quality flag columns.

    The columns are removed by name rather than by position
    (``iloc[:, :-3]``), so the result does not depend on column order and
    running the step on an already trimmed frame removes nothing further.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that may contain any of the columns in ``FLAG_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        New frame with every other column kept in its original order.
    """
    # errors="ignore" skips flag columns that are absent instead of raising.
    return frame.drop(columns=FLAG_COLUMNS, errors="ignore")


# The guard is always true when this runs as a notebook cell, so
# `transactions_cleaned` is still defined as a global for later cells.
if __name__ == "__main__":
    # Read the file exactly where the cleaning cell saves it (the working
    # directory). Reading 'data/transactions_cleaned.csv' depended on a copy
    # placed there by hand, which breaks Restart & Run All.
    transactions_cleaned = pd.read_csv('transactions_cleaned.csv')

    # Show the columns before the change (optional)
    print("Kolumner före borttagning:")
    print(transactions_cleaned.columns)

    # Remove the three flag columns
    transactions_cleaned = drop_flag_columns(transactions_cleaned)

    # Show the columns after the change (optional)
    print("Kolumner efter borttagning:")
    print(transactions_cleaned.columns)

    # Save the trimmed DataFrame to a new file (optional)
    transactions_cleaned.to_csv('transactions_cleaned_trimmed.csv', index=False)


Kolumner före borttagning:
Index(['transaction_id', 'timestamp', 'amount', 'currency', 'sender_account',
       'receiver_account', 'sender_country', 'sender_municipality',
       'receiver_country', 'receiver_municipality', 'transaction_type',
       'notes', 'suspicious_amount', 'invalid_transaction_type',
       'invalid_currency'],
      dtype='object')
Kolumner efter borttagning:
Index(['transaction_id', 'timestamp', 'amount', 'currency', 'sender_account',
       'receiver_account', 'sender_country', 'sender_municipality',
       'receiver_country', 'receiver_municipality', 'transaction_type',
       'notes'],
      dtype='object')
