In [1]:
import pandas as pd

# Read the raw records from the CSV (path is relative to the working directory)
DATASET_PATH = "Jan-Dec Dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [7]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,StartOfMonth,Season,State,Average State Income,Contact Method,Age Group,Gender,Scam Category,Scam Type,Amount_lost,Number_of_reports
0,1/1/2024,Summer,South Australia,1776.9,Text message,Under 18,Female,Attempts to gain your personal information,Phishing,$3.50,2
1,1/3/2024,Autumn,Australian Capital Territory,2126.5,Text message,18 - 24,Female,Attempts to gain your personal information,Phishing,$7197.00,2
2,1/2/2024,Summer,Northern Territory,1845.8,Phone call,45 - 54,Female,Attempts to gain your personal information,Phishing,$450.00,3
3,1/5/2024,Autumn,Australian Capital Territory,2126.5,Phone call,18 - 24,Female,Attempts to gain your personal information,Phishing,$70000.00,2
4,1/6/2024,Winter,Australian Capital Territory,2126.5,Email,35 - 44,Female,Attempts to gain your personal information,Phishing,$24688.00,5


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Turn the categorical/text columns of `df` into numeric features.
# NOTE: this cell reassigns `df`, so it cannot be re-run on its own; a second
# run fails with KeyError on the already-dropped "Season" column. Re-run the
# load cell first.

# Drop unused column
df = df.drop(columns=["Season"])

# One-hot encode 'State' (drop_first=True leaves out the first state's indicator column)
df = pd.get_dummies(df, columns=["State"], drop_first=True)

# Ordinal encode Age Group: the position in age_order becomes the numeric code
age_order = [["Under 18", "18 - 24", "25 - 34", "35 - 44", "45 - 54", "55 - 64", "65 and over"]]
encoder = OrdinalEncoder(categories=age_order)
df["Age Group"] = encoder.fit_transform(df[["Age Group"]])

# Frequency-encode the remaining categorical columns: every label is replaced
# by its share of rows. The lookup tables are bound to names *before* the
# columns are overwritten, because the original labels are lost afterwards and
# the later "Mapping" cells print `contact_method_map` / `scam_type_map`.
# NOTE(review): the shares are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the encoding - confirm this is acceptable.
scam_category_map = df["Scam Category"].value_counts(normalize=True)
contact_method_map = df["Contact Method"].value_counts(normalize=True)
scam_type_map = df["Scam Type"].value_counts(normalize=True)

df["Scam Category"] = df["Scam Category"].map(scam_category_map)
df["Contact Method"] = df["Contact Method"].map(contact_method_map)
df["Scam Type"] = df["Scam Type"].map(scam_type_map)

# Encode Gender as 0/1
# NOTE(review): any label other than "Male"/"Female" becomes NaN here - confirm
# the column holds no other values, or handle them explicitly.
df["Gender"] = df["Gender"].map({"Male": 0, "Female": 1})

# Convert boolean columns (e.g. the one-hot indicators) to int; a no-op when there are none
df = df.astype({col: int for col in df.select_dtypes(include=["bool"]).columns})

# Clean Amount_lost: strip "$" and "," so the text can be parsed as float
df["Amount_lost"] = df["Amount_lost"].replace(r'[\$,]', '', regex=True).astype(float)

# Target is the report count; features are everything else except the
# StartOfMonth column.
TARGET_COL = "Number_of_reports"
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL, "StartOfMonth"])

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Keep an unscaled copy of the test features for the export step
X_test_original = X_test.copy()

# Standardise both splits with statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Fit a 100-tree random forest, then predict the held-out rows
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Append the true and predicted report counts to the unscaled test features
X_test_original = X_test_original.assign(
    Actual_Number_of_Reports=y_test.values,
    Predicted_Number_of_Reports=y_pred,
)

# Write the comparison table without the index column
X_test_original.to_csv("predicted_dataset.csv", index=False)

# Completion marker ("Sukses" is Indonesian for "success")
print("Sukses")

Sukses


In [11]:
# List every contact method next to its encoded value, one per line.
# Expects `contact_method_map` (label -> value) to be defined by an earlier
# cell; presumably the normalized value counts used to encode the column - confirm.
contact_method_lines = [
    f"{val:.6f} → {method}" for method, val in contact_method_map.items()
]
print("Contact Method Mapping:", *contact_method_lines, sep="\n")

Contact Method Mapping:
0.234301 → Email
0.203304 → Text message
0.196909 → Phone call
0.141753 → Social media/Online forums
0.099609 → Internet
0.062972 → Mobile apps
0.033085 → Mail
0.026024 → In person
0.001066 → unspecified
0.000977 → Fax


In [13]:
# List every scam type next to its encoded value, one per line.
# Expects `scam_type_map` (label -> value) to be defined by an earlier cell;
# presumably the normalized value counts used to encode the column - confirm.
scam_type_lines = [
    f"{val:.6f} → {scamtype}" for scamtype, val in scam_type_map.items()
]
print("Scam Type Mapping:", *scam_type_lines, sep="\n")

Scam Type Mapping:
0.099654 → Phishing
0.098499 → Other
0.097611 → Online shopping
0.092726 → False billing
0.087752 → Identity theft
0.072875 → Classified
0.065103 → Hacking
0.057332 → Investment
0.045608 → Remote access
0.035438 → Dating and romance
0.033085 → Threats to life, arrest or other
0.032507 → Rebate
0.028244 → Jobs and employment
0.027267 → Travel, prizes and lottery
0.026557 → Overpayment
0.024203 → inheritance and unexpected money
0.023759 → Health and medical products
0.016653 → Ransomware and malware
0.016520 → Mobile premium services
0.008837 → Fake charity
0.005729 → Betting and sports investment
0.002132 → Pyramid schemes
0.001910 → Psychic and clairvoyant
