In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Jan-Dec Dataset.csv")

In [2]:
print(df.isnull().sum())

StartOfMonth                0
Season                      0
State                       0
Average State Income        0
Contact Method              0
Age Group                   0
Gender                      0
Scam Category               0
Scam Type                   0
Amount_lost             14039
Number_of_reports           0
dtype: int64


In [3]:
df["Amount_lost"] = df["Amount_lost"].fillna(0)

In [4]:
df = df.drop(columns=["Season"])

In [5]:
df = pd.get_dummies(df, columns=["State"], drop_first=True)

In [6]:
from sklearn.preprocessing import OrdinalEncoder
age_order = [["18 - 24", "25 - 34", "35 - 44", "45 - 54", "55 - 64", "65 and over", "Under 18"]]
encoder = OrdinalEncoder(categories=age_order)
df["Age Group"] = encoder.fit_transform(df[["Age Group"]])

In [7]:
contact_method_map = df["Contact Method"].value_counts(normalize=True).to_dict()
scam_cat_map = df["Scam Category"].value_counts(normalize=True).to_dict()
scam_type_map = df["Scam Type"].value_counts(normalize=True).to_dict()

In [8]:
df["Scam Category"] = df["Scam Category"].map(scam_cat_map)
df["Contact Method"] = df["Contact Method"].map(contact_method_map)
df["Scam Type"] = df["Scam Type"].map(scam_type_map)

In [9]:
df["Gender"] = df["Gender"].map({"Male": 0, "Female": 1})

In [10]:
df = df.astype({col: int for col in df.select_dtypes(include=["bool"]).columns})

In [11]:
df["Amount_lost"] = df["Amount_lost"].replace(r'[\$,]', '', regex=True).astype(float)

In [12]:
df = df.drop(columns=["Gender", "Scam Category", "StartOfMonth"])

In [13]:
X = df.drop(columns=["Number_of_reports"])
y = df["Number_of_reports"]

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
original_test = df.iloc[X_test.index].copy()

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
from sklearn.ensemble import GradientBoostingRegressor

In [18]:
gbr_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

In [19]:
gbr_model.fit(X_train, y_train)

In [20]:
y_pred = gbr_model.predict(X_test)

In [21]:
original_test["Predicted_Number_of_Reports"] = y_pred

In [22]:
original_test["Actual_Number_of_Reports"] = y_test.values
original_test["Predicted_Number_of_Reports"] = y_pred

In [23]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

Mean Squared Error: 60.12235979841036
R2 Score: 0.5658620980529085


In [24]:
original_test.to_csv("NEW_GBoost_Predicted.csv", index=False)

In [25]:
print("Contact Method Mapping:")
for method, val in contact_method_map.items():
    print(f"{val:.6f} → {method}")

Contact Method Mapping:
0.234301 → Email
0.203304 → Text message
0.196909 → Phone call
0.141753 → Social media/Online forums
0.099609 → Internet
0.062972 → Mobile apps
0.033085 → Mail
0.026024 → In person
0.001066 → unspecified
0.000977 → Fax


In [26]:
print("Scam Type Mapping:")
for scamtype, val in scam_type_map.items():
    print(f"{val:.6f} → {scamtype}")

Scam Type Mapping:
0.099654 → Phishing
0.098499 → Other
0.097611 → Online shopping
0.092726 → False billing
0.087752 → Identity theft
0.072875 → Classified
0.065103 → Hacking
0.057332 → Investment
0.045608 → Remote access
0.035438 → Dating and romance
0.033085 → Threats to life, arrest or other
0.032507 → Rebate
0.028244 → Jobs and employment
0.027267 → Travel, prizes and lottery
0.026557 → Overpayment
0.024203 → inheritance and unexpected money
0.023759 → Health and medical products
0.016653 → Ransomware and malware
0.016520 → Mobile premium services
0.008837 → Fake charity
0.005729 → Betting and sports investment
0.002132 → Pyramid schemes
0.001910 → Psychic and clairvoyant
