In [None]:
# --- 1. LOAD DATASET ---
print("--- Section 1: Loading Dataset ---")

# Path of the CSV to analyse; point this at the real dataset before running.
DATASET_FILE = "your_dataset.csv"

# Load the whole file into a DataFrame and report its size.
df = pd.read_csv(DATASET_FILE)
print(f"Loaded dataset: {df.shape[0]} rows, {df.shape[1]} columns")

# Parse login times when the column is present; values that cannot be parsed
# become NaT (errors='coerce') instead of raising.
if 'LoginTimestamp' in df.columns:
    df = df.assign(
        LoginTimestamp=pd.to_datetime(df['LoginTimestamp'], errors='coerce')
    )

# Preview the first five rows.
print("--- Data Head ---")
print(df.head(5))





# --- 2. FEATURE ENGINEERING ---
print("--- Section 2: Feature Engineering ---")

# IP Clustering
# Count the distinct users seen on each IP address and attach that count to
# every row as IPUserCount.
ip_counts = (
    df.groupby('IPAddress')['UserID']
    .nunique()
    .rename('IPUserCount')
    .reset_index()
)
df = df.merge(ip_counts, on='IPAddress', how='left')

# Geo-inconsistency
# Order each user's logins chronologically so that diff()/shift() below compare
# a login with that same user's previous login.
df = df.sort_values(by=['UserID', 'LoginTimestamp'])

# Hours elapsed since the user's previous login (NaN for a user's first login).
elapsed = df.groupby('UserID')['LoginTimestamp'].diff()
df['TimeDiff'] = elapsed.dt.total_seconds() / 3600

# Coordinates of the user's previous login (NaN for a user's first login).
for current_col, previous_col in (('Latitude', 'PrevLat'), ('Longitude', 'PrevLon')):
    df[previous_col] = df.groupby('UserID')[current_col].shift()

def calculate_haversine(lat1, lon1, lat2, lon2):
    """Return the haversine distance between two points, or 0 if any coordinate is missing.

    Args:
        lat1: Latitude of the first point.
        lon1: Longitude of the first point.
        lat2: Latitude of the second point.
        lon2: Longitude of the second point.

    Returns:
        Whatever ``haversine((lat1, lon1), (lat2, lon2))`` returns, or ``0``
        when any of the four coordinates is missing (``None``/NaN).
    """
    # A distance is undefined unless both endpoints are fully known; checking
    # all four values keeps NaN coordinates out of haversine().
    if any(pd.isna(value) for value in (lat1, lon1, lat2, lon2)):
        return 0
    return haversine((lat1, lon1), (lat2, lon2))

# Distance between each login and the same user's previous login, as returned
# by calculate_haversine.
df['Distance'] = df.apply(
    lambda row: calculate_haversine(
        row['PrevLat'], row['PrevLon'], row['Latitude'], row['Longitude']
    ),
    axis=1,
)

# A zero denominator makes Series.div produce +/-inf (or NaN for 0/0).
# fillna() only replaces NaN, so infinities are mapped to 0 explicitly;
# otherwise they reach the scaler/models and the 'Speed > 1000' rule.
# NOTE(review): this treats a same-timestamp login as "speed undefined" (0),
# the same way a user's first login is treated - confirm that is intended.
non_finite = [float('inf'), float('-inf')]

# Distance covered per hour since the previous login.
df['Speed'] = df['Distance'].div(df['TimeDiff']).replace(non_finite, 0).fillna(0)

# Behavioral Profiling
# Purchases per browsing event; 0 when there were no browsing events.
df['PurchaseToBrowseRatio'] = (
    df['Purchases'].div(df['BrowsingEvents']).replace(non_finite, 0).fillna(0)
)

# Device-switch anomalies
# Number of distinct device types each user has logged in from.
device_counts = df.groupby('UserID')['DeviceType'].nunique().reset_index()
device_counts.columns = ['UserID', 'DeviceCount']
df = pd.merge(df, device_counts, on='UserID', how='left')

# Label Encoding
# Replace the categorical columns with integer codes in place; a fresh encoder
# is fitted per column.
for col in ['DeviceType', 'Browser']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

print("Feature engineering complete. New features added: IPUserCount, Speed, PurchaseToBrowseRatio, DeviceCount")
print(df[['UserID', 'IPUserCount', 'Speed', 'PurchaseToBrowseRatio', 'DeviceCount']].head())


# --- 3. MACHINE LEARNING PIPELINE ---
print("--- Section 3: Machine Learning Pipeline ---")

# Label column and the model inputs (raw columns plus engineered features).
target = 'IsFake'
features = [
    'IPUserCount', 'SessionDuration', 'BrowsingEvents', 'Purchases',
    'AverageOrderValue', 'CartAbandonmentRate', 'Speed',
    'PurchaseToBrowseRatio', 'DeviceCount', 'DeviceType', 'Browser'
]

# Missing feature values are treated as 0.
X, y = df[features].fillna(0), df[target]

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Classification Models
# Three classifiers trained on the same scaled training set and scored on the
# same hold-out set so their metrics are directly comparable.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    # `use_label_encoder` is no longer passed: the recorded run reports it as
    # an unused parameter, so it only produced a warning.
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42)
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    # Second predict_proba column, used as the score for ROC-AUC
    # (presumably P(IsFake == 1) - confirm the label encoding).
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    print(f'--- {name} ---')
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
    print(f'Precision: {precision_score(y_test, y_pred):.4f}')
    print(f'Recall: {recall_score(y_test, y_pred):.4f}')
    print(f'F1 Score: {f1_score(y_test, y_pred):.4f}')
    print(f'ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}')

    # Confusion Matrix
    # Rendered as a heatmap and written to a PNG named after the model.
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Legit', 'Fake'], yticklabels=['Legit', 'Fake'])
    plt.title(f'{name} - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    cm_filename = f"confusion_matrix_{name.replace(' ', '_')}.png"
    plt.savefig(cm_filename)
    print(f"Saved {cm_filename}")
    # Close the figure so figures do not accumulate across models.
    plt.close()

# Anomaly Detection
# Unsupervised pass over every row of X; the IsFake labels are not used here.
iso_forest = IsolationForest(contamination='auto', random_state=42)
# X was NaN-filled when it was built, so it is passed through unchanged.
df['AnomalyScore_ISO'] = iso_forest.fit_predict(X)
# -1 is anomaly, 1 is normal. We map it to 1 for anomaly, 0 for normal
# with a vectorised comparison instead of a per-row lambda.
df['IsAnomaly_ISO'] = df['AnomalyScore_ISO'].eq(-1).astype(int)

print('--- Isolation Forest Anomaly Detection ---')
print(f"Detected {df['IsAnomaly_ISO'].sum()} anomalies out of {len(df)} records.")


# --- 4. FAKE USER DETECTION DASHBOARD (Script Version) ---
print("--- Section 4: Fake User Detection Dashboard ---")

# Rich Table for Suspicious Users
# Work on a copy of the rows flagged by the Isolation Forest.
suspicious_users = df[df['IsAnomaly_ISO'] == 1].copy()
suspicious_users['Reason'] = ''

# (column, threshold, label): every rule whose column exceeds its threshold
# appends its label to the row's Reason, so one row can carry several reasons.
reason_rules = [
    ('IPUserCount', 2, 'IP Cluster; '),
    ('Speed', 1000, 'Impossible Travel; '),
    ('Purchases', 10, 'Transaction Spike; '),
    ('DeviceCount', 2, 'Device Switch; '),
]
for column, threshold, label in reason_rules:
    suspicious_users.loc[suspicious_users[column] > threshold, 'Reason'] += label

table = Table(title="Top 10 Suspicious User Activities (Detected by Isolation Forest)")
table.add_column("Username", style="cyan")
table.add_column("IP Address", style="magenta")
table.add_column("Detected Reason(s)", style="green")

# Show the first ten flagged rows. Cells are converted with str() because
# add_row expects renderable values; a missing (NaN) or numeric cell is not one.
top_suspicious = suspicious_users.head(10)
for username, ip_address, reason in zip(
    top_suspicious['Username'], top_suspicious['IPAddress'], top_suspicious['Reason']
):
    table.add_row(str(username), str(ip_address), str(reason))

console = Console()
console.print(table)

# Generate Interactive Charts as HTML files
print("Generating interactive charts as HTML files...")

# Geo-map of suspicious accounts
# Marker sizes must be non-negative numbers, so plot from a copy in which
# missing AverageOrderValue entries are 0 and negatives are clipped to 0.
# suspicious_users itself is left untouched.
geo_points = suspicious_users.assign(
    AverageOrderValue=suspicious_users['AverageOrderValue'].fillna(0).clip(lower=0)
)
fig_map = px.scatter_geo(
    geo_points,
    lat='Latitude', lon='Longitude',
    color='Reason',
    hover_name='Username', size='AverageOrderValue',
    title='Suspicious Accounts Geo-Map (Anomalies)',
    projection="natural earth"
)
map_filename = "dashboard_geo_map.html"
fig_map.write_html(map_filename)
print(f"Saved {map_filename}")

# Anomaly scores distribution
# Grouped bars: anomaly flag (0/1) on the x-axis, split by the IsFake label.
fig_hist = px.histogram(df, x='IsAnomaly_ISO', color='IsFake', barmode='group', title='Anomaly Detection vs. True Labels')
hist_filename = "dashboard_histogram.html"
fig_hist.write_html(hist_filename)
print(f"Saved {hist_filename}")

# Timeline of user activity
# Plot at most 2000 rows to keep the timeline readable. The sample is seeded
# like the other random steps in this script so the chart is the same on
# every run.
timeline_sample = df.sample(n=min(2000, len(df)), random_state=42)
fig_timeline = px.scatter(
    timeline_sample,
    x='LoginTimestamp', y='Username', color='IsFake',
    title='User Activity Timeline (Sample)',
    labels={'Username': 'Users'}
)
fig_timeline.update_traces(marker=dict(size=5, opacity=0.7))
timeline_filename = "dashboard_timeline.html"
fig_timeline.write_html(timeline_filename)
print(f"Saved {timeline_filename}")

print("--- Project Execution Complete ---")

--- Section 1: Synthetic Data Generation ---
synthetic_ecommerce_data.csv already exists. Skipping generation.
--- Data Head ---
   UserID       Username       IPAddress DeviceType Browser  \
0     377  kimberlygreen  84.189.196.218    Desktop  Chrome   
1     398    smithnathan   11.210.138.98     Mobile  Chrome   
2      68      cameron56   216.50.219.65     Mobile  Safari   
3     200   douglasjones    68.38.34.245     Tablet  Chrome   
4     151         lwalsh    21.48.237.85     Mobile  Chrome   

       LoginTimestamp       City  Country  Latitude  Longitude  \
0 2025-01-30 10:51:24     Berlin  Germany   52.5200    13.4050   
1 2025-08-04 14:47:10      Paris   France   48.8566     2.3522   
2 2025-07-12 02:56:58  São Paulo   Brazil  -23.5505   -46.6333   
3 2025-07-16 15:02:19      Tokyo    Japan   35.6895   139.6917   
4 2024-09-02 00:04:20      Tokyo    Japan   35.6895   139.6917   

   FailedLoginAttempts  Purchases  AverageOrderValue  CartAbandonmentRate  \
0                 


Parameters: { "use_label_encoder" } are not used.




Saved confusion_matrix_XGBoost.png
--- Isolation Forest Anomaly Detection ---
Detected 598 anomalies out of 5000 records.
--- Section 4: Fake User Detection Dashboard ---


Generating interactive charts as HTML files...
Saved dashboard_geo_map.html
Saved dashboard_histogram.html
Saved dashboard_timeline.html
--- Project Execution Complete ---


In [10]:
import webbrowser
import os
from pathlib import Path

# Open the saved geo-map in the default browser. Path.as_uri() builds a
# well-formed file URI (correct slashes, percent-encoded characters), which
# plain 'file://' + path does not do for Windows paths or paths with spaces.
file_path = os.path.abspath('dashboard_geo_map.html')
webbrowser.open(Path(file_path).as_uri())


True

In [11]:
import webbrowser
import os
from pathlib import Path

# Open the saved timeline in the default browser. Path.as_uri() builds a
# well-formed file URI (correct slashes, percent-encoded characters), which
# plain 'file://' + path does not do for Windows paths or paths with spaces.
file_path = os.path.abspath('dashboard_timeline.html')
webbrowser.open(Path(file_path).as_uri())

True

In [12]:
import webbrowser
import os
from pathlib import Path

# Open the saved histogram in the default browser. Path.as_uri() builds a
# well-formed file URI (correct slashes, percent-encoded characters), which
# plain 'file://' + path does not do for Windows paths or paths with spaces.
file_path = os.path.abspath('dashboard_histogram.html')
webbrowser.open(Path(file_path).as_uri())

True