In [3]:
import csv
import random
import string

# --- Configuration ---
FILE_NAME = 'fake_names.csv'
NUM_ROWS_TO_GENERATE = 20000
FAKE_TITLES = ['Mr', 'Ms', 'Mrs', 'Mstr']
# --- End Configuration ---

def generate_fake_name(rng=None):
    """
    Generates a single 'fake' name string.

    The name starts as 4-10 random lowercase ASCII letters. Characters are
    then *overwritten in place* (nothing is inserted), so the final length
    always equals the initial length:
    - Numbers: 30% chance to overwrite 1 or 2 positions with a digit
    - Symbols: 25% chance to overwrite 1 position with one of '!@#$%_&'
    - Irregular capitalization: 40% chance to upper-case up to 3 randomly
      chosen positions (positions holding a digit/symbol are skipped);
      otherwise only the first character is upper-cased, if it is a letter

    Positions are drawn independently, so a later step may overwrite the
    result of an earlier one (e.g. the symbol can replace a digit).

    Args:
        rng: Optional source of randomness exposing randint(), random(),
            choice() and choices() -- e.g. random.Random(seed). When omitted,
            the module-level `random` generator is used, exactly as before.

    Returns:
        The generated name as a str.
    """
    if rng is None:
        rng = random

    # Start with a base of random lowercase letters
    length = rng.randint(4, 10)
    name_chars = rng.choices(string.ascii_lowercase, k=length)

    # 1. Numbers (30% chance): overwrite 1 or 2 positions with a digit
    if rng.random() < 0.3:
        for _ in range(rng.randint(1, 2)):
            # Draw the position before the digit to keep the random stream
            # in the same order as the original implementation.
            pos = rng.randint(0, length - 1)
            name_chars[pos] = rng.choice(string.digits)

    # 2. Symbols (25% chance): overwrite one position with a symbol
    if rng.random() < 0.25:
        # Only use a limited set of symbols
        symbols = '!@#$%_&'
        pos = rng.randint(0, length - 1)
        name_chars[pos] = rng.choice(symbols)

    # 3. Irregular capitalization (40% chance)
    if rng.random() < 0.4:
        for _ in range(rng.randint(1, 3)):
            pos = rng.randint(0, length - 1)
            # Only letters can be capitalized; digits/symbols are left alone
            if name_chars[pos] in string.ascii_lowercase:
                name_chars[pos] = name_chars[pos].upper()
    elif name_chars[0] in string.ascii_lowercase:
        # Default to normal title case when no irregular caps were applied
        # (and the name doesn't start with a number/symbol)
        name_chars[0] = name_chars[0].upper()

    return "".join(name_chars)

def create_fake_name_csv(file_name=None, num_rows=None):
    """
    Creates a CSV file with fake passenger data.

    A header row (Title, FirstName, MiddleName, LastName, is_real) is written
    first. Every data row then holds a random title from FAKE_TITLES, a first
    and a last name from generate_fake_name(), an empty MiddleName, and
    is_real = 0.

    Args:
        file_name: Destination path. Defaults to FILE_NAME.
        num_rows: Number of data rows to write. Defaults to
            NUM_ROWS_TO_GENERATE.

    Raises:
        OSError: If the file cannot be opened or written. The message is
            printed and the error re-raised, so a failed write stops the run
            instead of letting later cells load a missing or stale file.
    """
    # Resolve defaults at call time so edits to the config constants are honored.
    if file_name is None:
        file_name = FILE_NAME
    if num_rows is None:
        num_rows = NUM_ROWS_TO_GENERATE

    try:
        with open(file_name, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)

            # Write the header row
            writer.writerow(['Title', 'FirstName', 'MiddleName', 'LastName', 'is_real'])

            # Write the data rows
            for _ in range(num_rows):
                title = random.choice(FAKE_TITLES)
                first_name = generate_fake_name()
                last_name = generate_fake_name()

                # MiddleName is always empty; is_real is always 0 for fake rows
                writer.writerow([title, first_name, '', last_name, 0])
    except OSError as e:
        print(f"An error occurred: {e}")
        raise

    print(f"Successfully generated {num_rows} fake names in '{file_name}'")
    print("You can now load this file along with your 'real names' file.")

# Run the function to create the file.
# The guard holds when this cell is executed directly (the cell output shows
# the success message), and keeps the CSV from being regenerated if this code
# is ever imported as a module instead.
if __name__ == "__main__":
    create_fake_name_csv()

Successfully generated 20000 fake names in 'fake_names.csv'
You can now load this file along with your 'real names' file.


In [6]:
!pip install openpyxl



In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# STEP 1: LOAD AND PREPARE DATA
# ============================================================================

print("=" * 70, "STEP 1: Loading and Preparing Data", "=" * 70, sep="\n")

# Real names come from the Excel workbook; label every one of them is_real = 1
real_names_df = pd.read_excel('Passenger Data A-S.xlsx', sheet_name='Sheet1')
print(f"\nReal names loaded: {real_names_df.shape[0]} records")
real_names_df['is_real'] = 1

# Fake names come from the generated CSV, which already carries is_real = 0
fake_names_df = pd.read_csv('fake_names.csv')
print(f"Fake names loaded: {fake_names_df.shape[0]} records")

# Stack both sources, then shuffle (fixed seed) so real and fake rows are mixed
combined_df = (
    pd.concat([real_names_df, fake_names_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(f"\nCombined dataset: {combined_df.shape[0]} total records")
print("Data shuffled successfully")

# Report how many rows fall in each class
print(f"\nClass Distribution:")
print(f"  Real names (is_real=1): {combined_df['is_real'].eq(1).sum()}")
print(f"  Fake names (is_real=0): {combined_df['is_real'].eq(0).sum()}")

# ============================================================================
# STEP 2: FEATURE ENGINEERING
# ============================================================================

print("\n" + "=" * 70, "STEP 2: Feature Engineering", "=" * 70, sep="\n")

# Work on a separate frame holding only the raw text columns. Missing values
# are replaced up front (and everything is cast to str) so the string
# operations below never see NaN.
features = combined_df[['Title', 'FirstName', 'LastName']].assign(
    Title=lambda d: d['Title'].fillna('Unknown').astype(str),
    FirstName=lambda d: d['FirstName'].fillna('').astype(str),
    LastName=lambda d: d['LastName'].fillna('').astype(str),
)

print("\nCreating name-based features...")

# --- FirstName Features ---
# Length, plus 0/1 flags for "contains a digit" and "contains one of !@#$%_&"
features = features.assign(
    fn_length=lambda d: d['FirstName'].str.len(),
    fn_has_number=lambda d: d['FirstName'].str.contains(r'\d', regex=True).astype(int),
    fn_has_symbol=lambda d: d['FirstName'].str.contains(r'[!@#$%_&]', regex=True).astype(int),
)

# Vowel ratio for FirstName
def calculate_vowel_ratio(name):
    """Return the share of characters in `name` that are a/e/i/o/u.

    The count is case-insensitive and is divided by the length of `name`.
    An empty name yields 0.
    """
    total = len(name)
    if not total:
        return 0
    lowered = name.lower()
    vowel_count = sum(lowered.count(vowel) for vowel in 'aeiou')
    return vowel_count / total

# Apply the helper to every first name. FirstName was NaN-filled and cast to
# str earlier in this step, so each element is a plain string here.
features['fn_vowel_ratio'] = features['FirstName'].apply(calculate_vowel_ratio)

# Irregular capitalization (capital letter after first character)
def has_irregular_caps(name):
    """Return 1 if any character after the first is upper-case, else 0.

    Names with zero or one character have nothing after the first position
    and therefore always yield 0.
    """
    for ch in name[1:]:
        if ch.isupper():
            return 1
    return 0

# Finish the FirstName features, then build the same five features for LastName
features = features.assign(
    fn_has_irregular_caps=lambda d: d['FirstName'].apply(has_irregular_caps),
    # --- LastName Features ---
    ln_length=lambda d: d['LastName'].str.len(),
    ln_has_number=lambda d: d['LastName'].str.contains(r'\d', regex=True).astype(int),
    ln_has_symbol=lambda d: d['LastName'].str.contains(r'[!@#$%_&]', regex=True).astype(int),
    ln_vowel_ratio=lambda d: d['LastName'].apply(calculate_vowel_ratio),
    ln_has_irregular_caps=lambda d: d['LastName'].apply(has_irregular_caps),
)

print(
    "Name-based features created:",
    "  - Length (fn_length, ln_length)",
    "  - Has number (fn_has_number, ln_has_number)",
    "  - Has symbol (fn_has_symbol, ln_has_symbol)",
    "  - Vowel ratio (fn_vowel_ratio, ln_vowel_ratio)",
    "  - Irregular caps (fn_has_irregular_caps, ln_has_irregular_caps)",
    sep="\n",
)

# --- One-Hot Encode Title Column ---
print("\nEncoding Title column with OneHotEncoder...")
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
title_encoded = encoder.fit_transform(features[['Title']])

# One output column per title category seen by the encoder, named title_<category>
title_columns = [f'title_{cat}' for cat in encoder.categories_[0]]

# Wrap the dense array in a DataFrame aligned to the rows of `features`
title_encoded_df = pd.DataFrame(title_encoded, columns=title_columns, index=features.index)

# Final matrix: engineered numeric features (raw text columns removed)
# followed by the one-hot title columns
features_final = pd.concat(
    [features.drop(columns=['Title', 'FirstName', 'LastName']), title_encoded_df],
    axis=1,
)

print(f"One-hot encoding complete: {len(title_columns)} title categories")
print(f"\nTotal features created: {features_final.shape[1]}")

# ============================================================================
# STEP 3: MODEL TRAINING
# ============================================================================

print("\n" + "=" * 70, "STEP 3: Model Training", "=" * 70, sep="\n")

# Feature matrix (X) and target variable (y)
X, y = features_final, combined_df['is_real']

print(f"\nFeature matrix shape: {X.shape}", f"Target variable shape: {y.shape}", sep="\n")

# 80/20 train/test split; stratify on y so both parts keep the same class
# proportions, and fix the seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"\nTraining set: {len(X_train)} samples", f"Test set: {len(X_test)} samples", sep="\n")

print("\nTraining XGBoost model...")

# Gradient-boosted tree classifier with fixed hyperparameters and seed
xgb_model = XGBClassifier(
    max_depth=5,
    n_estimators=100,
    learning_rate=0.1,
    eval_metric='logloss',
    random_state=42,
)
xgb_model.fit(X_train, y_train)

print("Model training complete!")

# ============================================================================
# STEP 4: MODEL EVALUATION
# ============================================================================

print("\n" + "=" * 70)
print("STEP 4: Model Evaluation")
print("=" * 70)

# Make predictions on the held-out test set
y_pred = xgb_model.predict(X_test)

# Overall accuracy. Read it together with the per-class report below: when the
# class counts printed in Step 1 are skewed, accuracy alone can look good even
# if the minority class is predicted poorly.
accuracy = accuracy_score(y_test, y_pred)

print(f"\n{'ACCURACY SCORE':^70}")
print("-" * 70)
print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Classification Report
# labels=[0, 1] pins the row order to the target_names given here instead of
# relying on whichever labels happen to occur in y_test / y_pred.
print(f"\n{'CLASSIFICATION REPORT':^70}")
print("-" * 70)
print(classification_report(
    y_test, y_pred, labels=[0, 1], target_names=['Fake (0)', 'Real (1)']
))

# Confusion Matrix
# With labels=[0, 1] the matrix is always 2x2 with row/column 0 = fake and
# row/column 1 = real, so the unpacking below cannot silently mislabel cells.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print(f"\n{'CONFUSION MATRIX':^70}")
print("-" * 70)
print(f"{'':>20} {'Predicted Fake':>15} {'Predicted Real':>15}")
print(f"{'Actual Fake':>20} {tn:>15} {fp:>15}")
print(f"{'Actual Real':>20} {fn:>15} {tp:>15}")

# Additional insights ("positive" is the real class, label 1)
print(f"\n{'MODEL INSIGHTS':^70}")
print("-" * 70)
print(f"True Negatives (Correctly identified fake): {tn}")
print(f"False Positives (Fake classified as real): {fp}")
print(f"False Negatives (Real classified as fake): {fn}")
print(f"True Positives (Correctly identified real): {tp}")

# Feature importance (top 10), highest importance first
print(f"\n{'TOP 10 MOST IMPORTANT FEATURES':^70}")
print("-" * 70)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

for row in feature_importance.head(10).itertuples(index=False):
    print(f"{row.feature:>30}: {row.importance:.4f}")

print("\n" + "=" * 70)
print("Pipeline Complete!")
print("=" * 70)

STEP 1: Loading and Preparing Data

Real names loaded: 139952 records
Fake names loaded: 20000 records

Combined dataset: 159952 total records
Data shuffled successfully

Class Distribution:
  Real names (is_real=1): 139952
  Fake names (is_real=0): 20000

STEP 2: Feature Engineering

Creating name-based features...
Name-based features created:
  - Length (fn_length, ln_length)
  - Has number (fn_has_number, ln_has_number)
  - Has symbol (fn_has_symbol, ln_has_symbol)
  - Vowel ratio (fn_vowel_ratio, ln_vowel_ratio)
  - Irregular caps (fn_has_irregular_caps, ln_has_irregular_caps)

Encoding Title column with OneHotEncoder...
One-hot encoding complete: 121 title categories

Total features created: 131

STEP 3: Model Training

Feature matrix shape: (159952, 131)
Target variable shape: (159952,)

Training set: 127961 samples
Test set: 31991 samples

Training XGBoost model...
Model training complete!

STEP 4: Model Evaluation

                            ACCURACY SCORE                     