In [None]:
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer

def apply_feature_hashing(df, categorical_columns, n_features=5):
    """Replace categorical columns with fixed-width hashed features.

    Each listed column is cast to ``str`` and passed through a
    ``HashingVectorizer``; the resulting ``n_features`` columns are named
    ``"<col>_hash_<i>"`` and appended after the remaining original columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    categorical_columns : iterable of str
        Names of the columns to hash. Each must be present in ``df``.
    n_features : int, default 5
        Number of hash buckets (output columns) per input column.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``categorical_columns``, followed by the hashed
        columns in the order the input columns were listed.
    """
    categorical_columns = list(categorical_columns)
    # NOTE(review): HashingVectorizer is a text vectorizer and presumably
    # tokenizes each value with its default analyzer before hashing, so a
    # multi-word value may land in several buckets -- confirm this is intended.
    hasher = HashingVectorizer(n_features=n_features, norm=None, alternate_sign=False)

    hashed_frames = []
    for col in categorical_columns:
        # Cast on a temporary Series instead of assigning back into ``df`` so
        # the caller's frame keeps its original values and dtypes.
        values_as_text = df[col].astype(str)
        hashed_features = hasher.transform(values_as_text).toarray()
        hashed_columns = [f"{col}_hash_{i}" for i in range(n_features)]
        hashed_frames.append(
            pd.DataFrame(hashed_features, columns=hashed_columns, index=df.index)
        )

    # Drop and concatenate once instead of rebuilding the frame on every pass.
    return pd.concat([df.drop(columns=categorical_columns)] + hashed_frames, axis=1)

# Columns that will be replaced by hashed features.
categorical_columns = ['Surname', 'Geography', 'Gender']

# Load the raw data inside this cell so it runs on a fresh kernel instead of
# relying on a `df` left behind by another cell. The later cells drop
# 'Surname' from their `df`, which this cell needs.
df = pd.read_csv('/kaggle/input/complete-dataset/augmented_data.csv')

hashed_df = apply_feature_hashing(df, categorical_columns, n_features=5)

print(hashed_df.head())



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Read the augmented dataset and discard exact duplicate rows in one step
df = pd.read_csv('/kaggle/input/complete-dataset/augmented_data.csv').drop_duplicates()

def fill_missing_values(df):
    """Impute missing feature values and return the result as a new frame.

    Numerical columns are filled with their column mean and categorical
    columns (including the ``HasCrCard`` / ``IsActiveMember`` flags) with
    their most frequent value. The column lists are fixed inside the
    function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the numerical and categorical columns listed below.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns imputed.
    """
    numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    categorical_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

    # Work on a copy so the caller's frame is left untouched.
    filled = df.copy()

    for col in numerical_cols:
        filled[col] = filled[col].fillna(filled[col].mean())

    for col in categorical_cols:
        modes = filled[col].mode()
        # ``mode()`` is empty when every value in the column is missing; there
        # is nothing to impute with, so leave the column as it is instead of
        # failing on ``modes[0]``.
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])

    return filled

# Impute missing feature values before any further processing
df = fill_missing_values(df)

# Remove identifier-like columns that are not used as features
columns_to_drop = ['id', 'Surname', 'CustomerId']
df = df.drop(columns=columns_to_drop)

# Store the two flag columns with a categorical dtype
df['HasCrCard'] = df['HasCrCard'].astype('category')
df['IsActiveMember'] = df['IsActiveMember'].astype('category')

# Fill gaps in the target with its most frequent value, then cast to integer
df['Exited'] = df['Exited'].fillna(df['Exited'].mode()[0]).astype(int)

# Feature groups
categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

# Hash every categorical column into its own 10-wide block
hasher = FeatureHasher(n_features=10, input_type='string')
hashed_features = []

for col in categorical_columns:
    # One single-element list of strings per row, as the hasher is fed below
    col_data = [[text] for text in map(str, df[col])]
    hashed = hasher.transform(col_data).toarray()
    hashed_features.append(hashed)

# Hashed blocks side by side, in the order of categorical_columns
all_hashed = np.concatenate(hashed_features, axis=1)

# Final feature matrix: numerical columns first, hashed columns after
numerical_data = df[numerical_columns].to_numpy()
X = np.concatenate([numerical_data, all_hashed], axis=1)
y = df['Exited'].to_numpy()

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise with statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test, model_name):
    """Fit ``model`` on the training split and report accuracy on both splits.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train_scaled, X_test_scaled
        Feature matrices for the training and test splits.
    y_train, y_test
        Matching target values.
    model_name : str
        Label used in the printed report.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)``.
    """
    model.fit(X_train_scaled, y_train)

    # Predict both splits so the train/test gap can be inspected
    predicted_train = model.predict(X_train_scaled)
    predicted_test = model.predict(X_test_scaled)

    scores = (
        accuracy_score(y_train, predicted_train),
        accuracy_score(y_test, predicted_test),
    )

    header_lines = (
        f"\n{model_name} Results:",
        f"Training Accuracy: {scores[0]:.4f}",
        f"Testing Accuracy: {scores[1]:.4f}",
        "\nTest Set Classification Report:",
    )
    for line in header_lines:
        print(line)
    print(classification_report(y_test, predicted_test))

    return scores

# Initialize and evaluate models
models = {
    'SVM': SVC(kernel='rbf', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

# Column-oriented accumulator: one entry per model in each list
results = {
    'Model': [],
    'Training Accuracy': [],
    'Testing Accuracy': []
}

# Train and evaluate each model
for name, model in models.items():
    print(f"\nEvaluating {name}...")
    train_acc, test_acc = evaluate_model(model, X_train_scaled, X_test_scaled, 
                                       y_train, y_test, name)
    
    results['Model'].append(name)
    results['Training Accuracy'].append(train_acc)
    results['Testing Accuracy'].append(test_acc)

# Create results DataFrame
results_df = pd.DataFrame(results)
# Gap between training and testing accuracy for each model
results_df['Accuracy Difference'] = results_df['Training Accuracy'] - results_df['Testing Accuracy']

print("\nFinal Results Summary:")
print(results_df)

# Save results
results_df.to_csv('feature_hashing_results.csv', index=False)

# For Random Forest, we can also look at feature importance.
# The models dict holds the instances that were fitted in the loop above.
rf_model = models['Random Forest']
# Name each hashed column after the categorical column that produced it.
# X was built as the numerical columns followed by one hashed block per entry
# of categorical_columns, so the names are generated in that same order.
feature_names = numerical_columns + [
    f'{col}_hash_{i}'
    for col, block in zip(categorical_columns, hashed_features)
    for i in range(block.shape[1])
]
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Evaluating SVM...

SVM Results:
Training Accuracy: 0.9102
Testing Accuracy: 0.9086

Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95     44639
           1       0.85      0.63      0.72     10373

    accuracy                           0.91     55012
   macro avg       0.88      0.80      0.83     55012
weighted avg       0.91      0.91      0.90     55012


Evaluating Random Forest...

Random Forest Results:
Training Accuracy: 0.9996
Testing Accuracy: 0.9105

Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     44639
           1       0.82      0.67      0.74     10373

    accuracy                           0.91     55012
   macro avg       0.87      0.82      0.84     55012
weighted avg       0.91      0.91      0.91     55012


Evaluating Logistic Regression...

Logistic Regression Results:
Training Accuracy: 0.8675
T

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Read the augmented dataset and discard exact duplicate rows in one step
df = pd.read_csv('/kaggle/input/complete-dataset/augmented_data.csv').drop_duplicates()

def fill_missing_values(df):
    """Impute missing feature values and return the result as a new frame.

    Numerical columns are filled with their column mean. Categorical columns
    (including the ``HasCrCard`` / ``IsActiveMember`` flags) are filled with
    their most frequent value and converted to the ``category`` dtype. The
    column lists are fixed inside the function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the numerical and categorical columns listed below.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns imputed.
    """
    numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    categorical_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

    # Work on a copy so the caller's frame is left untouched.
    filled = df.copy()

    for col in numerical_cols:
        filled[col] = filled[col].fillna(filled[col].mean())

    for col in categorical_cols:
        modes = filled[col].mode()
        # ``mode()`` is empty when every value in the column is missing; there
        # is nothing to impute with, so skip the fill instead of failing on
        # ``modes[0]``.
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])
        filled[col] = filled[col].astype('category')  # Convert to categorical

    return filled

# Row count after duplicate removal, shown before any subsampling
print("Original dataset size:", df.shape[0])

def process_and_train(data, sample_size):
    """Subsample ``data``, preprocess it and return scaled train/test splits.

    Despite the name, no model is fitted here: the function draws a random
    fraction of the rows, imputes missing values, drops identifier columns,
    label-encodes the categorical columns, splits 80/20 and standardises the
    features.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset, including the ``Exited`` target and the ``id``,
        ``Surname`` and ``CustomerId`` columns that are dropped below.
    sample_size : float
        Fraction of rows to sample, passed to ``DataFrame.sample(frac=...)``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test, feature_names)``
        where ``feature_names`` is the column index of the feature frame.
    """
    # Subsample the data
    df_sub = data.sample(frac=sample_size, random_state=42)
    # One decimal place: a bare ``sample_size*100`` renders 0.3 as
    # 30.000000000000004 because of binary floating-point rounding.
    print(f"Subsampled dataset size for {sample_size*100:.1f}% sample: {len(df_sub)}")
    
    df_sub = fill_missing_values(df_sub)
    
    # Drop unnecessary columns
    columns_to_drop = ['id', 'Surname', 'CustomerId']
    df_sub = df_sub.drop(columns=columns_to_drop)
    
    # Handle target variable: fill gaps with the most frequent label, cast to int
    df_sub['Exited'] = df_sub['Exited'].fillna(df_sub['Exited'].mode()[0])
    df_sub['Exited'] = df_sub['Exited'].astype(int)
    
    # Label encode categorical variables (the encoder is refitted per column)
    le = LabelEncoder()
    categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
    for col in categorical_columns:
        df_sub[col] = le.fit_transform(df_sub[col].astype(str))
    
    # Prepare features and target
    X = df_sub.drop('Exited', axis=1)
    y = df_sub['Exited']
    
    # Split and scale; the scaler is fitted on the training split only
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train, y_test, X.columns

def evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test, model_name):
    """Fit ``model`` on the training split and report accuracy on both splits.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train_scaled, X_test_scaled
        Feature matrices for the training and test splits.
    y_train, y_test
        Matching target values.
    model_name : str
        Label used in the printed report.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)``.
    """
    model.fit(X_train_scaled, y_train)

    # Predict both splits so the train/test gap can be inspected
    predicted_train = model.predict(X_train_scaled)
    predicted_test = model.predict(X_test_scaled)

    scores = (
        accuracy_score(y_train, predicted_train),
        accuracy_score(y_test, predicted_test),
    )

    header_lines = (
        f"\n{model_name} Results:",
        f"Training Accuracy: {scores[0]:.4f}",
        f"Testing Accuracy: {scores[1]:.4f}",
        "\nClassification Report:",
    )
    for line in header_lines:
        print(line)
    print(classification_report(y_test, predicted_test))

    return scores

# Test different sample sizes
subsample_sizes = [0.1, 0.3, 0.5]
results_by_size = []

for size in subsample_sizes:
    # Format the percentage once, with one decimal place. A bare `size*100`
    # renders 0.3 as 30.000000000000004, which would leak into the log lines
    # and into the 'Sample Size' labels used as pivot/CSV column headers.
    size_label = f"{size*100:.1f}%"
    print(f"\nProcessing {size_label} of data")
    
    X_train_scaled, X_test_scaled, y_train, y_test, feature_names = process_and_train(df, size)
    
    # Fresh, unfitted estimators for every sample size
    models = {
        'SVM': SVC(kernel='rbf', random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
    }
    
    for model_name, model in models.items():
        train_acc, test_acc = evaluate_model(
            model, X_train_scaled, X_test_scaled, y_train, y_test, model_name
        )
        
        results_by_size.append({
            'Sample Size': size_label,
            'Model': model_name,
            'Training Accuracy': train_acc,
            'Testing Accuracy': test_acc,
            'Accuracy Difference': train_acc - test_acc
        })
        
        # For Random Forest, analyze feature importance
        if model_name == 'Random Forest':
            feature_importance = pd.DataFrame({
                'feature': feature_names,
                'importance': model.feature_importances_
            })
            feature_importance = feature_importance.sort_values('importance', ascending=False)
            print(f"\nTop 10 Important Features for {size_label} sample:")
            print(feature_importance.head(10))

# Create final results DataFrame (one row per sample size / model pair)
results_df = pd.DataFrame(results_by_size)

# Create pivot table for better visualization: models as rows, sample sizes as columns
pivot_results = results_df.pivot_table(
    index='Model',
    columns='Sample Size',
    values=['Training Accuracy', 'Testing Accuracy', 'Accuracy Difference']
)

print("\nFinal Results Summary:")
print(pivot_results)

# Save results
results_df.to_csv('subsampling_results.csv', index=False)
pivot_results.to_csv('subsampling_results_pivot.csv')

Original dataset size: 275058

Processing 10.0% of data
Subsampled dataset size for 10.0% sample: 27506

SVM Results:
Training Accuracy: 0.9062
Testing Accuracy: 0.9042

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      4478
           1       0.85      0.59      0.70      1024

    accuracy                           0.90      5502
   macro avg       0.88      0.78      0.82      5502
weighted avg       0.90      0.90      0.90      5502


Random Forest Results:
Training Accuracy: 1.0000
Testing Accuracy: 0.9091

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      4478
           1       0.82      0.66      0.73      1024

    accuracy                           0.91      5502
   macro avg       0.87      0.81      0.84      5502
weighted avg       0.91      0.91      0.91      5502


Top 10 Important Features for 10.0% sample:
         