In [None]:
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer

def apply_feature_hashing(df, categorical_columns, n_features=5):
    """Replace categorical columns with fixed-width hashed count features.

    Each column named in ``categorical_columns`` is cast to ``str``, passed
    through a ``HashingVectorizer`` and replaced by ``n_features`` new columns
    named ``"<col>_hash_<i>"``. The hashed columns are appended after the
    remaining original columns, in the order the categorical columns were given.

    Args:
        df: Input DataFrame. It is not modified.
        categorical_columns: Names of the columns to hash.
        n_features: Number of hash buckets (output columns) per input column.

    Returns:
        A new DataFrame without the original categorical columns and with the
        hashed feature columns appended.
    """
    # The vectorizer is stateless, so a single instance serves every column.
    # NOTE(review): HashingVectorizer tokenizes and lowercases text, and its
    # default token pattern keeps only tokens of 2+ word characters, so a
    # single-character value hashes to an all-zero row and a multi-word value
    # is spread over several buckets. FeatureHasher hashes whole values
    # instead -- confirm which behaviour is intended.
    hasher = HashingVectorizer(n_features=n_features, norm=None, alternate_sign=False)

    hashed_frames = []
    for col in categorical_columns:
        # Cast on a temporary Series: assigning the cast back into ``df`` would
        # silently rewrite the caller's frame.
        as_text = df[col].astype(str)
        hashed_features = hasher.transform(as_text).toarray()
        hashed_columns = [f"{col}_hash_{i}" for i in range(n_features)]
        hashed_frames.append(
            pd.DataFrame(hashed_features, columns=hashed_columns, index=df.index)
        )

    # Drop every original column and concatenate once, instead of rebuilding
    # the whole frame on each loop iteration.
    remaining = df.drop(columns=list(categorical_columns))
    return pd.concat([remaining, *hashed_frames], axis=1)

# Columns to replace with hashed features.
# NOTE(review): `df` is not created in this cell; it has to exist in the kernel
# already (the visible `pd.read_csv` call sits in a later cell) -- confirm the
# intended cell order.
categorical_columns = ['Surname', 'Geography', 'Gender']

hashed_df = apply_feature_hashing(
    df=df,
    categorical_columns=categorical_columns,
    n_features=5,
)
print(hashed_df.head())



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Read the raw CSV and discard exact duplicate rows in a single expression.
df = pd.read_csv('/kaggle/input/complete-dataset/augmented_data.csv').drop_duplicates()

def fill_missing_values(df):
    """Impute missing values column by column and return the filled frame.

    ``float64``/``int64`` columns are filled with the column mean; ``object``
    columns are filled with the most frequent value (the first entry returned
    by ``Series.mode`` when several values tie).

    Args:
        df: Input DataFrame. It is left untouched; a copy is filled instead.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    filled = df.copy()

    for col in filled.select_dtypes(include=['float64', 'int64']).columns:
        # A column with no observed values has a NaN mean, so this is a no-op for it.
        filled[col] = filled[col].fillna(filled[col].mean())

    for col in filled.select_dtypes(include=['object']).columns:
        modes = filled[col].mode()
        # mode() is empty when every value is missing; indexing it with [0]
        # would raise, so such a column is left as it is.
        if modes.empty:
            continue
        filled[col] = filled[col].fillna(modes.iloc[0])

    return filled

# --- Cleaning ---------------------------------------------------------------
# Impute gaps, drop the identifier columns, and cast the target to int.
# NOTE(review): `Exited` goes through fill_missing_values as well, so a missing
# label would be mean-imputed and then truncated by the int cast -- confirm the
# target column has no gaps.
columns_to_drop = ['id', 'CustomerId']
df = (
    fill_missing_values(df)
    .drop(columns=columns_to_drop)
    .astype({'Exited': int})
)

# --- Column roles -----------------------------------------------------------
categorical_columns = ['Surname', 'Geography', 'Gender']
numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                    'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

# --- Feature hashing --------------------------------------------------------
# Every categorical column is hashed separately into its own 10-wide block;
# each sample is passed as a one-element list holding the stringified value.
hasher = FeatureHasher(n_features=10, input_type='string')  # Reduced n_features for example
hashed_features = [
    hasher.transform([[str(value)] for value in df[name]]).toarray()
    for name in categorical_columns
]
all_hashed = np.hstack(hashed_features)

# --- Design matrix and target ----------------------------------------------
# Numerical columns first, hashed blocks after them.
numerical_data = df[numerical_columns].values
X = np.concatenate([numerical_data, all_hashed], axis=1)
y = df['Exited'].values

# --- Hold-out split ---------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Scaling ----------------------------------------------------------------
# The scaler statistics come from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test, model_name):
    """Fit ``model``, print its accuracy figures and test report, return the accuracies.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train_scaled: Training features passed to ``fit`` and ``predict``.
        X_test_scaled: Test features passed to ``predict``.
        y_train: Training labels.
        y_test: Test labels.
        model_name: Label used in the printed header.

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``.
    """
    model.fit(X_train_scaled, y_train)

    # Predict on both splits so the train/test gap can be compared.
    fitted_on_train = model.predict(X_train_scaled)
    held_out = model.predict(X_test_scaled)

    train_accuracy = accuracy_score(y_train, fitted_on_train)
    test_accuracy = accuracy_score(y_test, held_out)

    header_lines = (
        f"\n{model_name} Results:",
        f"Training Accuracy: {train_accuracy:.4f}",
        f"Testing Accuracy: {test_accuracy:.4f}",
        "\nTest Set Classification Report:",
    )
    for line in header_lines:
        print(line)
    print(classification_report(y_test, held_out))

    return train_accuracy, test_accuracy

# Candidate classifiers, all seeded identically.
models = {
    'SVM': SVC(kernel='rbf', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

# Column-oriented accumulator, one list per output column.
results = {'Model': [], 'Training Accuracy': [], 'Testing Accuracy': []}

# Fit every candidate and record its scores.
for name, model in models.items():
    print(f"\nEvaluating {name}...")
    train_acc, test_acc = evaluate_model(
        model, X_train_scaled, X_test_scaled, y_train, y_test, name
    )
    row = {'Model': name, 'Training Accuracy': train_acc, 'Testing Accuracy': test_acc}
    for key, value in row.items():
        results[key].append(value)

# Summary table; the extra column is the train-minus-test accuracy gap.
results_df = pd.DataFrame(results)
results_df['Accuracy Difference'] = (
    results_df['Training Accuracy'] - results_df['Testing Accuracy']
)

print("\nFinal Results Summary:")
print(results_df)

# Persist the summary next to the notebook.
results_df.to_csv('feature_hashing_results.csv', index=False)

# Feature importances from the forest fitted in the loop above. Hashed columns
# are numbered across all hashed blocks in the order they were stacked.
rf_model = models['Random Forest']
feature_names = list(numerical_columns)
feature_names.extend(f'hash_feature_{i}' for i in range(all_hashed.shape[1]))
feature_importance = pd.DataFrame(
    {'feature': feature_names, 'importance': rf_model.feature_importances_}
).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Evaluating SVM...

SVM Results:
Training Accuracy: 0.9105
Testing Accuracy: 0.9072

Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94     44639
           1       0.85      0.62      0.72     10373

    accuracy                           0.91     55012
   macro avg       0.88      0.80      0.83     55012
weighted avg       0.90      0.91      0.90     55012


Evaluating Random Forest...

Random Forest Results:
Training Accuracy: 0.9998
Testing Accuracy: 0.9118

Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     44639
           1       0.83      0.67      0.74     10373

    accuracy                           0.91     55012
   macro avg       0.88      0.82      0.84     55012
weighted avg       0.91      0.91      0.91     55012


Evaluating Logistic Regression...

Logistic Regression Results:
Training Accuracy: 0.8676
T

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the augmented dataset and drop exact duplicate rows.
df = pd.read_csv('/kaggle/input/complete-dataset/augmented_data.csv').drop_duplicates()

def fill_missing_values(df):
    """Impute missing values column by column and return the filled frame.

    ``float64``/``int64`` columns are filled with the column mean; ``object``
    columns are filled with the most frequent value (the first entry returned
    by ``Series.mode`` when several values tie).

    Args:
        df: Input DataFrame. It is left untouched; a copy is filled instead.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    filled = df.copy()

    for col in filled.select_dtypes(include=['float64', 'int64']).columns:
        # A column with no observed values has a NaN mean, so this is a no-op for it.
        filled[col] = filled[col].fillna(filled[col].mean())

    for col in filled.select_dtypes(include=['object']).columns:
        modes = filled[col].mode()
        # mode() is empty when every value is missing; indexing it with [0]
        # would raise, so such a column is left as it is.
        if modes.empty:
            continue
        filled[col] = filled[col].fillna(modes.iloc[0])

    return filled

df = fill_missing_values(df)

print("Original dataset size:", len(df))

# Force the target to a 0/1 integer flag: cast to int, then mark anything
# above zero as 1.
df['Exited'] = (df['Exited'].astype(int) > 0).astype(int)

# Work on a fixed-seed 30% sample of the rows.
subsample_size = 0.3
df_subsampled = df.sample(frac=subsample_size, random_state=42)
print("Subsampled dataset size:", len(df_subsampled))

# Identifier columns are not used as features.
columns_to_drop = ['id', 'CustomerId']
df_subsampled = df_subsampled.drop(columns=columns_to_drop)

# Integer-encode the categorical columns; the single encoder is refitted for
# each column in turn.
le = LabelEncoder()
categorical_columns = ['Surname', 'Geography', 'Gender']
for col in categorical_columns:
    df_subsampled[col] = le.fit_transform(df_subsampled[col].astype(str))

X = df_subsampled.drop(columns='Exited')
y = df_subsampled['Exited']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Scaling statistics are learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


def _fit_and_report(model, label, train_features, test_features, train_labels, test_labels):
    """Fit ``model``, print its test accuracy and report, return ``(predictions, accuracy)``."""
    print(f"\nTraining {label}...")
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)
    accuracy = accuracy_score(test_labels, predictions)
    print(f"\n{label} Results:")
    print("Accuracy:", accuracy)
    print("\nClassification Report:")
    print(classification_report(test_labels, predictions))
    return predictions, accuracy


# Train and evaluate initial models
print("\nInitial Model Training:")

# SVM
svm = SVC(kernel='rbf', random_state=42)
svm_pred, svm_accuracy = _fit_and_report(
    svm, "SVM", X_train_scaled, X_test_scaled, y_train, y_test
)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pred, rf_accuracy = _fit_and_report(
    rf, "Random Forest", X_train_scaled, X_test_scaled, y_train, y_test
)

# Logistic Regression
lr = LogisticRegression(random_state=42, max_iter=1000)
lr_pred, lr_accuracy = _fit_and_report(
    lr, "Logistic Regression", X_train_scaled, X_test_scaled, y_train, y_test
)

# Side-by-side accuracy table for the three initial models.
results = {
    'Model': ['SVM', 'Random Forest', 'Logistic Regression'],
    'Accuracy': [svm_accuracy, rf_accuracy, lr_accuracy]
}

results_df = pd.DataFrame(results)
print("\nModel Comparison:")
print(results_df)

# Build scaled train/test splits from a random fraction of the data
def process_and_train(data, sample_size):
    """Sample, encode, split and scale ``data``; return the prepared splits.

    No model is fitted here despite the name. The function reads the
    module-level ``columns_to_drop``, ``categorical_columns`` and ``le``, and
    refits ``le`` once per categorical column.

    Args:
        data: DataFrame holding the feature columns and an ``Exited`` column.
        sample_size: Fraction of rows to draw (passed to ``DataFrame.sample``).

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test)``.
    """
    # Fixed-seed sample with the identifier columns removed.
    df_sub = data.sample(frac=sample_size, random_state=42).drop(columns=columns_to_drop)

    # Integer-encode each categorical column.
    for col in categorical_columns:
        df_sub[col] = le.fit_transform(df_sub[col].astype(str))

    features = df_sub.drop(columns='Exited')
    target = df_sub['Exited']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Scaling statistics come from the training split only.
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test), y_train, y_test

# Compare the three classifiers on increasingly large fixed-seed samples.
subsample_sizes = [0.1, 0.3, 0.5]
results_by_size = []

for size in subsample_sizes:
    # Use the percent format spec for the label: `size*100` is a float product
    # (0.3*100 == 30.000000000000004) and that noise would otherwise end up in
    # the log line, the pivot column labels and the saved CSV.
    size_label = f"{size:.1%}"
    print(f"\nTesting with {size_label} of data")

    X_train_scaled, X_test_scaled, y_train, y_test = process_and_train(df, size)

    # Fresh, identically seeded estimators for every sample size.
    models = {
        'SVM': SVC(kernel='rbf', random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
    }

    for model_name, model in models.items():
        print(f"Training {model_name}...")
        model.fit(X_train_scaled, y_train)
        pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, pred)
        print(f"{model_name} Accuracy: {accuracy:.4f}")

        results_by_size.append({
            'Sample Size': size_label,
            'Model': model_name,
            'Accuracy': accuracy
        })

# One row per (sample size, model); pivoted to models x sample sizes for display.
results_by_size_df = pd.DataFrame(results_by_size)
print("\nResults for different sample sizes:")
print(results_by_size_df.pivot(index='Model', columns='Sample Size', values='Accuracy'))

# Save results
results_by_size_df.to_csv('subsampling_results.csv', index=False)

# Feature importance analysis for Random Forest.
# The forest from the last loop iteration was fitted on exactly the split still
# held in X_train_scaled / y_train, with the same hyperparameters and seed, so
# it is reused instead of fitting an identical model a second time.
rf = models['Random Forest']
# NOTE(review): the names come from the module-level `X` built for the initial
# 30% run; process_and_train drops the same columns, so the order should match
# -- confirm if the preprocessing ever diverges.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

Original dataset size: 275058
Subsampled dataset size: 82517

Initial Model Training:

Training SVM...

SVM Results:
Accuracy: 0.9111730489578284

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     13472
           1       0.86      0.62      0.72      3032

    accuracy                           0.91     16504
   macro avg       0.89      0.80      0.83     16504
weighted avg       0.91      0.91      0.91     16504


Training Random Forest...

Random Forest Results:
Accuracy: 0.9134755210857974

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     13472
           1       0.83      0.67      0.74      3032

    accuracy                           0.91     16504
   macro avg       0.88      0.82      0.84     16504
weighted avg       0.91      0.91      0.91     16504


Training Logistic Regression...

Logistic Regression Results:
Accuracy: