In [17]:
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer

def apply_feature_hashing(df, categorical_columns, n_features=5):
    """Replace categorical columns with hashed numeric feature columns.

    Every column named in ``categorical_columns`` is cast to ``str``, passed
    through a ``HashingVectorizer`` and replaced by ``n_features`` columns
    named ``<col>_hash_<i>``. The untouched columns keep their order and the
    hashed groups are appended in the order the columns were listed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    categorical_columns : list of str
        Names of the columns to hash. Each must be present in ``df``.
    n_features : int, default 5
        Number of hash buckets (output columns) per categorical column.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the categorical columns, plus the hashed columns,
        sharing ``df``'s index.

    Notes
    -----
    ``astype(str)`` turns missing values into literal text such as ``"nan"``,
    which is then hashed like any other value.

    NOTE(review): ``HashingVectorizer`` is a text vectorizer. Per the
    scikit-learn documentation its default analyzer lowercases the input and
    keeps only word tokens of two or more characters, so a single-character
    category would yield an all-zero row and a multi-word value sets several
    buckets. Confirm this is acceptable for the columns being hashed.
    """
    hasher = HashingVectorizer(n_features=n_features, norm=None, alternate_sign=False)
    hashed_frames = []
    for col in categorical_columns:
        # Cast a temporary Series only: assigning the result back into ``df``
        # would silently alter the caller's DataFrame.
        text_values = df[col].astype(str)
        hashed_features = hasher.transform(text_values).toarray()
        hashed_columns = [f"{col}_hash_{i}" for i in range(n_features)]
        hashed_frames.append(
            pd.DataFrame(hashed_features, columns=hashed_columns, index=df.index)
        )
    # A single concat at the end avoids rebuilding the whole frame per column.
    remaining = df.drop(columns=list(categorical_columns))
    return pd.concat([remaining, *hashed_frames], axis=1)

# Categorical columns whose raw values are swapped for hashed features.
categorical_columns = ['Surname', 'Geography', 'Gender']

# Hash each listed column into five buckets and preview the first rows.
hashed_df = apply_feature_hashing(
    df=df,
    categorical_columns=categorical_columns,
    n_features=5,
)
print(hashed_df.head(n=5))



    id  CustomerId  CreditScore   Age  Tenure    Balance  NumOfProducts  \
0  0.0  15674932.0        668.0  33.0     3.0       0.00            2.0   
1  1.0  15749177.0        627.0  33.0     1.0       0.00            2.0   
2  2.0  15694510.0        678.0  40.0    10.0       0.00            2.0   
3  3.0  15741417.0        581.0  34.0     2.0  148882.54            1.0   
4  4.0  15766172.0        716.0  33.0     5.0       0.00            2.0   

   HasCrCard  IsActiveMember  EstimatedSalary  ...  Geography_hash_0  \
0        1.0             0.0        181449.97  ...               0.0   
1        1.0             1.0         49503.50  ...               0.0   
2        1.0             0.0        184866.69  ...               0.0   
3        1.0             1.0         84560.88  ...               0.0   
4        1.0             1.0         15068.83  ...               0.0   

   Geography_hash_1  Geography_hash_2  Geography_hash_3  Geography_hash_4  \
0               0.0               0.0  

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Read the augmented dataset and discard exact duplicate rows in one step.
df = pd.read_csv('/kaggle/input/complete-dataset/augmented_data.csv').drop_duplicates()

def fill_missing_values(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    ``float64``/``int64`` columns have their missing entries replaced by the
    column mean; ``object`` columns have them replaced by the most frequent
    value. Columns of any other dtype are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    filled = df.copy()
    for col in filled.select_dtypes(include=['float64', 'int64']).columns:
        filled[col] = filled[col].fillna(filled[col].mean())
    for col in filled.select_dtypes(include=['object']).columns:
        modes = filled[col].mode()
        # A column with no observed values has no mode; indexing the empty
        # result would raise, so such a column is left as it is.
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])
    return filled

# Rows without a label cannot be used for supervised training. Remove them
# before imputation: a numeric target would otherwise be filled with the
# column mean and then truncated by the integer cast below, turning every
# missing label into a fabricated class value.
df = df.dropna(subset=['Exited'])

df = fill_missing_values(df)

# Pure identifiers carry no predictive signal.
columns_to_drop = ['id', 'CustomerId']
df = df.drop(columns=columns_to_drop)

df['Exited'] = df['Exited'].astype(int)

print("Unique values in Exited:", df['Exited'].unique())

print("Number of NaN values before cleaning:")
print(df.isnull().sum())

# Safety net for columns the imputer does not cover (dtypes other than
# float64/int64/object).
df = df.dropna()

print("\nNumber of NaN values after cleaning:")
print(df.isnull().sum())

categorical_columns = ['Surname', 'Geography', 'Gender']
numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                    'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

# Number of hash buckets per categorical column, defined once so the hasher,
# the feature matrix and the column names cannot drift apart.
n_hash_features = 20
hasher = FeatureHasher(n_features=n_hash_features, input_type='string')

hashed_blocks = []
for col in categorical_columns:
    # With input_type='string' every sample must be an iterable of strings,
    # so each value is wrapped in a one-element row; a flat array of strings
    # would be iterated character by character.
    col_data = df[col].astype(str).values.reshape(-1, 1)
    hashed_blocks.append(hasher.transform(col_data).toarray())

# One (n_rows, n_hash_features) block per categorical column, side by side.
hashed_features = np.hstack(hashed_blocks)

numerical_features = df[numerical_columns].values
X = np.hstack((numerical_features, hashed_features))

hashed_column_names = [
    f'{cat_col}_hash_{i}'
    for cat_col in categorical_columns
    for i in range(n_hash_features)
]

df_hashed = pd.DataFrame(X, columns=numerical_columns + hashed_column_names)
# Assign the target positionally. df_hashed has a fresh 0..n-1 index while
# df keeps its original labels (with gaps after drop_duplicates/dropna), so a
# Series assignment would align on index and yield NaN or shifted labels.
df_hashed['Exited'] = df['Exited'].to_numpy()

print("\nShape of transformed dataframe:", df_hashed.shape)
print("\nFirst few rows of the transformed dataframe:")
print(df_hashed.head())

# Separate the integer target from the feature matrix.
y = df_hashed['Exited'].to_numpy().astype(int)
X = df_hashed.drop(columns='Exited').to_numpy()

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics on the training split only, then apply them
# to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and evaluate models
# All three estimators go through the same fit / predict / report routine, so
# they are driven from one table instead of three copy-pasted sections.
classifiers = {
    'SVM': SVC(kernel='rbf', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
}
predictions = {}

for model_label, classifier in classifiers.items():
    print(f"\nTraining {model_label}...")
    classifier.fit(X_train_scaled, y_train)
    predictions[model_label] = classifier.predict(X_test_scaled)
    print(f"\n{model_label} Results:")
    print("Accuracy:", accuracy_score(y_test, predictions[model_label]))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions[model_label]))

# Keep the per-model names that the rest of the notebook may rely on.
svm, rf, lr = (
    classifiers['SVM'],
    classifiers['Random Forest'],
    classifiers['Logistic Regression'],
)
svm_pred, rf_pred, lr_pred = (
    predictions['SVM'],
    predictions['Random Forest'],
    predictions['Logistic Regression'],
)

Unique values in Exited: [0 1]
Number of NaN values before cleaning:
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Number of NaN values after cleaning:
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

Shape of transformed dataframe: (275058, 69)

First few rows of the transformed dataframe:
   CreditScore   Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0        668.0  33.0     3.0       0.00            2.0        1.0   
1        627.0  33.0     1.0       0.00            2.0        1.0   
2        678.0  40.0    10.0       0.00            2.0        1.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Read the augmented dataset and discard exact duplicate rows in one step.
df = pd.read_csv('/kaggle/input/complete-dataset/augmented_data.csv').drop_duplicates()

def fill_missing_values(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    ``float64``/``int64`` columns have their missing entries replaced by the
    column mean; ``object`` columns have them replaced by the most frequent
    value. Columns of any other dtype are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    filled = df.copy()
    for col in filled.select_dtypes(include=['float64', 'int64']).columns:
        filled[col] = filled[col].fillna(filled[col].mean())
    for col in filled.select_dtypes(include=['object']).columns:
        modes = filled[col].mode()
        # A column with no observed values has no mode; indexing the empty
        # result would raise, so such a column is left as it is.
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])
    return filled

# Rows without a label cannot be used for supervised training. Remove them
# before imputation: a numeric target would otherwise be filled with the
# column mean and then truncated by the integer cast below, turning every
# missing label into a fabricated class value.
df = df.dropna(subset=['Exited'])

df = fill_missing_values(df)

print("Original dataset size:", len(df))

# Cast to int, then collapse every positive value to 1 so the target is
# strictly binary.
df['Exited'] = df['Exited'].astype(int)
df['Exited'] = (df['Exited'] > 0).astype(int)

# Work on a reproducible 30% random subsample.
subsample_size = 0.3
df_subsampled = df.sample(frac=subsample_size, random_state=42)
print("Subsampled dataset size:", len(df_subsampled))

# Pure identifiers carry no predictive signal.
columns_to_drop = ['id', 'CustomerId']
df_subsampled = df_subsampled.drop(columns=columns_to_drop)

# Integer-encode the categorical columns. One encoder object is refit for
# each column in turn, so afterwards it only remembers the last column.
le = LabelEncoder()
categorical_columns = ['Surname', 'Geography', 'Gender']
for col in categorical_columns:
    as_text = df_subsampled[col].astype(str)
    df_subsampled[col] = le.fit_transform(as_text)

# Target on one side, every remaining column as a feature on the other.
y = df_subsampled['Exited']
X = df_subsampled.drop(columns='Exited')

# 80/20 split with a fixed seed; scaling statistics come from the training
# split only and are then applied to both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and evaluate initial models
print("\nInitial Model Training:")

# All three estimators go through the same fit / predict / report routine, so
# they are driven from one table instead of three copy-pasted sections.
classifiers = {
    'SVM': SVC(kernel='rbf', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
}
predictions = {}

for model_label, classifier in classifiers.items():
    print(f"\nTraining {model_label}...")
    classifier.fit(X_train_scaled, y_train)
    predictions[model_label] = classifier.predict(X_test_scaled)
    print(f"\n{model_label} Results:")
    print("Accuracy:", accuracy_score(y_test, predictions[model_label]))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions[model_label]))

# Keep the per-model names: the results table built right after this section
# reads svm_pred, rf_pred and lr_pred.
svm, rf, lr = (
    classifiers['SVM'],
    classifiers['Random Forest'],
    classifiers['Logistic Regression'],
)
svm_pred, rf_pred, lr_pred = (
    predictions['SVM'],
    predictions['Random Forest'],
    predictions['Logistic Regression'],
)

# Create DataFrame with initial results
# Pair each model name with its test-set predictions, then score them all in
# one pass.
initial_predictions = {
    'SVM': svm_pred,
    'Random Forest': rf_pred,
    'Logistic Regression': lr_pred,
}
results = {
    'Model': list(initial_predictions),
    'Accuracy': [
        accuracy_score(y_test, preds) for preds in initial_predictions.values()
    ],
}

results_df = pd.DataFrame(results)
print("\nModel Comparison:")
print(results_df)

# Function to subsample, encode, split and scale the data (it does not train any model)
def process_and_train(data, sample_size):
    """Subsample ``data`` and return scaled train/test splits.

    Despite its name, this function does not train anything: it only prepares
    the arrays that the models are later fitted on.

    The module-level ``columns_to_drop`` and ``categorical_columns`` lists
    decide which columns are removed and which are label-encoded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns, the columns listed in
        ``columns_to_drop`` and the ``'Exited'`` target.
    sample_size : float
        Fraction of rows to sample, passed to ``DataFrame.sample(frac=...)``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``: the scaled
        feature arrays and the matching target Series for an 80/20 split.
    """
    # Subsample the data (fixed seed, so a given fraction is reproducible)
    df_sub = data.sample(frac=sample_size, random_state=42)

    # Drop unnecessary columns
    df_sub = df_sub.drop(columns=columns_to_drop)

    # Handle categorical variables. A fresh encoder per column keeps this
    # function from refitting a shared module-level encoder as a side effect.
    for col in categorical_columns:
        df_sub[col] = LabelEncoder().fit_transform(df_sub[col].astype(str))

    # Prepare features and target
    X = df_sub.drop('Exited', axis=1)
    y = df_sub['Exited']

    # Split and scale; the scaler is fitted on the training split only
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

# Test different sample sizes
subsample_sizes = [0.1, 0.3, 0.5]
results_by_size = []

for size in subsample_sizes:
    # The same label text is used in the progress message and the results.
    size_label = f"{size*100}%"
    print(f"\nTesting with {size_label} of data")

    X_train_scaled, X_test_scaled, y_train, y_test = process_and_train(df, size)

    # Fresh, unfitted estimators for every sample size.
    models = {
        'SVM': SVC(kernel='rbf', random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    }

    for model_name, model in models.items():
        print(f"Training {model_name}...")
        pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
        accuracy = accuracy_score(y_test, pred)
        print(f"{model_name} Accuracy: {accuracy:.4f}")

        results_by_size.append(
            {'Sample Size': size_label, 'Model': model_name, 'Accuracy': accuracy}
        )

# Create DataFrame with results for different sample sizes
results_by_size_df = pd.DataFrame(results_by_size)
print("\nResults for different sample sizes:")
# One row per model, one column per sample size.
accuracy_table = results_by_size_df.pivot(
    index='Model', columns='Sample Size', values='Accuracy'
)
print(accuracy_table)

# Save results
results_by_size_df.to_csv('subsampling_results.csv', index=False)

# Feature importance analysis for Random Forest
# X_train_scaled / y_train are whatever the last iteration of the sample-size
# loop produced, i.e. the largest subsample.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

# Take the feature names from the frame that was passed to process_and_train
# (minus the columns it removes), not from the `X` left over from the earlier
# 30% subsample section, so the labels follow the data the model was fit on.
feature_names = df.columns.drop(columns_to_drop + ['Exited'])
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': rf.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

Original dataset size: 275058
Subsampled dataset size: 82517

Initial Model Training:

Training SVM...

SVM Results:
Accuracy: 0.9111730489578284

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95     13472
           1       0.86      0.62      0.72      3032

    accuracy                           0.91     16504
   macro avg       0.89      0.80      0.83     16504
weighted avg       0.91      0.91      0.91     16504


Training Random Forest...

Random Forest Results:
Accuracy: 0.9134755210857974

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     13472
           1       0.83      0.67      0.74      3032

    accuracy                           0.91     16504
   macro avg       0.88      0.82      0.84     16504
weighted avg       0.91      0.91      0.91     16504


Training Logistic Regression...

Logistic Regression Results:
Accuracy: