In [2]:
# %pip install pandas
# %pip install matplotlib
# %pip install imbalanced-learn
# %pip install numpy

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
df = pd.read_csv('good_customer.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1723 entries, 0 to 1722
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   month                1723 non-null   int64 
 1   credit_amount        1723 non-null   int64 
 2   credit_term          1723 non-null   int64 
 3   age                  1723 non-null   int64 
 4   sex                  1723 non-null   object
 5   education            1723 non-null   object
 6   product_type         1723 non-null   object
 7   having_children_flg  1723 non-null   int64 
 8   region               1723 non-null   int64 
 9   income               1723 non-null   int64 
 10  family_status        1723 non-null   object
 11  phone_operator       1723 non-null   int64 
 12  is_client            1723 non-null   int64 
 13  bad_client_target    1723 non-null   int64 
dtypes: int64(10), object(4)
memory usage: 188.6+ KB


In [5]:
df.head(10)

Unnamed: 0,month,credit_amount,credit_term,age,sex,education,product_type,having_children_flg,region,income,family_status,phone_operator,is_client,bad_client_target
0,1,7000,12,39,male,Secondary special education,Cell phones,0,2,21000,Another,0,0,0
1,1,19000,6,20,male,Secondary special education,Household appliances,1,2,17000,Another,3,1,0
2,1,29000,12,23,female,Secondary special education,Household appliances,0,2,31000,Another,2,0,0
3,1,10000,12,30,male,Secondary special education,Cell phones,1,2,31000,Unmarried,3,1,0
4,1,14500,12,25,female,Higher education,Cell phones,0,2,26000,Married,0,1,0
5,1,32500,24,47,female,Secondary special education,Furniture,0,2,26000,Married,0,1,0
6,1,8000,3,23,male,Higher education,Computers,0,2,21000,Another,0,1,0
7,1,20000,10,25,female,Higher education,Household appliances,0,0,33000,Married,2,1,0
8,1,26000,6,21,female,Secondary special education,Cell phones,0,0,31000,Another,2,1,0
9,1,15000,24,25,female,Secondary special education,Household appliances,1,2,26000,Another,3,0,0


In [6]:
rows_with_nan = df.isna().any(axis=1)

# Display rows that contain at least one NaN value
nan_rows = df[rows_with_nan]

nan_rows

Unnamed: 0,month,credit_amount,credit_term,age,sex,education,product_type,having_children_flg,region,income,family_status,phone_operator,is_client,bad_client_target


In [7]:
df.describe()

Unnamed: 0,month,credit_amount,credit_term,age,having_children_flg,region,income,phone_operator,is_client,bad_client_target
count,1723.0,1723.0,1723.0,1723.0,1723.0,1723.0,1723.0,1723.0,1723.0,1723.0
mean,6.708067,29264.654672,11.546721,35.911782,0.428323,1.68137,32652.350551,1.125363,0.604759,0.113755
std,3.53842,27926.778301,6.548354,13.120203,0.494979,0.704256,20913.193158,1.015822,0.489044,0.317606
min,1.0,5000.0,3.0,18.0,0.0,0.0,1000.0,0.0,0.0,0.0
25%,3.0,13000.0,6.0,26.0,0.0,2.0,21000.0,0.0,0.0,0.0
50%,7.0,21500.0,12.0,32.0,0.0,2.0,27000.0,1.0,1.0,0.0
75%,10.0,34000.0,12.0,44.0,1.0,2.0,38000.0,2.0,1.0,0.0
max,12.0,301000.0,36.0,90.0,1.0,2.0,401000.0,4.0,1.0,1.0


In [8]:
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE


label_encoder = LabelEncoder()
df["sex"] = label_encoder.fit_transform(df["sex"])
df["education"] = label_encoder.fit_transform(df["education"])
df["product_type"] = label_encoder.fit_transform(df["product_type"])
df["family_status"] = label_encoder.fit_transform(df["family_status"])


X = df.drop("bad_client_target", axis=1)
y = df["bad_client_target"]

smote = SMOTE(sampling_strategy={1: 1500}, k_neighbors=4, random_state=10)
X_resampled, y_resampled = smote.fit_resample(X, y)


resampled_quality_distribution = pd.Series(y_resampled).value_counts().sort_index()

resampled_quality_distribution

0    1527
1    1500
Name: bad_client_target, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(
    X_resampled, y_resampled, test_size=0.25, random_state=10)

X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(
    X, y, test_size=0.25, random_state=10)

classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=10),
    "SVM": SVC(random_state=10),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=10),
    "Naive Bayes": GaussianNB()
}

def train_evaluate_classifiers(classifiers, X_train, y_train, X_test, y_test):
    accuracies = {}
    for name, clf in classifiers.items():
        # Train the classifier
        clf.fit(X_train, y_train)

        # Predict on the test set
        y_pred = clf.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        accuracies[name] = accuracy
        print(f"{name} Accuracy: {accuracy:.4f}")

    return accuracies


scaler_resampled = StandardScaler()
X_train_resampled_scaled = scaler_resampled.fit_transform(X_train_resampled)
X_test_resampled_scaled = scaler_resampled.transform(X_test_resampled)

scaler_original = StandardScaler()
X_train_original_scaled = scaler_original.fit_transform(X_train_original)
X_test_original_scaled = scaler_original.transform(X_test_original)

print("Performance on Scaled Resampled Data:")
resampled_accuracies_scaled = train_evaluate_classifiers(classifiers, X_train_resampled_scaled, y_train_resampled, X_test_resampled_scaled, y_test_resampled)

print("\nPerformance on Scaled Original Data:")
original_accuracies_scaled = train_evaluate_classifiers(classifiers, X_train_original_scaled, y_train_original, X_test_original_scaled, y_test_original)

Performance on Scaled Resampled Data:
Decision Tree Accuracy: 0.8375
SVM Accuracy: 0.8230
Logistic Regression Accuracy: 0.7358
Naive Bayes Accuracy: 0.7160

Performance on Scaled Original Data:
Decision Tree Accuracy: 0.7935
SVM Accuracy: 0.8910
Logistic Regression Accuracy: 0.8886
Naive Bayes Accuracy: 0.8283


In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

param_grids = {
    "Decision Tree": {
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly']
    },
    "Logistic Regression": {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'solver': ['saga'],
        'l1_ratio': [None, 0.2, 0.4, 0.6, 0.8]  # Use None for other penalties, specific values for 'elasticnet'
    },
    "Naive Bayes": {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]
    }
}

def hyperparameter_optimization(classifiers, param_grids, X_train, y_train):
    best_params = {}
    for name, clf in classifiers.items():
        print(f"Optimizing {name}...")

        # Custom iterator to skip incompatible combinations
        param_iter = (params for params in ParameterGrid(param_grids[name])
                      if not (params['penalty'] == 'elasticnet' and params['l1_ratio'] is None))

        # Grid search for hyperparameters
        grid_search = GridSearchCV(clf, param_grid=param_grids[name], cv=5, n_jobs=-1, verbose=2)
        grid_search.fit(X_train, y_train)

        # Store best parameters
        best_params[name] = grid_search.best_params_
        print(f"Best parameters for {name}: {grid_search.best_params_}\n")

    return best_params


optimized_parameters = hyperparameter_optimization(classifiers, param_grids, X_train_resampled_scaled, y_train_resampled)

Optimizing Decision Tree...
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5}

Optimizing SVM...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters for SVM: {'C': 10, 'kernel': 'rbf'}

Optimizing Logistic Regression...
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Best parameters for Logistic Regression: {'C': 0.1, 'l1_ratio': 0.6, 'penalty': 'elasticnet', 'solver': 'saga'}

Optimizing Naive Bayes...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters for Naive Bayes: {'var_smoothing': 1e-09}



30 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\abhij\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\abhij\anaconda3\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\abhij\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1179, in fit
    raise ValueError("l1_ratio must be specified when penalty is elasticnet.")
ValueError: l1_ratio must be specified when penalty is elasticnet.

 0.70088106 0.71629956

In [11]:
optimized_parameters

{'Decision Tree': {'max_depth': None,
  'min_samples_leaf': 2,
  'min_samples_split': 5},
 'SVM': {'C': 10, 'kernel': 'rbf'},
 'Logistic Regression': {'C': 0.1,
  'l1_ratio': 0.6,
  'penalty': 'elasticnet',
  'solver': 'saga'},
 'Naive Bayes': {'var_smoothing': 1e-09}}

In [12]:
def plot_decision_boundaries(X, y, model, title):
    # Define the grid range
    h = 0.02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Predict class for each point in the grid
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Plotting
    plt.figure(figsize=(8, 6))
    plt.contourf(xx, yy, Z, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors="k", s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title(title)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")

In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [14]:
# Model initializations
classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(probability=True),
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Naive Bayes": GaussianNB(),
}

In [15]:
# Convert the 'type' column to numerical values using Label Encoding
label_encoder = LabelEncoder()

#### Baseline Performance of Good Customer data

In [33]:
accuracy_results = {name: [] for name in classifiers}
f1_score_results = {name: [] for name in classifiers}
# strategies = ["auto", "minority", "all", "not majority", "not minority"]

# smote = SMOTE(sampling_strategy="auto", random_state=10, k_neighbors=4)
# enn = EditedNearestNeighbours(sampling_strategy="auto", n_neighbors=4)


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracy_results[name].append(accuracy)
    f1 = f1_score(y_test, predictions, average="weighted")
    f1_score_results[name].append(f1)
    print(f"F1 Score for {name}: {f1:.4f}")
    print(f"Accuracy for {name}: {accuracy:.4f}")
    print(
        f"Classification Report for {name}:\n",
        classification_report(y_test, predictions),
    )
    print("-" * 50)

F1 Score for Decision Tree: 0.7962
Accuracy for Decision Tree: 0.7756
Classification Report for Decision Tree:
               precision    recall  f1-score   support

           0       0.90      0.84      0.87       460
           1       0.17      0.26      0.21        57

    accuracy                           0.78       517
   macro avg       0.54      0.55      0.54       517
weighted avg       0.82      0.78      0.80       517

--------------------------------------------------
F1 Score for SVM: 0.8378
Accuracy for SVM: 0.8897
Classification Report for SVM:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94       460
           1       0.00      0.00      0.00        57

    accuracy                           0.89       517
   macro avg       0.44      0.50      0.47       517
weighted avg       0.79      0.89      0.84       517

--------------------------------------------------
F1 Score for Logistic Regression: 0.8378
Accuracy 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
if __name__ == '__main__':
    n = int(input())
    integer_list = map(int, input().split())

    # Create a tuple
    tuple_of_integers = tuple(integer_list)

    # Print the hash of the tuple
    result = hash(tuple_of_integers)
    print(result)


529344067295497451
