In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
train_original=pd.read_csv('train.csv')
test_original=pd.read_csv('test.csv')

In [3]:
train_original.head()

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [4]:
train_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              26570 non-null  int64  
 1   product_code    26570 non-null  object 
 2   loading         26320 non-null  float64
 3   attribute_0     26570 non-null  object 
 4   attribute_1     26570 non-null  object 
 5   attribute_2     26570 non-null  int64  
 6   attribute_3     26570 non-null  int64  
 7   measurement_0   26570 non-null  int64  
 8   measurement_1   26570 non-null  int64  
 9   measurement_2   26570 non-null  int64  
 10  measurement_3   26189 non-null  float64
 11  measurement_4   26032 non-null  float64
 12  measurement_5   25894 non-null  float64
 13  measurement_6   25774 non-null  float64
 14  measurement_7   25633 non-null  float64
 15  measurement_8   25522 non-null  float64
 16  measurement_9   25343 non-null  float64
 17  measurement_10  25270 non-null 

In [5]:
corr_matrix = train_original.corr()

In [6]:
corr_matrix["failure"].sort_values(ascending=False)

failure           1.000000
loading           0.129089
measurement_17    0.033905
measurement_5     0.018079
measurement_8     0.017119
measurement_7     0.016787
measurement_2     0.015808
measurement_6     0.014791
measurement_0     0.009646
attribute_2       0.006337
measurement_14    0.006211
measurement_12    0.004398
measurement_3     0.003577
measurement_16    0.002237
measurement_10   -0.001515
measurement_13   -0.001831
measurement_15   -0.003544
measurement_9    -0.003587
measurement_11   -0.004801
id               -0.007545
measurement_4    -0.010488
measurement_1    -0.010810
attribute_3      -0.019222
Name: failure, dtype: float64

In [7]:
train_original.drop('product_code', axis=1, inplace=True)

In [8]:
train_original.drop('attribute_1', axis=1, inplace=True)

In [9]:
train_original = pd.get_dummies(train_original, columns=['attribute_0'])

In [10]:
train_original["loading"].fillna(train_original["loading"].median(skipna=True), inplace=True)

In [11]:
for i in range(3,23):
    cols=train_original.iloc[:,i]
    mean = np.mean(cols)
    std=np.std(cols)
    for j in range(0,26570):
        if str(cols[j])=='nan':
            cols[j]=np.random.normal(mean,std)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cols[j]=np.random.normal(mean,std)


In [12]:
train_original.isnull().sum()

id                        0
loading                   0
attribute_2               0
attribute_3               0
measurement_0             0
measurement_1             0
measurement_2             0
measurement_3             0
measurement_4             0
measurement_5             0
measurement_6             0
measurement_7             0
measurement_8             0
measurement_9             0
measurement_10            0
measurement_11            0
measurement_12            0
measurement_13            0
measurement_14            0
measurement_15            0
measurement_16            0
measurement_17            0
failure                   0
attribute_0_material_5    0
attribute_0_material_7    0
dtype: int64

In [13]:
train_original.head()

Unnamed: 0,id,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,...,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,attribute_0_material_5,attribute_0_material_7
0,0,80.1,9,5,7,8,4,18.04,12.518,15.748,...,17.594,15.193,15.029,15.913467,13.034,14.684,764.1,0,0,1
1,1,84.89,9,5,14,3,3,18.213,11.54,17.717,...,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0,0,1
2,2,82.43,9,5,12,1,5,18.057,11.652,16.738,...,19.16104,13.798,16.711,18.631,14.094,17.946,663.376,0,0,1
3,3,101.07,9,5,13,2,6,17.295,11.188,18.576,...,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0,0,1
4,4,188.06,9,5,9,2,8,19.346,12.95,16.99,...,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0,0,1


In [14]:
test_original.drop('product_code', axis=1, inplace=True)

In [15]:
test_original.drop('attribute_1', axis=1, inplace=True)

In [16]:
test_original = pd.get_dummies(test_original, columns=['attribute_0'])

In [17]:
test_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20775 entries, 0 to 20774
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      20775 non-null  int64  
 1   loading                 20552 non-null  float64
 2   attribute_2             20775 non-null  int64  
 3   attribute_3             20775 non-null  int64  
 4   measurement_0           20775 non-null  int64  
 5   measurement_1           20775 non-null  int64  
 6   measurement_2           20775 non-null  int64  
 7   measurement_3           20446 non-null  float64
 8   measurement_4           20366 non-null  float64
 9   measurement_5           20267 non-null  float64
 10  measurement_6           20151 non-null  float64
 11  measurement_7           20055 non-null  float64
 12  measurement_8           19929 non-null  float64
 13  measurement_9           19871 non-null  float64
 14  measurement_10          19708 non-null

In [18]:
test_original["loading"].fillna(test_original["loading"].median(skipna=True), inplace=True)

In [19]:
for i in range(2,22):
    cols=test_original.iloc[:,i]
    mean = np.mean(cols)
    std=np.std(cols)
    for j in range(0,20775):
        if str(cols[j])=='nan':
            cols[j]=np.random.normal(mean,std)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cols[j]=np.random.normal(mean,std)


In [23]:
cols=["id","loading","attribute_2","attribute_3","measurement_0","measurement_1","measurement_2","measurement_3","measurement_4","measurement_5","measurement_6","measurement_7","measurement_8","measurement_9","measurement_10","measurement_11","measurement_12","measurement_13","measurement_14","measurement_15","measurement_16","measurement_17","attribute_0_material_5","attribute_0_material_7"]
X=train_original[cols]
y=train_original['failure']
X_test=test_original[cols]

In [20]:
from imblearn.under_sampling import RandomUnderSampler

In [27]:
rus = RandomUnderSampler()

In [66]:
X_resampled, y_resampled = rus.fit_resample(X, y)
X_test_resampled=rus.fit_resample(X_test)

ValueError: Found input variables with inconsistent numbers of samples: [20775, 11298]

In [53]:
X_resampled = pd.DataFrame(X_resampled)
y_resampled = pd.DataFrame(y_resampled)
y_resampled.columns = ['failure']
undersampled_data = pd.concat([X_resampled, y_resampled], axis=1)

In [55]:
X = undersampled_data.values
y = undersampled_data.failure.values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [22]:
X_train, X_val, y_train, y_val = train_test_split(
...     X, y, test_size=0.1, random_state=42)

In [23]:
scaler = StandardScaler()

In [24]:
X_train = scaler.fit_transform(X_train)

In [25]:
X_val=scaler.transform(X_val)

In [26]:
X_test=scaler.transform(X_test)

In [63]:
X_train

array([[-0.09672274, -0.93997411, -1.18992743, ...,  1.33862728,
        -0.49309012,  0.49309012],
       [ 0.97604064,  1.17943438, -0.51086544, ..., -1.71263954,
        -0.49309012,  0.49309012],
       [ 1.48829624,  1.3580651 , -0.51086544, ..., -0.34392549,
        -0.49309012,  0.49309012],
       ...,
       [-1.62057881, -0.31232488,  1.52632054, ...,  1.22378599,
        -0.49309012,  0.49309012],
       [ 0.327114  ,  2.2740937 , -1.18992743, ..., -1.07569465,
        -0.49309012,  0.49309012],
       [ 1.35201643, -0.18329953, -0.51086544, ...,  0.70040385,
        -0.49309012,  0.49309012]])

# PCA

In [42]:
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

In [45]:
steps = [('pca', PCA(n_components=15)), ('m', LogisticRegression())]
pca_model = Pipeline(steps=steps)

In [46]:
pca_model.fit(X_train, y_train)

Pipeline(steps=[('pca', PCA(n_components=15)), ('m', LogisticRegression())])

In [47]:
result = cross_val_score(pca_model, X_train, y_train, scoring= 'roc_auc', cv= kf)

In [49]:
np.mean(result)

0.5884466402871393

In [76]:
y_val_pred_pca = pca_model.predict(X_val)

In [77]:
accuracy_score(y_val, y_val_pred_pca)

0.7993978170869401

In [50]:
y_pred_pca = pca_model.predict_proba(X_test)[:, 1]

In [52]:
submission_5 = pd.DataFrame({"id": test_original["id"],
                            "failure": y_pred_pca})
submission_5.to_csv('submission_5.csv', index=False)

In [103]:
pca = PCA(n_components = 15)
pca.fit(X_train)
pca.fit(X_val)
pca.fit(X_test)

PCA(n_components=15)

In [93]:
X_train_pca = pca.transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

In [88]:
lr=LogisticRegression()

In [89]:
lr.fit(X_train_pca,y_train)

LogisticRegression()

In [84]:
result = cross_val_score(lr, X_train_pca, y_train, scoring= 'roc_auc', cv= kf)

In [85]:
np.mean(result)

0.5880776544926627

# LOGISTIC REGRESSION

In [56]:
from sklearn.metrics import accuracy_score, roc_auc_score
from scipy.stats import skew
from sklearn.model_selection import KFold, cross_val_score

import optuna

In [57]:
models = {
    'logistic regression' : LogisticRegression()
}

In [58]:
for name, model in models.items():
    model.fit(X_train,y_train)
    print(f'{name} trained')

logistic regression trained


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [59]:
results = {}

kf = KFold(n_splits= 10)

for name, model in models.items():
    result = cross_val_score(model, X_train, y_train, scoring= 'roc_auc', cv= kf)
    results[name] = result

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [60]:
for name, result in results.items():
    print("----------------")
    print(f'{name} : {np.mean(result)}')

----------------
logistic regression : 0.9879846340952637


In [61]:
def lr_objective(trial):
    tol = trial.suggest_loguniform('tol', 1e-7, 0.1)
    C = trial.suggest_float('C', 0, 0.1)
    max_iter = trial.suggest_int('max_iter', 50, 600)

    model = LogisticRegression(
        solver= 'liblinear',
        penalty= 'l1',
        tol= tol,
        C= C,
        max_iter= max_iter
    )

    model.fit(X_train,y_train)
    cv_score = cross_val_score(model, X_train,y_train, scoring= 'roc_auc', cv= kf)

    return np.mean(cv_score)

In [62]:
study = optuna.create_study(direction= 'maximize')
study.optimize(lr_objective, n_trials= 30)

[32m[I 2022-09-01 03:27:26,295][0m A new study created in memory with name: no-name-0cecec87-bf04-4eed-b2c8-57d41c040135[0m
[32m[I 2022-09-01 03:27:27,337][0m Trial 0 finished with value: 1.0 and parameters: {'tol': 0.028445809291130324, 'C': 0.010966219323370231, 'max_iter': 370}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-09-01 03:27:30,617][0m Trial 1 finished with value: 1.0 and parameters: {'tol': 0.000250853910050956, 'C': 0.043557107906093455, 'max_iter': 552}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-09-01 03:27:38,171][0m Trial 2 finished with value: 1.0 and parameters: {'tol': 9.874365180843519e-06, 'C': 0.05975535889362769, 'max_iter': 255}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-09-01 03:27:50,895][0m Trial 3 finished with value: 1.0 and parameters: {'tol': 1.2518231930811648e-06, 'C': 0.05651862319180881, 'max_iter': 282}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-09-01 03:27:59,923][0m Trial 4 finished with value: 1.0 and parame

In [63]:
best_params = study.best_params

In [64]:
lr = LogisticRegression(**best_params)
lr.fit(X_train,y_train)
y_test_pred_lr = lr.predict_proba(X_test)[:, 1]



ValueError: X has 24 features, but LogisticRegression is expecting 25 features as input.

In [None]:
submission_7 = pd.DataFrame({"id": test_original["id"],
                            "failure": y_test_pred_lr})
submission_7.to_csv('submission_7.csv', index=False)

In [37]:
from sklearn.tree import DecisionTreeClassifier

In [38]:
from sklearn.model_selection import GridSearchCV

In [None]:
params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, verbose=1, cv=3)

grid_search_cv.fit(X_train, y_train)

In [None]:
kf = KFold(n_splits= 10)
result=cross_val_score(grid_search_cv, X_train, y_train, scoring= 'roc_auc', cv= kf)

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", probability=True, random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred_val = clf.predict(X_val)
    print(clf.__class__.__name__, accuracy_score(y_val, y_pred_val))

In [None]:
y_pred_ensemble = voting_clf.predict_proba(X_test)[:, 1]