In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer # Imputation
from sklearn.preprocessing import QuantileTransformer # Scaling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score, precision_recall_curve


# Deep Learning 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the competition input directory.
input_root = '../input/tabular-playground-series-aug-2022'
for folder, _subdirs, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Data

In [2]:
# Load the competition test set and preview its first rows.
df_test = pd.read_csv("../input/tabular-playground-series-aug-2022/test.csv")
df_test.head()

In [3]:
# Load the competition training set and preview its first rows.
df_train = pd.read_csv("../input/tabular-playground-series-aug-2022/train.csv")
df_train.head()

In [4]:
# Shapes of the train and test frames, shown as a tuple.
df_train.shape, df_test.shape

In [5]:
# Display only: the result is not assigned, so df_train itself is left unchanged
# and "id" remains a regular column for the cells below.
df_train.set_index("id")

### Data Type

In [6]:
# Column dtypes of the training set.
df_train.dtypes

In [7]:
# Column dtypes of the test set.
df_test.dtypes

### Basic stats of data

In [8]:
# Summary statistics with describe()'s default column selection.
df_train.describe()

In [9]:
df_test.describe()

In [10]:
# Same summary, extended to every column via include="all".
df_train.describe(include="all")

In [11]:
df_test.describe(include="all")

In [12]:
# Concise per-column overview of the training frame.
df_train.info()

## Data Preparation
* Checking of missing values
* Imputation: Sampling from Simulated Data Distribution
* Labelling Data
* Feature Engineering

In [13]:
# Missing values in Train data
# Per column: absolute number of missing cells and their share of all training rows.
num_miss_train = df_train.isna().sum()
percent_miss_train = num_miss_train / len(df_train) * 100
df_miss = pd.DataFrame(
    {
        "sum_missing_value": num_miss_train,
        "Percent_missing_value(%)": percent_miss_train,
    }
)
df_miss

In [14]:
# Missing values in Test data
# Per column: absolute number of missing cells and their share of all test rows.
num_miss_test = df_test.isna().sum()
percent_miss_test = num_miss_test / len(df_test) * 100
df_miss_test = pd.DataFrame(
    {
        "sum_missing_value": num_miss_test,
        "Percent_missing_value(%)": percent_miss_test,
    }
)
df_miss_test

### Numerical features

In [15]:
# Keep only the predictor columns of the training set (target "failure" removed).
df_train_feature = df_train.drop(columns="failure")
df_train_feature.columns

In [16]:
# Stack the training predictors on top of the test rows (train first, then test).
df_join = pd.concat((df_train_feature, df_test), axis="index")
df_join.shape

In [17]:
# Column dtypes of the combined train + test frame.
df_join.dtypes

In [18]:
# Missing values in all data (training predictors + test rows combined)
# The percentage is taken over the rows of df_join itself, i.e. the same frame
# the counts come from, so counts and percentages are consistent.
num_miss_all = df_join.isnull().sum()
percent_miss_all = num_miss_all / len(df_join) * 100
df_miss_all = pd.DataFrame({"sum_missing_value": num_miss_all,
                            "Percent_missing_value(%)": percent_miss_all})
df_miss_all

In [19]:
# Use the "id" column as the row index of the combined frame.
# NOTE(review): df_join is rebound here, so this cell is meant to run exactly once
# per kernel session, after the concat cell above.
df_join = df_join.set_index("id")
df_join.head()

In [20]:
# Remove the "product_code" column from the combined frame.
df_join = df_join.drop(["product_code"],axis=1)
df_join.head()

In [21]:
# Working copy that the cells below transform; df_join itself is not modified by them.
df_join1 = df_join.copy()

In [22]:
# Numeric feature
# "loading" plus every "measurement_*" column, in the frame's own column order.
column_names = df_join.columns
is_numeric_feature = (column_names == "loading") | column_names.str.startswith("measurement_")
feature = column_names[is_numeric_feature].tolist()
feature

### Visualize numeric data

In [23]:
# One histogram per numeric feature on a 5x4 grid, coloured by the "failure" label
# of the training set.
plt.figure(figsize=(25, 22))
feature = [col for col in df_join.columns if col == "loading" or col.startswith("measurement_")]
for position, col in enumerate(feature, start=1):
    ax = plt.subplot(5, 4, position)
    sns.histplot(data=df_train, x=col, hue="failure", ax=ax)
    ax.set_title(col)
    # Blank axis labels: the title already names the feature.
    ax.set_xlabel(" ")
    ax.set_ylabel(" ")

In [24]:
# Closer look at the "loading" feature: histogram with a KDE overlay.
sns.histplot(data=df_join1, x="loading", kde=True)

In [25]:
# Log-transformation
# Replace "loading" by its natural logarithm, then re-plot the distribution.
loading_log = np.log(df_join1["loading"])
df_join1["loading"] = loading_log
sns.histplot(data=df_join1, x="loading", kde=True)

### Scaling data and KNN imputation 

In [26]:
# Scaling data
# Quantile-transform every numeric feature. The transformer is seeded because
# QuantileTransformer may subsample rows when estimating quantiles, and an
# unseeded run would then not be reproducible.
scaler = QuantileTransformer(random_state=42)

df_join1[feature] = scaler.fit_transform(df_join1[feature])
df_join1.head()

In [27]:
# Impute missing numeric values from the 5 nearest rows (uniform weighting,
# NaN-aware Euclidean distance), then confirm what is still missing per column.
knn_imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
imputed_values = knn_imputer.fit_transform(df_join1[feature])
df_join1[feature] = imputed_values
df_join1.isnull().sum()

In [28]:
# Distribution of every numeric feature after the transformations above, split by label.
# Only the training rows carry a "failure" label. They are the first len(df_train)
# rows of df_join1 (train was concatenated before test), so they are selected by
# position and the labels attached as a plain array. This avoids relying on index
# alignment between df_join1 (indexed by "id") and df_train (default index).
plt.figure(figsize=(25, 22))
feature = [col for col in df_join.columns if col == "loading" or col.startswith("measurement_")]
train_scaled = df_join1.iloc[:len(df_train)].assign(failure=df_train["failure"].to_numpy())
for i, col in enumerate(feature):
    plt.subplot(5, 4, i + 1)
    sns.histplot(data=train_scaled, x=col, hue="failure")
    plt.title(col)
    plt.xlabel(" ")
    plt.ylabel(" ")

### Categorical features
* attribute_0 - attribute_3 columns

In [29]:
# Column attribute_0: distinct values seen in train and in test.
print("Data in attribute_0 of train data: ",df_train["attribute_0"].unique())
print("Data in attribute_0 of test data: ",df_test["attribute_0"].unique())

In [30]:
# Frequency of each attribute_0 value in the training set.
df_train_att0 = df_train["attribute_0"].value_counts()
df_train_att0

In [31]:
# Frequency of each attribute_0 value in the test set.
df_test_att0 = df_test["attribute_0"].value_counts()
df_test_att0

In [32]:
# Train and test counts stacked into one Series, keyed by "train" / "test".
pd.concat([df_train_att0, df_test_att0], keys=["train", "test"],axis=0)

In [33]:
# Column attribute_1: distinct values seen in train and in test.
print("Data in attribute_1 of train data: ",df_train["attribute_1"].unique())
print("Data in attribute_1 of test data: ",df_test["attribute_1"].unique())

In [34]:
# Value counts of attribute_1 for train and test, stacked under "train" / "test".
df_train_att1 = df_train["attribute_1"].value_counts()
df_test_att1 = df_test["attribute_1"].value_counts()
pd.concat([df_train_att1, df_test_att1], keys=["train", "test"], axis=0)

In [35]:
# Column attribute_2: distinct values seen in train and in test.
print("Data in attribute_2 of train data: ",df_train["attribute_2"].unique())
print("Data in attribute_2 of test data: ",df_test["attribute_2"].unique())

In [36]:
# Value counts of attribute_2 for train and test, stacked under "train" / "test".
df_train_att2 = df_train["attribute_2"].value_counts()
df_test_att2 = df_test["attribute_2"].value_counts()
pd.concat([df_train_att2, df_test_att2], keys=["train", "test"], axis=0)

In [37]:
# Column attribute_3: distinct values seen in train and in test.
print("Data in attribute_3 of train data: ",df_train["attribute_3"].unique())
print("Data in attribute_3 of test data: ",df_test["attribute_3"].unique())

In [38]:
# Value counts of attribute_3 for train and test, stacked under "train" / "test".
df_train_att3 = df_train["attribute_3"].value_counts()
df_test_att3 = df_test["attribute_3"].value_counts()
pd.concat([df_train_att3, df_test_att3], keys=["train", "test"], axis=0)

In [39]:
# Number of training rows for each (attribute_2, failure) combination.
df_train.groupby(["attribute_2","failure"])["attribute_2"].count()

In [40]:
# One-hot encoding of attribute_0 and attribute_1
one_hot_columns = ["attribute_0", "attribute_1"]
dummy_var = pd.get_dummies(df_join1[one_hot_columns])
dummy_var

In [41]:
# Swap the raw attribute_0 / attribute_1 columns for their dummy columns,
# which end up appended after the remaining columns.
df_join1 = pd.concat(
    [df_join1.drop(columns=["attribute_0", "attribute_1"]), dummy_var],
    axis=1,
)

In [42]:
# Label encoding of attribute_2 (attribute_3 is handled in the next cell)
# NOTE(review): `le` and `le_fit` are reused by the following cell, so the two
# cells must run in this order.
le = preprocessing.LabelEncoder()
le_fit = le.fit(df_join1["attribute_2"])
df_join1["attribute_2"] = le_fit.transform(df_join1["attribute_2"])
# Show the distinct encoded values as a sanity check.
df_join1["attribute_2"].unique()

In [43]:
# Label encoding of attribute_3 with its own encoder.
# Previously the shared `le` was re-fitted and the transform went through `le_fit`,
# while `le_fit2` was never used; that only worked because all three names pointed
# at the same object. Encoding from the untouched df_join (same rows, same order
# as df_join1) also keeps this cell idempotent on re-run.
le_attr3 = preprocessing.LabelEncoder()
le_fit2 = le_attr3.fit(df_join["attribute_3"])
df_join1["attribute_3"] = le_fit2.transform(df_join["attribute_3"])
df_join1["attribute_3"].unique()

In [44]:
# Preview the frame after encoding the categorical attributes.
df_join1.head()

<p>The EDA at https://www.kaggle.com/code/ambrosm/tpsaug22-eda-which-makes-sense</p>
<p>found that missing values in measurement_3 and measurement_5</p>
<p>are related to the failure rate.</p>
<p>Therefore, we create two additional indicator columns,</p>
<p>"m_3_missing" and "m_5_missing".</p>

In [45]:
# Indicator columns: 1 where measurement_3 / measurement_5 was missing in the raw data.
# The flags must come from df_join (the un-imputed frame): df_join1 has already been
# through KNN imputation, so measurement_3/5 contain no NaN there and the flags would
# be all zeros. df_join and df_join1 hold the same rows in the same order, so the
# values are attached by position.
df_join1['m_3_missing'] = df_join['measurement_3'].isna().astype(int).to_numpy()
df_join1['m_5_missing'] = df_join['measurement_5'].isna().astype(int).to_numpy()
df_join1.head()

### Checking for outliers

In [46]:
# Box plot of every numeric feature on a 10x2 grid to eyeball outliers.
plt.figure(figsize=(30, 30))
for slot, col in enumerate(feature, start=1):
    ax = plt.subplot(10, 2, slot)
    sns.boxplot(data=df_join1, x=col, ax=ax)
    ax.set_title(col)

In [47]:
# Preview the fully prepared frame before splitting it back into train and test.
df_join1.head()

In [48]:
# Undo the earlier concat: the first len(df_train) rows are the training rows,
# everything after them is the test set.
n_train_rows = len(df_train)
df_train_clean = df_join1.iloc[:n_train_rows]
df_test_clean = df_join1.iloc[n_train_rows:]
df_train_clean.shape, df_test_clean.shape

In [49]:
# Re-attach the target to the cleaned training rows.
# df_train_clean is indexed by "id" while df_train still has its default integer
# index, so the labels are attached by position (both frames hold the training rows
# in the same order) instead of relying on the two indexes happening to match.
# .assign also makes the cell idempotent: a re-run overwrites "failure" rather than
# appending a duplicate column.
df_train_clean = df_train_clean.assign(failure=df_train["failure"].to_numpy())
df_train_clean.head()

In [50]:
# Display the cleaned test frame.
df_test_clean

In [51]:
# Disabled: optional export of the cleaned frames to CSV.
# df_train_clean.to_csv("clean_train.csv", index=False)
# df_test_clean.to_csv("clean_test.csv", index=False)

In [52]:
# Disabled: removal of the exported CSV files.
# import os
# os.remove("./clean_test.csv")
# os.remove("./clean_train.csv")

In [53]:
# Target vector: the "failure" column of the cleaned training frame.
y = df_train_clean.failure

In [54]:
y

In [55]:
# Feature matrix: every column of the cleaned training frame except "failure".
X = df_train_clean.drop("failure", axis=1)

In [56]:
X

In [57]:
# Stratified hold-out split: 10% of the labelled rows are kept for testing.
holdout_options = {"test_size": 0.1, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **holdout_options)

In [61]:
# Stratified hold-out split with 20% of the labelled rows kept for testing.
X_train80, X_test20, y_train80, y_test20 = train_test_split(X, y,
                                                            stratify=y,
                                                            test_size=0.2,
                                                            random_state=42)
# Backward-compatible alias: the 20% test labels used to be bound to the
# misleading name y_test80.
y_test80 = y_test20

### Find optimal class weight

In [58]:
lr = LogisticRegression(solver='newton-cg')

# Candidate weights for class 0; class 1 always receives the complement (1 - x).
weights = np.linspace(0.0, 0.99, 200)

# One grid entry per candidate pair of class weights.
param_grid = {'class_weight': [{0: x, 1: 1.0 - x} for x in weights]}

# Grid search over the class weights with stratified cross-validation, scored by F1.
gridsearch = GridSearchCV(estimator=lr,
                          param_grid=param_grid,
                          cv=StratifiedKFold(),
                          n_jobs=-1,
                          scoring='f1',
                          verbose=2).fit(X_train, y_train)

# Plot the mean cross-validated F1 score against the weight given to class 1.
sns.set_style('whitegrid')
plt.figure(figsize=(12, 8))
weigh_data = pd.DataFrame({'score': gridsearch.cv_results_['mean_test_score'], 'weight': (1 - weights)})
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.lineplot(data=weigh_data, x='weight', y='score')
plt.xlabel('Weight for class 1')
plt.ylabel('F1 score')
plt.xticks([round(i / 10, 1) for i in range(0, 11, 1)])
plt.title('Scoring for different class weights', fontsize=24)

In [59]:
def conf_matrix(y_test, pred_test):
    """Plot the confusion matrix of binary (0/1) predictions as an annotated heatmap.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    pred_test : array-like
        Predicted class labels, one per entry of ``y_test``.

    Returns
    -------
    None
        The heatmap is drawn on a new 6x6 matplotlib figure.
    """
    # Pin the label set so the matrix is always 2x2, even if one class is absent
    # from both inputs; otherwise the 2x2 DataFrame below could not be built.
    con_mat = confusion_matrix(y_test, pred_test, labels=[0, 1])
    con_mat = pd.DataFrame(con_mat, range(2), range(2))

    # Plot the confusion matrix
    plt.figure(figsize=(6,6))
    # NOTE: sns.set changes the global seaborn theme, so later plots are affected too.
    sns.set(font_scale=1.5)
    ax = sns.heatmap(con_mat, annot=True, annot_kws={"size": 16}, fmt='g', cmap='Blues', cbar=False)
    # confusion_matrix puts true labels on rows and predicted labels on columns.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

In [60]:
# Fixed class weights for the two classes.
# NOTE(review): presumably the best pair found by the grid search above - confirm
# against gridsearch.best_params_.
class_weight = {0: 0.16417085427135678, 1: 0.8358291457286432}

# Fit the weighted logistic regression on the 90% training split.
lr = LogisticRegression(solver='newton-cg', class_weight=class_weight).fit(X_train, y_train)

# Score the held-out split with F1 and show where the errors fall.
pred_test = lr.predict(X_test)
f1_test = f1_score(y_test, pred_test)
print('The f1 score for the testing data:', f1_test)
conf_matrix(y_test, pred_test)

In [62]:
# Same class-weight search as above, fitted on the 80% training split.
lr = LogisticRegression(solver='newton-cg')

# Candidate weights for class 0; class 1 always receives the complement (1 - x).
weights = np.linspace(0.0, 0.99, 200)

# One grid entry per candidate pair of class weights.
param_grid = {'class_weight': [{0: x, 1: 1.0 - x} for x in weights]}

# Grid search over the class weights with stratified cross-validation, scored by F1.
gridsearch = GridSearchCV(estimator=lr,
                          param_grid=param_grid,
                          cv=StratifiedKFold(),
                          n_jobs=-1,
                          scoring='f1',
                          verbose=2).fit(X_train80, y_train80)

# Plot the mean cross-validated F1 score against the weight given to class 1.
sns.set_style('whitegrid')
plt.figure(figsize=(12, 8))
weigh_data = pd.DataFrame({'score': gridsearch.cv_results_['mean_test_score'], 'weight': (1 - weights)})
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.lineplot(data=weigh_data, x='weight', y='score')
plt.xlabel('Weight for class 1')
plt.ylabel('F1 score')
plt.xticks([round(i / 10, 1) for i in range(0, 11, 1)])
plt.title('Scoring for different class weights', fontsize=24)

In [None]:
# Class-weight search for a random forest; seeded so the score curve is repeatable.
rf = RandomForestClassifier(n_estimators=40, random_state=42)

# Candidate weights for class 0; class 1 always receives the complement (1 - x).
weights = np.linspace(0.0, 0.99, 200)

# One grid entry per candidate pair of class weights.
param_grid = {'class_weight': [{0: x, 1: 1.0 - x} for x in weights]}

# Grid search over the class weights with stratified cross-validation, scored by F1.
gridsearch = GridSearchCV(estimator=rf,
                          param_grid=param_grid,
                          cv=StratifiedKFold(),
                          n_jobs=-1,
                          scoring='f1',
                          verbose=2).fit(X_train, y_train)

# Plot the mean cross-validated F1 score against the weight given to class 1.
sns.set_style('whitegrid')
plt.figure(figsize=(12, 8))
weigh_data = pd.DataFrame({'score': gridsearch.cv_results_['mean_test_score'], 'weight': (1 - weights)})
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.lineplot(data=weigh_data, x='weight', y='score')
plt.xlabel('Weight for class 1')
plt.ylabel('F1 score')
plt.xticks([round(i / 10, 1) for i in range(0, 11, 1)])
plt.title('Scoring for different class weights', fontsize=24)

In [108]:
# Random forest trained with the fixed class weights.
# Seeded so the reported F1 score and confusion matrix are the same on every run.
rf = RandomForestClassifier(n_estimators=40, class_weight=class_weight, random_state=42)
rf.fit(X_train, y_train)

# Predicting on the test data
pred_test = rf.predict(X_test)

# Calculating and printing the f1 score
f1_test = f1_score(y_test, pred_test)
print('The f1 score for the testing data:', f1_test)

# Plotting the confusion matrix
conf_matrix(y_test, pred_test)


### Deep Learning Model

In [113]:
# Create the model: three hidden Dense(10) + ReLU layers and a single sigmoid output unit.
model = Sequential([
  Dense(10),
  Activation("relu"),
  Dense(10),
  Activation("relu"),
  Dense(10),
  Activation("relu"),
  Dense(1),
  Activation("sigmoid")
])

# Stop training once val_loss has not improved for 15 consecutive epochs.
es = EarlyStopping(monitor="val_loss",patience=15)

model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                 optimizer=tf.keras.optimizers.Adam(),
                 metrics=["accuracy"])

# The callback has to be handed to fit(); creating it alone has no effect on training.
history = model.fit(X_train,
                    y_train,
                    epochs=100,
                    class_weight=class_weight,
                    validation_data=(X_test, y_test),
                    callbacks=[es])

In [115]:
# The network ends in a single sigmoid unit, so y_probs has one column holding the
# predicted probability of class 1. argmax over that single column is always 0,
# so the probability is thresholded at 0.5 to get class labels instead.
y_probs = model.predict(X_test)
y_preds = (y_probs.ravel() > 0.5).astype(int)

# Calculating and printing the f1 score
f1_test = f1_score(y_test, y_preds)
print('The f1 score for the testing data:', f1_test)

# Plotting the confusion matrix
conf_matrix(y_test, y_preds)

In [131]:
# Fixed class weights applied to the training loss.
class_weight={0: 0.16417085427135678, 1: 0.8358291457286432}

# Create the model: a wider network (512 -> 256 -> 128 -> 64 -> 32 units) with dropout
# after the first four hidden layers and a single sigmoid output unit.
model2 = Sequential([
  Dense(512, activation="relu"),
  Dropout(0.5),
  Dense(256, activation="relu"),
  Dropout(0.4),
  Dense(128, activation="relu"),
  Dropout(0.3),
  Dense(64, activation="relu"),
  Dropout(0.1),
  Dense(32, activation="relu"),
  Dense(1, activation="sigmoid")
])

# Stop training once val_loss has not improved for 15 consecutive epochs.
es = EarlyStopping(monitor="val_loss",patience=15)

model2.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                 optimizer=tf.keras.optimizers.Adam(),
                 metrics=[tf.keras.metrics.AUC()])

# The callback has to be handed to fit(); creating it alone has no effect on training.
history2 = model2.fit(X_train,
                    y_train,
                    epochs=100,
                    class_weight=class_weight,
                    validation_data=(X_test, y_test),
                    callbacks=[es])
