In [41]:
import numpy as np
import pandas as pd

import catboost as cb
from lightgbm import LGBMRegressor
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

Handle the missing data: fill in the NaN values

In [42]:
def dealing_with_missingData(T, NAN_var, target, split, random_state=42):
    """Build a train/validation split for a model that imputes ``NAN_var``.

    Only rows of ``T`` without a missing value in any column are used.
    ``NAN_var`` becomes the regression label, and both ``NAN_var`` and
    ``target`` are removed from the feature columns.

    Parameters
    ----------
    T : pandas.DataFrame
        Table containing the feature columns, ``NAN_var`` and ``target``.
        It is not modified.
    NAN_var : str
        Name of the column whose missing values are to be predicted.
    target : str
        Name of the column to exclude from the features.
    split : float or int
        Forwarded to ``train_test_split`` as ``test_size`` (size of the
        validation part).
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to 42, the value
        that used to be hard-coded.

    Returns
    -------
    tuple
        ``(x_train, x_val, y_train, y_val)``.
    """
    # dropna() returns a new frame containing only fully observed rows,
    # so the caller's table is left untouched.
    complete_rows = T.dropna()

    features = complete_rows.drop([NAN_var, target], axis=1)
    labels = complete_rows[NAN_var]

    x_train, x_val, y_train, y_val = train_test_split(
        features, labels, test_size=split, random_state=random_state, shuffle=True
    )
    return x_train, x_val, y_train, y_val

In [43]:
def fillNAN(T, T_NAN, model, NAN_var, target):
    """Fill the missing entries of ``NAN_var`` in ``T_NAN`` with predictions.

    The rows of ``T`` where ``NAN_var`` is missing are passed, without the
    ``NAN_var`` and ``target`` columns, to ``model.predict``. The predictions
    are written into the same rows of ``T_NAN``.

    Parameters
    ----------
    T : pandas.DataFrame
        Table used to locate the missing rows and to build the features.
    T_NAN : pandas.DataFrame
        Table that receives the predicted values. It is modified in place
        and also returned.
    model : object
        Fitted estimator exposing ``predict(features)``.
    NAN_var : str
        Name of the column to fill.
    target : str
        Name of the column to exclude from the features.

    Returns
    -------
    pandas.DataFrame
        ``T_NAN`` (the same object that was passed in).
    """
    # One mask serves both feature selection and the write-back.
    missing_mask = T[NAN_var].isna()

    # Nothing to impute: skip predict(), which would otherwise be called on a
    # zero-row frame (many estimators reject empty input).
    if not missing_mask.any():
        return T_NAN

    features = T[missing_mask].drop([NAN_var, target], axis=1)
    T_NAN.loc[missing_mask, NAN_var] = model.predict(features)

    return T_NAN

load the csv data

In [44]:
# Read train.csv, test.csv and sample_submission.csv (paths are relative to
# the current working directory) into three DataFrames.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [45]:
# Give the test rows a sentinel label (2) so train and test can be stacked
# into one table and told apart later through `failure == 2`.
test['failure'] = 2

# DataFrame.append was removed in pandas 2.0. pd.concat is the supported
# replacement and, like append's default, keeps each frame's own index
# (so index labels are NOT unique in `data`).
data = pd.concat([train, test])

# we can use .info() to see the non-null count
# data.info()

# Remove the `id` column from both the training table and the stacked table.
train = train.drop(['id'], axis=1)
data = data.drop(['id'], axis=1)

In [46]:
# Work on an independent copy of the stacked table
# (DataFrame.copy() makes a deep copy by default).
df = data.copy()

# Integer-encode the two categorical columns with pandas category codes.
for cat_col in ('product_code', 'attribute_0'):
    df[cat_col] = df[cat_col].astype('category').cat.codes

# Interaction feature: element-wise product of attribute_2 and attribute_3.
df['atr_2 * atr3'] = df['attribute_2'] * df['attribute_3']

# Columns left out of the feature set from here on.
df = df.drop(
    columns=[
        'attribute_1', 'attribute_2', 'attribute_3',
        'measurement_2', 'measurement_8', 'measurement_12',
        'measurement_15', 'measurement_16',
    ]
)
# we can use .info() to see the non-null count
# df.info()

# Second independent copy: this is the frame the imputed values are written to,
# while `df` keeps the original missing entries.
df_nan = df.copy()

dealing with measurement_4 column

In [47]:
# measurement_4: build the train/validation split (1% validation) from the
# fully observed rows.
x_train, x_val, y_train, y_val = dealing_with_missingData(
    T=df, NAN_var='measurement_4', target='failure', split=0.01
)

In [48]:
# Linear-regression baseline for imputing measurement_4.
# The former `.score(x_train, y_train)` call is gone: its result was neither
# stored nor shown (only a cell's last expression is displayed).
reg_measurement4 = LinearRegression().fit(x_train, y_train)
pred_reg = reg_measurement4.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_reg)

0.954559257221399

In [49]:
# CatBoost candidate for imputing measurement_4 (silent training, lr = 0.05).
cat_measurement4 = cb.CatBoostRegressor(verbose=False, learning_rate=0.05).fit(x_train, y_train)
pred_cat = cat_measurement4.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_cat)

0.691084406719754

In [50]:
# CatBoost reached the lower validation MSE of the two candidates, so its
# predictions fill the missing measurement_4 entries in df_nan.
df_nan = fillNAN(
    T=df, T_NAN=df_nan, model=cat_measurement4,
    NAN_var='measurement_4', target='failure',
)

dealing with measurement_11 column

In [51]:
# measurement_11: train/validation split (8% validation) on fully observed rows.
x_train, x_val, y_train, y_val = dealing_with_missingData(
    T=df, NAN_var='measurement_11', target='failure', split=0.08
)

In [52]:
# Linear-regression baseline for imputing measurement_11.
# The discarded `.score(x_train, y_train)` call was removed (its result was
# never stored or displayed).
reg_measurement11 = LinearRegression().fit(x_train, y_train)
pred_reg = reg_measurement11.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_reg)

2.292342662988236

In [53]:
# CatBoost candidate for imputing measurement_11 (silent training, lr = 0.04).
cat_measurement11 = cb.CatBoostRegressor(verbose=False, learning_rate=0.04).fit(x_train, y_train)
pred_cat = cat_measurement11.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_cat)

2.079140306426748

In [54]:
# CatBoost had the lower validation MSE, so it fills measurement_11 in df_nan.
df_nan = fillNAN(
    T=df, T_NAN=df_nan, model=cat_measurement11,
    NAN_var='measurement_11', target='failure',
)

dealing with measurement_5 column

In [55]:
# measurement_5: train/validation split (0.8% validation) on fully observed rows.
x_train, x_val, y_train, y_val = dealing_with_missingData(
    T=df, NAN_var='measurement_5', target='failure', split=0.008
)

In [56]:
# Linear-regression baseline for imputing measurement_5.
# The discarded `.score(x_train, y_train)` call was removed (its result was
# never stored or displayed).
reg_measurement5 = LinearRegression().fit(x_train, y_train)
pred_reg = reg_measurement5.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_reg)

0.8272093299077914

In [57]:
# CatBoost candidate for imputing measurement_5 (silent training, lr = 0.046).
cat_measurement5 = cb.CatBoostRegressor(verbose=False, learning_rate=0.046).fit(x_train, y_train)
pred_cat = cat_measurement5.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_cat)

0.5593132698297004

In [58]:
# CatBoost had the lower validation MSE, so it fills measurement_5 in df_nan.
df_nan = fillNAN(
    T=df, T_NAN=df_nan, model=cat_measurement5,
    NAN_var='measurement_5', target='failure',
)

dealing with measurement_7 column

In [59]:
# measurement_7: train/validation split (0.9% validation) on fully observed rows.
x_train, x_val, y_train, y_val = dealing_with_missingData(
    T=df, NAN_var='measurement_7', target='failure', split=0.009
)

In [60]:
# Linear-regression baseline for imputing measurement_7.
# The discarded `.score(x_train, y_train)` call was removed (its result was
# never stored or displayed).
reg_measurement7 = LinearRegression().fit(x_train, y_train)
pred_reg = reg_measurement7.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_reg)

0.7438138496406586

In [61]:
# CatBoost candidate for imputing measurement_7 (silent training, lr = 0.032).
cat_measurement7 = cb.CatBoostRegressor(verbose=False, learning_rate=0.032).fit(x_train, y_train)
pred_cat = cat_measurement7.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_cat)

0.556530797206808

In [62]:
# CatBoost had the lower validation MSE, so it fills measurement_7 in df_nan.
df_nan = fillNAN(
    T=df, T_NAN=df_nan, model=cat_measurement7,
    NAN_var='measurement_7', target='failure',
)

dealing with measurement_14 column

In [63]:
# measurement_14: train/validation split (1.5% validation) on fully observed rows.
x_train, x_val, y_train, y_val = dealing_with_missingData(
    T=df, NAN_var='measurement_14', target='failure', split=0.015
)

In [64]:
# Linear-regression baseline for imputing measurement_14.
# The discarded `.score(x_train, y_train)` call was removed (its result was
# never stored or displayed).
reg_measurement14 = LinearRegression().fit(x_train, y_train)
pred_reg = reg_measurement14.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_reg)

2.277716491015243

In [65]:
# LightGBM candidate for imputing measurement_14
# (min_data_in_leaf = 8, num_iterations = 80).
lite_measurement14 = LGBMRegressor(min_data_in_leaf=8, num_iterations=80).fit(x_train, y_train)
pred_lite = lite_measurement14.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_lite)





2.0991118621924887

In [66]:
# LightGBM had the lower validation MSE, so it fills measurement_14 in df_nan.
df_nan = fillNAN(
    T=df, T_NAN=df_nan, model=lite_measurement14,
    NAN_var='measurement_14', target='failure',
)

dealing with measurement_17 column

In [67]:
# measurement_17: train/validation split (2.5% validation) on fully observed rows.
x_train, x_val, y_train, y_val = dealing_with_missingData(
    T=df, NAN_var='measurement_17', target='failure', split=0.025
)

In [68]:
# Linear-regression baseline for imputing measurement_17.
# The discarded `.score(x_train, y_train)` call was removed (its result was
# never stored or displayed).
reg_measurement17 = LinearRegression().fit(x_train, y_train)
pred_reg = reg_measurement17.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_reg)

8554.058551899949

In [69]:
# CatBoost candidate for imputing measurement_17 (silent training, lr = 0.042).
cat_measurement17 = cb.CatBoostRegressor(verbose=False, learning_rate=0.042).fit(x_train, y_train)
pred_cat = cat_measurement17.predict(x_val)

# Validation mean squared error (this value is the cell output).
mean_squared_error(y_val, pred_cat)

4059.609139912901

In [70]:
# CatBoost had the lower validation MSE, so it fills measurement_17 in df_nan.
df_nan = fillNAN(
    T=df, T_NAN=df_nan, model=cat_measurement17,
    NAN_var='measurement_17', target='failure',
)

In [71]:
# Continue with the imputed table: from here on `df` refers to the same
# object as `df_nan`, the frame the fillNAN calls wrote their predictions to.
df = df_nan

# Optional check: .info() lists the non-null count of every column
# df.info()

Additional feature engineering

In [72]:
# Row filters apply to training rows only; every test row (failure == 2) is
# kept regardless.
# 1) df.isnull().sum(axis=1) counts the missing values per ROW: keep rows with
#    at most one missing value.
df = df[(df.isnull().sum(axis=1) <= 1) | (df.failure == 2)]

# 2) keep rows with loading < 300 and measurement_1 < 27.
df = df[(df['loading'] < 300) | (df['failure'] == 2)]
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[(df['measurement_1'] < 27) | (df['failure'] == 2)].copy()

# Interaction feature, built in two steps:
#   a) product of the shifted-and-scaled `loading` and `measurement_6`
#      (each is (x - min) / max, computed over all remaining rows);
#   b) that product multiplied by measurement_17.
# NOTE(review): the scaling divides by max rather than by (max - min);
# confirm this is intended.
df['loading * mesh6'] = ((df['loading'] - df['loading'].min()) / df['loading'].max()) * ((df['measurement_6'] - df['measurement_6'].min()) / df['measurement_6'].max())
df['loading * mesh6'] = df['measurement_17'] * df['loading * mesh6']

In [73]:
# Separate the stacked table again using the sentinel label:
# failure != 2 -> labelled training rows, failure == 2 -> test rows.
train_df = df[df.failure != 2]
test_df = df[df.failure == 2]

# Labels of the training rows.
y = train_df.failure.values

# Feature tables: the label column and product_code are not used as features.
X_res = train_df.drop(['failure', 'product_code'], axis=1)
x_test = test_df.drop(['failure', 'product_code'], axis=1)

# Mean-impute the remaining missing values. The imputer is fitted on the
# training features only and then reused for the test features, so both sets
# are filled with the same per-column means (previously a second imputer was
# fitted on the test rows, giving train and test different fill values).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_res)
X = imputer.transform(X_res)
X_test = imputer.transform(x_test)

# Stratified hold-out: 1% of the training rows are kept for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.01, random_state=42, shuffle=True, stratify=y
)

In [74]:
# mutual_info_classif estimates the mutual information between each feature
# and a discrete target. (It is already imported in the notebook's import
# cell, so it is not re-imported here.)

# Put the imputed training matrix back into a DataFrame to recover column names.
X_T = pd.DataFrame(data=X, columns=X_res.columns)

# The estimator is randomized; a fixed random_state makes the scores
# reproducible across runs.
MI_score = mutual_info_classif(X_T, y, random_state=42)
MI_score = pd.Series(MI_score, name="MI Scores", index=X_T.columns)

# Sort in descending order: most informative features first.
MI_score = MI_score.sort_values(ascending=False)

In [75]:
# Final model: a LightGBM regressor whose continuous output is used as the
# failure score. One hyper-parameter per line for easier tuning.
model = LGBMRegressor(
    max_depth=15,
    num_leaves=2,
    min_data_in_leaf=450,
    num_iterations=50,
    feature_fraction=0.8,
    bagging_freq=7,
    bagging_fraction=0.6,
    learning_rate=0.15,
)

Training

In [76]:
# Fit the configured LightGBM regressor on the training split. As the last
# expression of the cell, the returned estimator's repr is displayed.
model.fit(X_train, y_train)





LGBMRegressor(bagging_fraction=0.6, bagging_freq=7, feature_fraction=0.8,
              learning_rate=0.15, max_depth=15, min_data_in_leaf=450,
              num_iterations=50, num_leaves=2)

Evaluation

In [77]:
# Score the held-out validation rows and report the area under the ROC curve.
# NOTE(review): the validation split is only 1% of the training rows, so this
# estimate is likely to be noisy.
y_light_valid = model.predict(X_val)
roc_score_dummy = roc_auc_score(y_true=y_val, y_score=y_light_valid)

print(f"ROC Score = {roc_score_dummy : .4f}")

ROC Score =  0.6822


In [78]:
# Same metric on the rows the model was trained on, for comparison with the
# validation score.
y_light_train = model.predict(X_train)
roc_score_dummy = roc_auc_score(y_true=y_train, y_score=y_light_train)

print(f"ROC Score = {roc_score_dummy : .4f}")

ROC Score =  0.5953
