# Data Pre-processing


In [403]:
# Data Loading

import pandas as pd
import numpy as np

# Read the competition training split into a DataFrame and preview its first
# rows to sanity-check the columns.
train_csv_path = '/kaggle/input/playground-series-s3e23/train.csv'
dataset = pd.read_csv(train_csv_path)
dataset.head()

Unnamed: 0,id,loc,v(g),ev(g),iv(g),n,v,l,d,i,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,0,22.0,3.0,1.0,2.0,60.0,278.63,0.06,19.56,14.25,...,17,1,1,0,16.0,9.0,38.0,22.0,5.0,False
1,1,14.0,2.0,1.0,2.0,32.0,151.27,0.14,7.0,21.11,...,11,0,1,0,11.0,11.0,18.0,14.0,3.0,False
2,2,11.0,2.0,1.0,2.0,45.0,197.65,0.11,8.05,22.76,...,8,0,1,0,12.0,11.0,28.0,17.0,3.0,False
3,3,8.0,1.0,1.0,1.0,23.0,94.01,0.19,5.25,17.86,...,4,0,2,0,8.0,6.0,16.0,7.0,1.0,True
4,4,11.0,2.0,1.0,2.0,17.0,60.94,0.18,5.63,12.44,...,7,0,2,0,7.0,6.0,10.0,10.0,3.0,False


In [404]:
# Dataset info
# Prints each column's name, non-null count and dtype, which is a quick way to
# spot missing values or unexpected types.

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101763 entries, 0 to 101762
Data columns (total 23 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 101763 non-null  int64  
 1   loc                101763 non-null  float64
 2   v(g)               101763 non-null  float64
 3   ev(g)              101763 non-null  float64
 4   iv(g)              101763 non-null  float64
 5   n                  101763 non-null  float64
 6   v                  101763 non-null  float64
 7   l                  101763 non-null  float64
 8   d                  101763 non-null  float64
 9   i                  101763 non-null  float64
 10  e                  101763 non-null  float64
 11  b                  101763 non-null  float64
 12  t                  101763 non-null  float64
 13  lOCode             101763 non-null  int64  
 14  lOComment          101763 non-null  int64  
 15  lOBlank            101763 non-null  int64  
 16  lo

In [405]:
# Class distribution of the target column: number of rows per label.
dataset['defects'].value_counts()

defects
False    78699
True     23064
Name: count, dtype: int64

In [406]:
# Make it a balanced dataset
#
# Undersample the non-defective rows down to the number of defective rows so
# that both labels are equally represented in `dataset`.

defected = dataset[dataset['defects'].eq(True)]
# Derive the sample size from the data instead of hard-coding the row count,
# and fix the seed so the same rows are drawn on every Restart & Run All.
not_defected = dataset[dataset['defects'].eq(False)].sample(
    n=len(defected),
    random_state=1,
)
dataset = pd.concat([not_defected, defected], axis=0)
dataset.shape

(46128, 23)

In [407]:
# peak to peak range
# np.ptp is applied to the whole frame here (all columns, including 'id' and
# 'defects'), giving one max-minus-min value that shows how far apart the raw
# column scales are.
np.ptp(dataset)

16846621.12

In [408]:
# The data needs to be standardized: the raw columns span very different
# ranges, so rescale every feature to zero mean and unit variance.

from sklearn.preprocessing import StandardScaler # Z-score normalization

# Target vector first, then the feature matrix without the label and row id.
y = dataset['defects']
X = dataset.drop(columns=['defects', 'id'])

# NOTE(review): the scaler is fitted on every row before the train/test split
# performed later in the notebook, so the held-out rows contribute to the
# scaling statistics.
scaler = StandardScaler()
X_norm = scaler.fit(X).transform(X)

print(f"Peak to Peak range\nBefore Standardizing {np.ptp(X)}\nAfter Standardizing {np.ptp(X_norm)}")

Peak to Peak range
Before Standardizing 16846621.12
After Standardizing 71.64387936912648


In [409]:
# Dimensionality Reduction
# NOTE(review): PCA is fitted on all rows of X_norm, i.e. before the
# train/test split performed later in the notebook.

from sklearn.decomposition import PCA

# Create a PCA object
pca = PCA(n_components=19)  # Number of components to retain

# Fit and transform the data
X_pca = pca.fit_transform(X_norm)  # X_norm is the standardized feature matrix


In [410]:
# Explained variance
# Per-component share of the variance, as reported by the fitted PCA.
explained_variance = pca.explained_variance_ratio_

# Principal components (eigenvectors)
# Neither variable is read by the later cells shown in this notebook; they are
# kept for inspection only.
principal_components = pca.components_


In [411]:
# Train Test Split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the label
# ratio the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

print(X_norm.shape, X_train.shape, X_test.shape)

(46128, 21) (36902, 19) (9226, 19)


# Model Training


In [412]:
from sklearn.linear_model import LogisticRegression

# The 'sag' solver shuffles the samples while optimizing, so pin its seed to
# make the fitted coefficients (and the metrics computed from them) repeatable
# across runs. The seed value matches the one used for the train/test split.
model = LogisticRegression(solver="sag", max_iter=1000, random_state=1)
model.fit(X_train, y_train)


In [413]:
# Evaluation
# Score the fitted model on the held-out split: overall accuracy plus the
# confusion matrix of true vs. predicted labels.

from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
# Store the result under its own name: assigning it to `confusion_matrix`
# would overwrite the imported function with an array for the rest of the
# session.
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy : ", acc)
print("Confusion Matrix : ", conf_matrix)

Accuracy :  0.7223065250379362
Confusion Matrix :  [[3727  886]
 [1676 2937]]


In [414]:
# Test set submission

# Read the competition test split. The 'id' column stays in `test_data` for
# the submission file and is removed from the feature frame given to the model.
test_csv_path = '/kaggle/input/playground-series-s3e23/test.csv'
test_data = pd.read_csv(test_csv_path)

test = test_data.drop(columns=['id'])
test.head()

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount
0,33.0,5.0,1.0,4.0,144.0,824.82,0.04,26.96,30.05,22636.74,...,1257.6,30,0,3,0,21.0,23.0,87.0,57.0,9.0
1,27.0,8.0,8.0,2.0,125.0,646.24,0.04,22.82,27.22,14482.46,...,804.58,23,0,2,0,18.0,19.0,70.0,49.0,15.0
2,130.0,11.0,7.0,10.0,545.0,3831.4,0.02,48.15,66.17,116160.08,...,6453.34,99,9,17,1,26.0,53.0,333.0,244.0,21.0
3,65.0,7.0,1.0,7.0,156.0,855.71,0.06,17.23,49.89,16135.47,...,896.42,45,8,10,0,15.0,26.0,88.0,60.0,13.0
4,22.0,3.0,1.0,3.0,52.0,238.42,0.1,9.6,26.7,2624.49,...,145.8,16,0,4,0,12.0,15.0,30.0,24.0,5.0


In [415]:
# Reuse the scaler and PCA objects fitted earlier (transform only, no
# refitting) so the test features go through the same mapping as the training
# features.
test_norm = scaler.transform(test)
test_pca = pca.transform(test_norm)

In [416]:
# Take column 1 of the predicted class probabilities -- presumably the
# probability of the positive (defect) label; verify against model.classes_.
positive_proba = model.predict_proba(test_pca)[:, 1]
# NOTE(review): rounding to one decimal leaves at most 11 distinct scores; if
# the submission is judged on a ranking or probability-based metric this
# discards information -- confirm the rounding is intentional.
defects = [round(proba, 1) for proba in positive_proba]

In [417]:
# Keep the test-set ids so they can be paired with the predictions.
submission = test_data['id']

In [418]:
# Turn both columns into plain Python lists before building the frame.
# `defects` was already produced by a list comprehension, so list() only
# makes a shallow copy of it here.
defects = list(defects)
submission = list(submission)

In [419]:
# Assemble the two-column submission table (id, predicted defect score) and
# preview its first rows.
submission_columns = {
    'id': submission,
    'defects': defects,
}
test_submission = pd.DataFrame(submission_columns)
test_submission.head()

Unnamed: 0,id,defects
0,101763,0.5
1,101764,0.4
2,101765,0.9
3,101766,0.7
4,101767,0.4


In [420]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
test_submission.to_csv('/kaggle/working/submission.csv', index = False)