In [1]:
import random
random.seed(6547459473870189196)

In [2]:
import pandas as pd

df = pd.read_csv("./playground-series-s3e12/train.csv")
df.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1.025,5.4,703,23.6,394,4.18,0
2,2,1.009,6.13,371,24.5,159,9.04,0
3,3,1.021,4.91,442,20.8,398,6.63,1
4,4,1.021,5.53,874,17.8,385,2.21,1


In [3]:
df.shape

(414, 8)

In [4]:
X = df.iloc[:, 1:-1].values  # İlk sütun ve son sütun hariç bütün sütunlar
y = df.iloc[:, -1].values    # Son sütun

In [5]:
X.shape[1]

6

In [6]:
y.shape

(414,)

In [7]:
# Split dataset

from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
def data_split(X, y, type=random.randint(0, 2)):

    # Traditional Split
    if type == 0:
        print("Traditional splitted with proportion of 60-20-20")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)
        return X_train, X_val, X_test, y_train, y_val, y_test
    
    # K-Fold Split
    elif type == 1:
        print("5-Fold splitted")
        kf = KFold(n_splits=5, shuffle=True)
        splits = list(kf.split(X))
        return splits
    
    # Leave one out split
    elif type == 2:
        print("Leave one out splitted")
        loo = LeaveOneOut()
        splits = list(loo.split(X))
        return splits
    
    else:
        raise ValueError("Invalid value for type parameter. Type must be 0, 1 or 2.")



In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

def outlier_removal(X, y, type=random.randint(0, 2)):
    if type == 0:
        # Z-score method
        z_scores = np.abs(StandardScaler().fit_transform(X))
        threshold = 3
        rows, cols = np.where(z_scores > threshold)
        indices = np.unique(rows)
        X_new = np.delete(X, indices, axis=0)
        y_new = np.delete(y, indices, axis=0)
        print(f"Z-score method: Removed {len(X) - len(X_new)} outliers")
    
    elif type == 1:
        # Isolation Forest method
        clf = IsolationForest().fit(X)
        y_pred = clf.predict(X)
        indices = np.where(y_pred == -1)
        X_new = np.delete(X, indices, axis=0)
        y_new = np.delete(y, indices, axis=0)
        print(f"Isolation Forest method: Removed {len(X) - len(X_new)} outliers")
    
    elif type == 2:
        # Local Outlier Factor method
        clf = LocalOutlierFactor(n_neighbors=20, contamination='auto')
        y_pred = clf.fit_predict(X)
        indices = np.where(y_pred == -1)
        X_new = np.delete(X, indices, axis=0)
        y_new = np.delete(y, indices, axis=0)
        print(f"LOF method: Removed {len(X) - len(X_new)} outliers")
    
    return X_new, y_new

In [12]:
X_new, y_new = outlier_removal(X, y, 0)

Z-score method: Removed 14 outliers


In [13]:
missing_values_count = df.isna().sum()
print("Sütundaki toplam eksik değer sayısı:", missing_values_count)

Sütundaki toplam eksik değer sayısı: id         0
gravity    0
ph         0
osmo       0
cond       0
urea       0
calc       0
target     0
dtype: int64


In [14]:
X_train, X_val, X_test, y_train, y_val, y_test = data_split(X_new, y_new, type=0)

Traditional splitted with proportion of 60-20-20


In [16]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer

def scale_data(X_train, X_val, X_test, type=random.randint(0,4)):
    if type == 0:
        scaler = StandardScaler()
        print("StandardScaler selected.")
    elif type == 1:
        scaler = MinMaxScaler()
        print("MinMaxScaler selected.")
    elif type == 2:
        scaler = RobustScaler()
        print("RobustScaler selected.")
    elif type == 3:
        scaler = QuantileTransformer(output_distribution='normal')
        print("QuantileTransformer selected.")
    elif type == 4:
        scaler = PowerTransformer(method='yeo-johnson')
        print("PowerTransformer selected.")
    else:
        raise ValueError("Invalid type parameter")
        
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_val_scaled, X_test_scaled

In [17]:
X_train_scaled, X_val_scaled, X_test_scaled = scale_data(X_train, X_val, X_test, type=0)

StandardScaler selected.


In [18]:
# Handle Categorical features

df.dtypes

id           int64
gravity    float64
ph         float64
osmo         int64
cond       float64
urea         int64
calc       float64
target       int64
dtype: object

In [None]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFECV

def select_feature(X, y, type=random.randint(0, 5)):
    # type 0: Varyans Eşikleme Yöntemi
    if type == 0:
        sel = VarianceThreshold(threshold=(.8 * (1 - .8))) # Varyans eşikleme nesnesi oluştur
        X_new = sel.fit_transform(X) # Varyans eşikleme uygula ve öznitelikleri yeni bir değişkene atayın
        print(f"VarianceThreshold: Orjinal şekil = {X.shape}, Yeni şekil = {X_new.shape}")
    
    # type 1: SelectKBest Yöntemi
    elif type == 1:
        X_new = SelectKBest(f_classif, k=2).fit_transform(X, y) # SelectKBest ile öznitelik seçme
        print(f"SelectKBest: Orjinal şekil = {X.shape}, Yeni şekil = {X_new.shape}")

    # type 2: Recursive Feature Elimination with Cross-Validation (RFECV)
    elif type == 2:
        svc = LinearSVC(C=0.1, penalty="l1", dual=False) # LinearSVC sınıflandırıcısı oluştur
        rfecv = RFECV(estimator=svc, step=1, cv=5, scoring='accuracy') # RFECV nesnesi oluştur
        X_new = rfecv.fit_transform(X, y) # RFECV uygula ve öznitelikleri yeni bir değişkene atayın
        print(f"RFECV: Orjinal şekil = {X.shape}, Yeni şekil = {X_new.shape}")

    # type 3: SelectFromModel with Linear Support Vector Classification (LinearSVC)
    elif type == 3:
        lsvc = LinearSVC(C=1, penalty="l1", dual=False).fit(X, y) # LinearSVC sınıflandırıcısı oluştur ve eğitin
        model = SelectFromModel(lsvc, prefit=True) # SelectFromModel nesnesi oluştur
        X_new = model.transform(X) # SelectFromModel'i uygulayın ve öznitelikleri yeni bir değişkene atayın
        print(f"SelectFromModel with LinearSVC: Orjinal şekil = {X.shape}, Yeni şekil = {X_new.shape}")

    # type 4: SelectFromModel with ExtraTreesClassifier
    elif type == 4:
        clf = ExtraTreesClassifier(n_estimators=50) # ExtraTreesClassifier oluşturun
        clf = clf.fit(X, y) # ExtraTreesClassifier'i eğitin
        model = SelectFromModel(clf, prefit=True) # SelectFromModel nesnesi oluştur
        X_new = model.transform(X) #
        print(f"SelectFromModel with ExtraTreesClassifier: Orjinal şekil = {X.shape}, Yeni şekil = {X_new.shape}")

    # brute force

    return X_new, y

In [None]:
X_new, y_new = select_feature(X, y)

In [None]:
from imblearn.over_sampling import SMOTE

def generate_data(X, y):
    sm = SMOTE()
    X_res, y_res = sm.fit_resample(X, y)
    print(f"Orjinal şekil = {X.shape}, Yeni şekil = {X_res.shape}")
    return X_res, y_res

In [None]:
X, y = generate_data(X, y)

In [21]:
X_train_scaled.shape

(240, 6)

In [22]:
y_train

array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
      dtype=int64)

In [23]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

  from pandas import MultiIndex, Int64Index


In [24]:
model = XGBClassifier()
model.fit(X_train_scaled, y_train)





In [25]:
preds = model.predict(X_test_scaled)
accuracy_score(y_test, preds)

0.6875

In [26]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, preds)

0.6777777777777778

In [27]:
test_csv = pd.read_csv("./playground-series-s3e12/test.csv")

In [28]:
test_csv

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc
0,414,1.017,5.24,345,11.5,152,1.16
1,415,1.020,5.68,874,29.0,385,3.46
2,416,1.024,5.36,698,19.5,354,13.00
3,417,1.020,5.33,668,25.3,252,3.46
4,418,1.011,5.87,567,29.0,457,2.36
...,...,...,...,...,...,...,...
271,685,1.029,6.27,853,21.4,364,7.31
272,686,1.012,5.62,410,14.0,195,1.45
273,687,1.031,5.68,874,29.0,380,4.49
274,688,1.019,5.47,543,21.4,170,1.64
