In [1]:
!pip install -U imbalanced-learn
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from scipy.stats import skew, kurtosis





In [3]:
train = pd.read_csv("hacktrain.csv")
test = pd.read_csv("hacktest.csv")

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8000 non-null   int64  
 1   ID          8000 non-null   int64  
 2   class       8000 non-null   object 
 3   20150720_N  7440 non-null   float64
 4   20150602_N  6800 non-null   float64
 5   20150517_N  7200 non-null   float64
 6   20150501_N  7040 non-null   float64
 7   20150415_N  7520 non-null   float64
 8   20150330_N  6880 non-null   float64
 9   20150314_N  7280 non-null   float64
 10  20150226_N  6640 non-null   float64
 11  20150210_N  7360 non-null   float64
 12  20150125_N  6960 non-null   float64
 13  20150109_N  7120 non-null   float64
 14  20141117_N  6720 non-null   float64
 15  20141101_N  7600 non-null   float64
 16  20141016_N  6560 non-null   float64
 17  20140930_N  7200 non-null   float64
 18  20140813_N  7440 non-null   float64
 19  20140626_N  6400 non-null  

In [5]:
train.head()

Unnamed: 0.1,Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,water,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,1,2,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,3,4,water,58.0174,-1599.16,,-1052.63,,-1564.63,,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,4,5,water,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,7,8,water,1136.44,,,1647.83,1935.8,,2158.98,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [6]:
X_train_raw = train.drop(columns=["ID", "Unnamed: 0", "class"])
y_train = train["class"]
X_test_raw = test.drop(columns=["ID", "Unnamed: 0"])
test_ids = test["ID"]

In [7]:
imputer = KNNImputer(n_neighbors=5)
X_train_imputed = imputer.fit_transform(X_train_raw)
X_test_imputed = imputer.transform(X_test_raw)


In [8]:
def extract_features(data):
    df = pd.DataFrame(data)
    features = pd.DataFrame()

    # Basic stats
    features["mean"] = df.mean(axis=1)
    features["std"] = df.std(axis=1)
    features["max"] = df.max(axis=1)
    features["min"] = df.min(axis=1)
    features["range"] = features["max"] - features["min"]
    features["median"] = df.median(axis=1)

    # Trend & shape
    features["slope"] = df.apply(lambda row: np.polyfit(range(len(row)), row, 1)[0], axis=1)
    features["skew"] = df.apply(skew, axis=1)
    features["kurtosis"] = df.apply(kurtosis, axis=1)

    # Seasonal mean
    features["first_half_mean"] = df.iloc[:, :14].mean(axis=1)
    features["second_half_mean"] = df.iloc[:, 14:].mean(axis=1)

    # Frequency (Fourier)
    def fft_energy(row):
        fft_vals = np.fft.fft(row)
        return np.abs(fft_vals[1]) + np.abs(fft_vals[2])
    features["fft_energy"] = df.apply(fft_energy, axis=1)

    return features


In [9]:
X_train_feat = extract_features(X_train_imputed)
X_test_feat = extract_features(X_test_imputed)

In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_feat)
X_test_scaled = scaler.transform(X_test_feat)

In [11]:
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)


In [12]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_balanced)
X_test_poly = poly.transform(X_test_scaled)


In [13]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_balanced), y=y_train_balanced)
weight_dict = dict(zip(np.unique(y_train_balanced), class_weights))

model = LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='multinomial', class_weight=weight_dict)
model.fit(X_train_poly, y_train_balanced)




In [14]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train_poly, y_train_balanced, cv=cv, scoring='accuracy')
print(f"Cross-validated Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")





Cross-validated Accuracy: 0.8575 ± 0.0020


In [15]:
test_preds = model.predict(X_test_poly)
submission = pd.DataFrame({
    "ID": test_ids,
    "class": test_preds
})
submission.to_csv("submission_advanced.csv", index=False)
print(" submission_advanced.csv'")

 submission_advanced.csv'
