In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

# Load data
train_data = pd.read_csv('Train_samsung.csv')
test_data = pd.read_csv('Test_samsung_noclass.csv')

def preprocess_data(data):
    # Separate numeric and categorical columns
    numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = data.select_dtypes(include=['object']).columns

    # Handle numeric data
    numeric_imputer = SimpleImputer(strategy='median')
    data[numeric_columns] = numeric_imputer.fit_transform(data[numeric_columns])

    # Handle categorical data
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    data[categorical_columns] = categorical_imputer.fit_transform(data[categorical_columns])

    # Encode categorical variables
    le = LabelEncoder()
    for col in categorical_columns:
        data[col] = le.fit_transform(data[col].astype(str))

    # Feature engineering (example)
    if 'X6' in data.columns and 'X7' in data.columns:
        data['X6_X7_ratio'] = data['X6'] / (data['X7'] + 1)

    return data

# Preprocess data
X_train = preprocess_data(train_data.drop('Class', axis=1))
y_train = train_data['Class']

# Encode the target variable
le = LabelEncoder()
y_train = le.fit_transform(y_train)

X_test = preprocess_data(test_data)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature selection
selector = SelectKBest(f_classif, k=10)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Model selection and training
models = {
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier()
}

best_model = None
best_score = 0

for name, model in models.items():
    score = np.mean(cross_val_score(model, X_train_selected, y_train, cv=5))
    print(f"{name} CV Score: {score}")
    if score > best_score:
        best_score = score
        best_model = model

# Hyperparameter tuning for the best model
if isinstance(best_model, RandomForestClassifier):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
elif isinstance(best_model, XGBClassifier):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.01, 0.1, 0.3]
    }
else:  # LightGBM
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [-1, 10, 20, 30],
        'learning_rate': [0.01, 0.1, 0.3]
    }

grid_search = GridSearchCV(best_model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_selected, y_train)

# Final prediction
final_model = grid_search.best_estimator_
predictions = final_model.predict(X_test_selected)

# Convert predictions back to original labels
predictions = le.inverse_transform(predictions)

# Save predictions
result = pd.DataFrame({'prediction': predictions})
result.to_csv('improved_predictions.csv', index=False)
print("Improved predictions have been saved to improved_predictions.csv")

RandomForest CV Score: 0.7860441146155431
XGBoost CV Score: 0.7514120799835086
[LightGBM] [Info] Number of positive: 273, number of negative: 119
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 444
[LightGBM] [Info] Number of data points in the train set: 392, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.696429 -> initscore=0.830348
[LightGBM] [Info] Start training from score 0.830348
[LightGBM] [Info] Number of positive: 274, number of negative: 119
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 445
[LightGBM] [Info] Number of data points in the