In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Assumes 'event' is the target variable and 'step' / 'timestamp' are the features.
file_path = '/kaggle/input/simple-clean/train_events.csv'

# Load the dataset
df = pd.read_csv(file_path)

# Separate the target from the features; 'series_id' and 'night' are dropped
# rather than used as model inputs.
y = df['event']
X = df.drop(columns=['series_id', 'night', 'event'])

# Replace missing 'step' values with 0. The fitted imputer stays at module
# level because it is reused when the prediction features are prepared.
imputer = SimpleImputer(strategy='constant', fill_value=0)
X[['step']] = imputer.fit_transform(X[['step']])

# Parse 'timestamp' as timezone-aware UTC. Values that are missing or cannot
# be parsed become NaT (errors='coerce') and are then mapped to the Unix epoch.
# The filled result is assigned to a name instead of calling
# fillna(inplace=True) on a column selection, which is not guaranteed to
# write back into X.
unix_epoch = pd.to_datetime('1970-01-01T00:00:00+00:00', utc=True)
timestamps = pd.to_datetime(X['timestamp'], errors='coerce', utc=True).fillna(unix_epoch)

# Float seconds since the epoch. Dividing the elapsed time by a one-second
# Timedelta does not depend on the platform's default integer width or on the
# datetime column being stored at nanosecond resolution, unlike
# astype(int) / 10**9.
X['timestamp'] = (timestamps - unix_epoch) / pd.Timedelta(seconds=1)

# Encode the target labels as integers; `le` is kept so the mapping can be
# inverted later if the original labels are needed.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Handle class imbalance with SMOTE (oversampling) followed by
# RandomUnderSampler (undersampling).
smote = SMOTE(random_state=42)
rus = RandomUnderSampler(random_state=42)

# Resampled version of the FULL dataset. It is kept because the individual
# models and the ensemble further down are fitted on X_resampled / y_resampled.
X_resampled, y_resampled = smote.fit_resample(X, y_encoded)
X_resampled, y_resampled = rus.fit_resample(X_resampled, y_resampled)

# Split the REAL observations first, so the held-out set (X_test / y_test)
# contains no synthetic rows and no row that SMOTE interpolated from a
# held-out observation can end up in the training set.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Resample the training portion only; the test portion is left untouched.
X_train, y_train = smote.fit_resample(X_train, y_train)
X_train, y_train = rus.fit_resample(X_train, y_train)

# Base learners, all seeded with the same value so runs are repeatable.
gb_classifier = GradientBoostingClassifier(random_state=42)
xgb_classifier = XGBClassifier(random_state=42)
rf_classifier = RandomForestClassifier(random_state=42)

# Random-forest search space: three forest sizes crossed with three depth caps.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
)

# 3-fold, accuracy-scored grid search on the training split. fit() returns
# the search object itself, so the call can be chained onto the constructor.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
).fit(X_train, y_train)
best_rf_classifier = grid_search.best_estimator_

# Models to be fitted and inspected one by one: the tuned forest plus the
# two boosting models, which have only been constructed at this point.
classifiers = [best_rf_classifier, xgb_classifier, gb_classifier]

# Print columns in the training set
print("Columns in the training set:", X.columns)

# Load the dataset that predictions will be made on (here: the training events file)
train_file_path = '/kaggle/input/simple-clean/train_events.csv'
df_train_for_predictions = pd.read_csv(train_file_path)

# Drop the identifier and target columns so only candidate features remain
X_train_for_predictions = df_train_for_predictions.drop(columns=['series_id', 'night', 'event'])

# Fill missing 'step' values with the imputer that was already fitted on the
# training features (transform only, no refit).
X_train_for_predictions[['step']] = imputer.transform(X_train_for_predictions[['step']])

# Parse 'timestamp' as timezone-aware UTC. Values that are missing or cannot
# be parsed become NaT (errors='coerce') and are then mapped to the Unix epoch.
# The filled result is assigned to a name instead of calling
# fillna(inplace=True) on a column selection, which is not guaranteed to
# write back into the frame.
prediction_epoch = pd.to_datetime('1970-01-01T00:00:00+00:00', utc=True)
prediction_timestamps = pd.to_datetime(
    X_train_for_predictions['timestamp'], errors='coerce', utc=True
).fillna(prediction_epoch)

# Float seconds since the epoch. Dividing the elapsed time by a one-second
# Timedelta does not depend on the platform's default integer width or on the
# datetime column being stored at nanosecond resolution, unlike
# astype(int) / 10**9.
X_train_for_predictions['timestamp'] = (
    (prediction_timestamps - prediction_epoch) / pd.Timedelta(seconds=1)
)

# Keep exactly the columns the models were trained on, in the same order
X_train_subset = X_train_for_predictions[X.columns]

# Fit each individual model on the resampled data and show what it predicts
# for the prepared prediction features.
for model in classifiers:
    # fit() hands back the fitted estimator, so predict() can be chained on it
    y_pred_train = model.fit(X_resampled, y_resampled).predict(X_train_subset)

    # y_pred_train holds the latest model's predictions and can be used for
    # further analysis; here it is simply printed under the model's class name.
    model_name = type(model).__name__
    print(f'Training Predictions using {model_name}:\n{y_pred_train}')

# Soft-voting ensemble over the tuned random forest and the two boosting models
named_estimators = [
    ('rf', best_rf_classifier),
    ('xgb', xgb_classifier),
    ('gb', gb_classifier),
]
ensemble_classifier = VotingClassifier(estimators=named_estimators, voting='soft')

# Fit the ensemble on the resampled data, then predict for the prepared
# prediction features (fit() returns the fitted ensemble, so the calls chain).
y_pred_ensemble_train = (
    ensemble_classifier
    .fit(X_resampled, y_resampled)
    .predict(X_train_subset)
)

# y_pred_ensemble_train is available for further analysis; print it for now
print(f'Training Predictions using Ensemble:\n{y_pred_ensemble_train}')

# Build the submission table: one row per input row, pairing its series_id
# with the ensemble's (label-encoded) prediction.
submission_df = df_train_for_predictions[['series_id']].assign(
    predicted_event=y_pred_ensemble_train
)

# Write the submission file without the index column
submission_df.to_csv('/kaggle/working/submission_train.csv', index=False)


Columns in the training set: Index(['step', 'timestamp'], dtype='object')
Training Predictions using RandomForestClassifier:
[0 1 0 ... 1 0 0]
Training Predictions using XGBClassifier:
[0 1 0 ... 1 0 0]
Training Predictions using GradientBoostingClassifier:
[0 1 0 ... 1 1 1]
Training Predictions using Ensemble:
[0 1 0 ... 1 0 0]
