In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from scipy.stats import randint

# Load the datasets with error handling for bad rows
operations_labels_train = pd.read_csv('/content/operations_labels_training.csv')

# Use on_bad_lines='skip' to skip problematic rows
telemetry_train = pd.read_csv('/content/telemetry_for_operations_training.csv', on_bad_lines='skip')

# Validation data is unlikely to have issues, but we can still add on_bad_lines just in case
telemetry_validation = pd.read_csv('/content/telemetry_for_operations_validation.csv', on_bad_lines='skip')

# Convert time columns to datetime
operations_labels_train['start_time'] = pd.to_datetime(operations_labels_train['start_time'], errors='coerce')
operations_labels_train['end_time'] = pd.to_datetime(operations_labels_train['end_time'], errors='coerce')
telemetry_train['create_dt'] = pd.to_datetime(telemetry_train['create_dt'], errors='coerce')
telemetry_validation['create_dt'] = pd.to_datetime(telemetry_validation['create_dt'], errors='coerce')

# Drop rows with invalid datetimes in the training set
telemetry_train = telemetry_train.dropna(subset=['create_dt'])

# Merge telemetry data with labels based on time intervals using a more efficient approach
telemetry_train = telemetry_train.merge(
    operations_labels_train,
    how='left',
    left_on=['mdm_object_name', 'create_dt'],
    right_on=['mdm_object_name', 'start_time'],
    suffixes=('', '_label')
)

# Now filter based on the time conditions
telemetry_train['operation_kind_id'] = telemetry_train.apply(
    lambda row: row['operation_kind_id'] if row['start_time'] <= row['create_dt'] <= row['end_time'] else None,
    axis=1
)

# Drop rows where no operation_kind_id could be assigned
telemetry_train = telemetry_train.dropna(subset=['operation_kind_id'])

# Convert operation_kind_id to integers
telemetry_train.loc[:, 'operation_kind_id'] = telemetry_train['operation_kind_id'].astype(int)

# Feature Engineering - Extract time-based features (hour, day of week)
telemetry_train['hour'] = telemetry_train['create_dt'].dt.hour
telemetry_train['dayofweek'] = telemetry_train['create_dt'].dt.dayofweek
telemetry_validation['hour'] = telemetry_validation['create_dt'].dt.hour
telemetry_validation['dayofweek'] = telemetry_validation['create_dt'].dt.dayofweek

# Drop datetime and non-numeric columns before imputation and scaling
telemetry_train_cleaned = telemetry_train.drop(columns=['create_dt', 'mdm_object_name', 'start_time', 'end_time'])

# Ensure only numeric columns are retained
telemetry_train_cleaned = telemetry_train_cleaned.select_dtypes(include=[np.number])

# Split the features and target variable
X_train_features = telemetry_train_cleaned.drop(columns=['operation_kind_id'])
y_train_target = telemetry_train_cleaned['operation_kind_id']

# Handling missing values for numeric columns only
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_features), columns=X_train_features.columns)

# Standardization of features
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_imputed), columns=X_train_imputed.columns)

# Feature Selection using RandomForest
rf_feature_selector = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
rf_feature_selector.fit(X_train_scaled, y_train_target)

# Select features based on importance
model = SelectFromModel(rf_feature_selector, prefit=True)
X_selected = model.transform(X_train_scaled)

# Prepare data for model training
X = pd.DataFrame(X_selected)
y = y_train_target

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Hyperparameter Tuning with RandomizedSearchCV for RandomForest
param_dist_rf = {
    'n_estimators': randint(50, 100),
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 5),
    'min_samples_leaf': randint(1, 3)
}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
random_search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_dist_rf, n_iter=5, cv=3, n_jobs=-1, scoring='f1_weighted', random_state=42)
random_search_rf.fit(X_train, y_train)

best_rf = random_search_rf.best_estimator_

# Model Ensemble - RandomForest and GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state=42)
voting_clf = VotingClassifier(estimators=[('rf', best_rf), ('gbc', gbc)], voting='soft', n_jobs=-1)
voting_clf.fit(X_train, y_train)

# Cross-validation using 3-fold KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42)
cv_scores = []
for train_index, test_index in kf.split(X):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    voting_clf.fit(X_train_fold, y_train_fold)
    y_pred_fold = voting_clf.predict(X_test_fold)
    cv_scores.append(f1_score(y_test_fold, y_pred_fold, average='weighted'))

print(f'Mean Cross-Validation F1 Score: {np.mean(cv_scores)}')

# Predict on the test set and evaluate the model
y_pred = voting_clf.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f'Weighted F1 Score on Test Set: {f1}')
print(classification_report(y_test, y_pred))

# Clean and preprocess the validation set
telemetry_validation_cleaned = telemetry_validation.drop(columns=['create_dt', 'mdm_object_name'])

# Ensure only numeric columns are retained and match the training set
telemetry_validation_cleaned = telemetry_validation_cleaned.select_dtypes(include=[np.number])

# Match the validation set columns to the training set
columns_to_keep = X_train_features.columns
telemetry_validation_cleaned = telemetry_validation_cleaned.reindex(columns=columns_to_keep, fill_value=0)

# Impute and scale the validation set
telemetry_validation_imputed = pd.DataFrame(imputer.transform(telemetry_validation_cleaned), columns=telemetry_validation_cleaned.columns)
telemetry_validation_scaled = pd.DataFrame(scaler.transform(telemetry_validation_imputed), columns=telemetry_validation_imputed.columns)

# Select features for the validation set
X_validation_selected = model.transform(telemetry_validation_scaled)

# Predicting on validation set
validation_preds = voting_clf.predict(X_validation_selected)

# Ensure the original validation set length (260111 rows)
submission = pd.DataFrame({
    'create_dt': telemetry_validation['create_dt'],
    'mdm_object_name': telemetry_validation['mdm_object_name'],
    'operation_kind_id': validation_preds
})

# Ensure the submission file has 260111 rows
assert submission.shape[0] == 260111, "Submission file does not contain the correct number of rows."

# Save the submission with Windows-friendly line terminators
submission.to_csv('submission.csv', index=False, lineterminator='\r\n')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  telemetry_train['hour'] = telemetry_train['create_dt'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  telemetry_train['dayofweek'] = telemetry_train['create_dt'].dt.dayofweek


Mean Cross-Validation F1 Score: 0.3836508836594053
Weighted F1 Score on Test Set: 0.6704197349826397
              precision    recall  f1-score   support

         0.0       0.69      0.54      0.61       136
         1.0       0.62      0.64      0.63       171
         2.0       0.61      0.63      0.62       167
         3.0       0.66      0.87      0.75       139
         5.0       0.78      0.68      0.73       226

    accuracy                           0.67       839
   macro avg       0.67      0.67      0.67       839
weighted avg       0.68      0.67      0.67       839



