In [9]:
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, TargetEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import sys


# train_input_file, train_labels_file, test_input_file, numerical_preprocessing, categorical_preprocessing, model_type, test_prediction_output_file = sys.argv[1:]

numerical_preprocessing = 'StandardScaler'
categorical_preprocessing = 'OneHotEncoding'
model_type = 'GradientBoostingClassifier'

# Load the datasets
values_df = pd.read_csv('data/training_set_values.csv')
labels_df = pd.read_csv('data/training_set_labels.csv')

pd.set_option('display.max_columns', None)

# Merge datasets on 'id'
data = values_df.merge(labels_df, on='id')

# Preprocess 'date_recorded' into more usable features
data['date_recorded'] = pd.to_datetime(data['date_recorded'])
data['year_recorded'] = data['date_recorded'].dt.year
data['month_recorded'] = data['date_recorded'].dt.month
data['day_recorded'] = data['date_recorded'].dt.day

# Drop original 'date_recorded' column
data.drop('date_recorded', axis=1, inplace=True)

# Automatically identify feature types
categorical_features = data.select_dtypes(include=['object', 'bool']).columns.drop('status_group')
numerical_features = data.select_dtypes(exclude=['object', 'bool']).columns.drop(['id', 'year_recorded', 'month_recorded', 'day_recorded'])


#preprocessing pipelines
numerical_transformers = []

#fill in missing values
numerical_transformers.append(('imputer', SimpleImputer(strategy='mean')))

#optional scaling
if numerical_preprocessing == 'StandardScaler':
    numerical_transformers.append(('scaler', StandardScaler()))



def convert_to_string(X):
    return X.astype(str)

categorical_transformers = []

categorical_transformers.append(('to_string', FunctionTransformer(convert_to_string)))
categorical_transformers.append(('imputer', SimpleImputer(strategy='constant', fill_value='missing')))

if categorical_preprocessing == 'OneHotEncoder':
    categorical_transformers.append(('encoder', OneHotEncoder()))
elif categorical_preprocessing == 'OrdinalEncoder':
    categorical_transformers.append(('encoder', OrdinalEncoder()))
elif categorical_preprocessing == 'TargetEncoder':
    categorical_transformers.append(('encoder', TargetEncoder()))


preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline(numerical_transformers), numerical_features),
    ('cat', Pipeline(categorical_transformers), categorical_features)
])


# Split features and labels
X = data.drop(['id', 'status_group'], axis=1)
y = data['status_group']

models = {
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'HistGradientBoostingClassifier': HistGradientBoostingClassifier(),
    'LogisticRegression': LogisticRegression(),
    'MLPClassifier': MLPClassifier()
}


model = models.get(model_type, RandomForestClassifier()) #defaults to random forest classifier

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])

# K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')

# print(f'Cross-validation scores: {cv_scores}')
# print(f'Average accuracy: {cv_scores.mean()}')

scores = []
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Fit the model (ensure your model pipeline is defined as 'model')
    pipeline.fit(X_train, y_train)
    
    # Predict and evaluate
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    scores.append(score)
    
    # Print the progress
    print(f"Fold {fold}: Accuracy = {score:.4f}")

# Print overall performance
print(f"Average Cross-Validation Score: {np.mean(scores):.4f}")

ValueError: could not convert string to float: 'Roman'