In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import argparse
import sys

In [10]:
# Emulate a command-line invocation so that argparse (next cell) sees the
# expected flags instead of the arguments the notebook kernel was started with.
sys.argv = [
    'hackathon_script.py',
    '--train-file', 'train.csv',
    '--test-file', 'test.csv',
    '--predictions-file', 'predictions.csv',
]

In [11]:
if __name__ == "__main__":
    # Command-line interface: three optional file paths, exposed as
    # args.train_file, args.test_file and args.predictions_file.
    # Every flag has a default so that an omitted flag never leaves the
    # corresponding attribute as None.
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-file', type=str, default='train.csv',
                        help='Path to the training file (train.csv)')
    parser.add_argument('--test-file', type=str, default='test.csv',
                        help='Path to the test file (test.csv)')
    parser.add_argument('--predictions-file', type=str, default='predictions.csv',
                        help='Path to save the predictions')
    args = parser.parse_args()


In [12]:
# Load the training set from the path given via --train-file (parsed into
# `args` above) instead of a hard-coded name; fall back to the conventional
# file name if the flag was omitted and the attribute is None.
train_data = pd.read_csv(args.train_file or 'train.csv')

In [13]:
# Labels of all float64 / int64 columns of the training frame.
numeric_columns = (
    train_data
    .select_dtypes(include=['float64', 'int64'])
    .columns
)

In [14]:
# Impute missing numeric values with the median of the column they belong to.
column_medians = train_data[numeric_columns].median()
train_data[numeric_columns] = train_data[numeric_columns].fillna(column_medians)

In [15]:
# Labels of all object-typed columns. This is captured while 'Target' still
# holds its raw values, so an object-typed label column would be listed too.
categorical_columns = (
    train_data
    .select_dtypes(include=['object'])
    .columns
)

In [16]:
# Fill gaps in every object column with its most frequent value; when mode()
# returns nothing for a column, the literal 'Unknown' is used instead.
for col in categorical_columns:
    mode_value = train_data[col].mode()
    replacement = 'Unknown' if mode_value.empty else mode_value.iloc[0]
    train_data[col] = train_data[col].fillna(replacement)

In [17]:
# Encoder for the 'Target' column; kept as a global because make_predictions
# later calls its inverse_transform to map predictions back to labels.
label_encoder = LabelEncoder()

In [18]:
# Replace the raw labels with integer codes.
# NOTE: this overwrites 'Target' in place, so re-running the cell refits the
# encoder on the already-encoded integers and inverse_transform would then
# return those integers rather than the original labels.
raw_target = train_data['Target']
train_data['Target'] = label_encoder.fit_transform(raw_target)

In [19]:
# Engineered feature: element-wise sum of the two area columns.
train_data['TotalCultivatedArea'] = train_data['CultivatedAreaSqft1'].add(
    train_data['FarmEquipmentArea']
)

In [20]:
# Label vector, then the feature matrix: every column of the training frame
# except 'UID' and 'Target'.
y = train_data['Target']
X = train_data.drop(['UID', 'Target'], axis=1)

In [21]:
# Hold out 20% of the rows for validation (seeded for reproducibility).
# NOTE(review): the split is not stratified although the search below uses
# StratifiedKFold - consider stratify=y once every class has >= 2 samples.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Base estimator handed to the hyper-parameter search; only the seed is fixed
# here, every other setting comes from param_dist or the library defaults.
rf_model = RandomForestClassifier(random_state=42)

In [23]:
# Candidate values sampled by the randomised search, one list per
# RandomForestClassifier hyper-parameter.
param_dist = dict(
    n_estimators=[100, 200, 300, 500],    # number of trees
    max_depth=[None, 10, 20, 30],         # maximum tree depth (None = unlimited)
    min_samples_split=[2, 5, 10],         # minimum samples needed to split a node
    min_samples_leaf=[1, 2, 4],           # minimum samples required at a leaf
    max_features=['sqrt', 'log2'],        # features considered at each split
    class_weight=['balanced', None],      # optional re-weighting for imbalanced classes
    bootstrap=[True, False],              # whether trees are built on bootstrap samples
)

In [24]:
# Five shuffled, seeded stratified folds used to score every candidate.
stratified_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomised hyper-parameter search: 10 settings drawn from param_dist,
# each scored by macro-averaged F1 across the folds.
random_search = RandomizedSearchCV(
    rf_model,
    param_dist,
    n_iter=10,
    scoring='f1_macro',
    cv=stratified_folds,
    n_jobs=-1,  # run the fits on all available cores
    verbose=2,
    random_state=42,
)


In [25]:
# Run the search on the training split only; the validation split stays
# untouched so it can be used for the hold-out score below.
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [26]:
# Keep the estimator of the best-scoring candidate; make_predictions uses this
# global to score the test file.
best_rf_model = random_search.best_estimator_

In [27]:
# Predict (encoded) labels for the held-out validation rows.
y_pred = best_rf_model.predict(X_val)

In [28]:
# Macro-averaged F1 on the validation split - the same metric the search optimised.
f1 = f1_score(y_true=y_val, y_pred=y_pred, average='macro')

In [29]:
# Show the hold-out macro-F1 as this cell's output.
f1

0.4354382860938036

In [30]:
def make_predictions(test_fname, predictions_fname):
    """Score a test CSV with the tuned forest and write a submission CSV.

    Uses notebook globals built in earlier cells: ``train_data``,
    ``numeric_columns``, ``categorical_columns``, ``X``, ``best_rf_model``
    and ``label_encoder``.

    Parameters
    ----------
    test_fname : str
        Path of the test CSV. It must contain ``UID`` and the raw columns the
        model's features are derived from.
    predictions_fname : str
        Path the two-column (``UID``, ``Target``) submission CSV is written to.

    Raises
    ------
    KeyError
        If the test file lacks a column the model was trained on.
    """
    test_data = pd.read_csv(test_fname)

    # The column lists were computed on train_data while it still contained
    # 'Target', so they may name columns a test file does not have. Restrict
    # them to what is actually present instead of failing with a KeyError.
    numeric_cols = [col for col in numeric_columns if col in test_data.columns]
    categorical_cols = [col for col in categorical_columns if col in test_data.columns]

    # Impute with statistics taken from the training frame for both column
    # kinds. Test-set medians would be inconsistent with the categorical
    # handling below and would leave NaN in a column that is entirely missing.
    if numeric_cols:
        train_medians = train_data[numeric_cols].median()
        test_data[numeric_cols] = test_data[numeric_cols].fillna(train_medians)
    for col in categorical_cols:
        mode = train_data[col].mode()  # Using train data's mode for test data
        fill_value = mode.iloc[0] if not mode.empty else 'Unknown'
        test_data[col] = test_data[col].fillna(fill_value)

    # Same engineered feature as in training.
    test_data['TotalCultivatedArea'] = test_data['CultivatedAreaSqft1'] + test_data['FarmEquipmentArea']

    # Select exactly the training features, in training order. This drops
    # 'UID' (and a stray 'Target' column, if any) and raises KeyError when a
    # feature is missing.
    test_features = test_data[list(X.columns)]

    test_predictions = best_rf_model.predict(test_features)
    # Map the integer class codes back to the original label values.
    test_predictions_labels = label_encoder.inverse_transform(test_predictions)

    submission = pd.DataFrame({
        'UID': test_data['UID'],
        'Target': test_predictions_labels
    })
    submission.to_csv(predictions_fname, index=False)
