# All required libraries are imported here for you.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Read the soil measurements CSV (path given relative to the working
# directory) into a DataFrame.
crops = pd.read_csv(filepath_or_buffer="soil_measures.csv")

# Exploratory checks on the dataset (uncomment to run):
# print(len(crops))
# print(crops.info())
# print(crops.isna().sum().sort_values())
# print(crops['crop'].value_counts())

# Predictors are the four selected columns; the target is the 'crop' column.
X, y = crops[['N', 'P', 'K', 'ph']], crops['crop']
print(X.shape, y.shape, sep="\n")

# Hold out 20% of the rows for evaluation. The split is stratified on the
# target and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)


# Fit one single-feature classifier per predictor column and record its
# weighted F1 score on the held-out split, keyed by column name.
features_dict = {}
# Iterate over the training frame's own columns instead of repeating the
# feature list, so this loop cannot drift from the columns selected for X.
for feature in X_train.columns:
    # `multi_class` is intentionally not passed: the parameter is deprecated
    # in recent scikit-learn releases, and the default (lbfgs solver) already
    # fits a multinomial model for targets with more than two classes.
    # NOTE(review): assumes `crop` has more than two classes - confirm.
    logreg = LogisticRegression(max_iter=1000)
    # Double brackets keep the single column as a 2-D DataFrame, which is the
    # input shape the estimator expects.
    logreg.fit(X_train[[feature]], y_train)
    y_pred = logreg.predict(X_test[[feature]])
    feature_performance = metrics.f1_score(y_test, y_pred, average='weighted')
    features_dict[feature] = feature_performance
    print(feature_performance)

# Take the (feature, score) pair with the highest score; when scores tie,
# the feature that was inserted first is kept.
best_feature, best_score = max(
    features_dict.items(),
    key=lambda name_and_score: name_and_score[1],
)

# Single-entry mapping from the winning feature to its score
best_predictive_feature = {best_feature: best_score}

# Report the winning feature together with its score
print(f"Best predictive feature: {best_predictive_feature}")
