In [8]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [9]:
# Load the soil measurements table from the working directory.
crops = pd.read_csv("soil_measures.csv")

# Build a per-column overview: name, dtype and number of non-missing entries.
# DataFrame.count() tallies non-null cells for each column.
summary = pd.DataFrame(
    dict(
        Column=crops.columns,
        DataType=crops.dtypes,
        NonNullCount=crops.count(),
    )
)
print(summary)

# List the distinct target labels in order of first appearance.
print("CropTypes", crops["crop"].unique())

     Column DataType  NonNullCount
N         N    int64          2200
P         P    int64          2200
K         K    int64          2200
ph       ph  float64          2200
crop   crop   object          2200
CropTypes ['rice' 'maize' 'chickpea' 'kidneybeans' 'pigeonpeas' 'mothbeans'
 'mungbean' 'blackgram' 'lentil' 'pomegranate' 'banana' 'mango' 'grapes'
 'watermelon' 'muskmelon' 'apple' 'orange' 'papaya' 'coconut' 'cotton'
 'jute' 'coffee']


In [10]:
# Target: the crop label, keeping only rows where it is present.
y = crops["crop"].dropna()

# Features: every other column, restricted to the same labelled rows.
X = crops.dropna(subset=["crop"]).drop(columns="crop")

# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

(2200, 4) (2200,)


In [11]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Maps a feature name to the score of the model trained on that feature alone.
feature_performance = dict()

In [13]:
# A single scaler object is shared by all iterations; fit_transform fits it
# again on the current feature's training column each time through the loop.
scaler = StandardScaler()

# Train one single-feature classifier per soil measurement and store its
# weighted F1 score on the held-out split under the feature's name.
for feature in ("N", "P", "K", "ph"):
    # Scaling statistics come from the training column only; the test column
    # is transformed with those same statistics.
    X_train_scaled = scaler.fit_transform(X_train[[feature]])
    X_test_scaled = scaler.transform(X_test[[feature]])

    log_reg = LogisticRegression(max_iter=500).fit(X_train_scaled, y_train)
    y_pred = log_reg.predict(X_test_scaled)

    feature_performance[feature] = metrics.f1_score(
        y_true=y_test,
        y_pred=y_pred,
        average="weighted",
    )

In [14]:
# Select the feature whose stored score is highest; on a tie the feature
# inserted first wins, because max() keeps the first maximal item it sees.
best_feature = max(feature_performance.items(), key=lambda item: item[1])[0]
print(best_feature)

K
