In [1]:
import sys
import numpy as np

from sklearn.model_selection import train_test_split  # type: ignore
from sklearn.preprocessing import StandardScaler  # type: ignore
from sklearn.svm import SVC  # type: ignore
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score  # type: ignore
from imblearn.over_sampling import RandomOverSampler  # type: ignore

sys.path.append("../")
from Datasets.DataCreation import getDatasetV1, getDatasetV2, getDatasetV3  # type: ignore

In [2]:
df = getDatasetV3(None)
print(df.duplicated().sum())
df.drop_duplicates(inplace=True)
df.drop(columns=["year", "district"], inplace=True)

148916


In [3]:
# X = df.drop(columns=["has_ergot"])
# y = df["has_ergot"]

X = df.drop(columns=["incidence"])
y = df["incidence"]


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

print(y_train.value_counts())

# Oversampling
ros = RandomOverSampler(random_state=1)
X_train, y_train = ros.fit_resample(X_train, y_train)

print(y_train.value_counts())

incidence
True     3331
False     774
Name: count, dtype: int64
incidence
False    3331
True     3331
Name: count, dtype: int64


In [5]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
X_train

Unnamed: 0,severity,soil_moisture_mean,avg_percnt_coarse_frag,avg_total_sand,avg_total_silt,avg_total_clay,avg_percnt_carbon,avg_calcium_ph,avg_proj_ph,avg_water_reten_0,...,avg_elec_cond,avg_percnt_wood,avg_water_holding_cap,avg_land_area,avg_water_area,mean_temp,mean_total_snow,mean_total_precip,mean_snow_on_grnd,mean_total_rain
0,0.000,0.172607,0.690591,10.608514,10.460087,7.514317,0.259585,2.069420,2.157495,13.648780,...,0.304799,-2.574322,1.137961,13087.528444,192.432890,6.008826,0.131742,0.475303,0.180871,0.343601
1,0.006,0.192501,0.048360,6.071709,4.136575,2.939626,0.264659,0.956594,0.981140,6.460404,...,0.114226,-1.206384,0.521358,4103.058734,110.367056,0.392329,0.000000,0.552603,12.706849,0.000000
2,0.000,0.204526,0.814580,10.151682,8.183592,5.607801,0.194739,1.715932,1.792265,11.239293,...,0.333493,-2.156050,0.905770,12770.267922,221.887176,4.070509,0.000000,0.949458,0.103789,0.000000
3,0.040,0.227223,1.485562,8.544626,9.731278,6.401610,1.687239,1.706077,1.850240,15.535636,...,0.019440,-2.115329,1.200398,17628.060624,450.470480,3.186520,0.053819,1.257928,1.328899,0.142217
4,0.012,0.235187,0.886749,5.456843,8.049377,6.355707,2.786215,1.526546,1.652509,16.022075,...,0.070922,-1.688563,1.240568,20684.945297,464.173316,0.858151,0.058679,1.064609,4.210781,0.085503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6657,0.000,0.225981,0.056900,7.443422,4.664707,3.409830,0.127013,1.114418,1.146940,7.307032,...,0.147618,-1.396616,0.602533,10613.589319,224.180662,3.535068,0.000000,1.460000,0.000000,0.000000
6658,0.000,0.211599,0.814580,10.151682,8.183592,5.607801,0.194739,1.715932,1.792265,11.239293,...,0.333493,-2.156050,0.905770,12770.267922,221.887176,4.051066,0.000000,1.058513,0.169534,0.000000
6659,0.000,0.221379,0.081000,6.448975,6.194975,6.908550,0.227088,1.428813,1.457647,9.304375,...,0.290675,-1.759725,0.809725,19982.807400,142.238975,3.756986,0.000000,1.391781,7.095890,0.000000
6660,0.000,0.222016,0.064457,4.404109,3.372019,2.584303,1.705210,0.925246,0.962430,8.283017,...,0.053011,-1.012090,0.599185,5508.617893,125.157639,0.325114,0.000000,0.832237,3.873288,0.000000


In [7]:
model = SVC(kernel="rbf", C=1)

In [8]:
model.fit(X_train_scaled, y_train)

In [9]:
y_pred = model.predict(X_test_scaled)

In [10]:
accuracy_score(y_test, y_pred)

0.6153846153846154

In [11]:
def result(y_test, y_pred):
    conf_matrix = confusion_matrix(y_test, y_pred)

    accuracy = (conf_matrix[0, 0] + conf_matrix[1, 1]) / np.sum(conf_matrix)
    print("Accuracy: ", accuracy)

    precision = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[0, 1])
    print("Precision: ", precision)

    recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
    print("Recall: ", recall)

    f1_score = 2 * (precision * recall) / (precision + recall)
    print("F1 Score: ", f1_score)

    auc_score = roc_auc_score(y_test, y_pred)
    print("AUC Score: ", auc_score)

In [12]:
kernals = ["linear", "poly", "rbf", "sigmoid"]
c_list = [0.1, 1, 10, 100]

for k in kernals:
    for c in c_list:
        print("--------------------------------------")
        print(f"Kernal = {k}, C = {c}")
        model = SVC(kernel=k, C=c)
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        result(y_test, y_pred)

--------------------------------------
Kernal = linear, C = 100
