In [1]:
import pandas as pd
import scipy as scipy
import numpy as np

import sklearn
from sklearn import preprocessing, linear_model, model_selection, metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, f1_score


In [2]:
def load_dataset(file_name, label_column, no_hybrid=False):
    """Read the strain CSV and split it into a feature frame and a label series.

    Processing steps, in order:

    1. Read ``file_name`` with ``pd.read_csv`` and discard every column whose
       name begins with ``"Unnamed"``.
    2. Replace each categorical column listed in ``categorical_cols`` with the
       integer codes produced by a fresh ``LabelEncoder`` fitted on that
       column alone (codes are therefore not shared between columns).
    3. Drop the ``'Strain Name'`` column.
    4. If ``no_hybrid`` is truthy, remove the rows whose ``'Type'`` value is
       ``'Hybrid'``.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file to load.
    label_column : str
        Name of the column to use as the prediction target.
    no_hybrid : bool, default False
        When true, rows with ``Type == 'Hybrid'`` are excluded.

    Returns
    -------
    X : pandas.DataFrame
        Copy of every remaining column except ``label_column``.
    y : pandas.Series
        The ``label_column`` values, aligned with ``X`` by index.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the CSV.
    """
    df = pd.read_csv(file_name)

    # Presumably leftover index columns from an earlier ``to_csv`` call;
    # anything whose name starts with "Unnamed" is not a real feature.
    df = df.loc[:, ~df.columns.str.match("Unnamed")]

    categorical_cols = [
        'Top Terpene',
        'Feeling_1', 'Feeling_2', 'Feeling_3', 'Feeling_4', 'Feeling_5',
        'Negative_1', 'Negative_2', 'Negative_3', 'Negative_4', 'Negative_5',
        'Helps with_1', 'Helps with_2', 'Helps with_3', 'Helps with_4', 'Helps with_5',
        'Flavor_1', 'Flavor_2', 'Flavor_3',
    ]

    # Encoding happens before the optional Hybrid filter, so the integer codes
    # are identical whether or not Hybrid rows are kept.
    for col in categorical_cols:
        df[col] = LabelEncoder().fit_transform(df[col])

    df = df.drop(columns=['Strain Name'])

    # Plain truthiness test instead of comparing against the True singleton.
    if no_hybrid:
        df = df[df['Type'] != 'Hybrid']

    X = df.loc[:, df.columns != label_column].copy()
    y = df[label_column]

    return X, y

In [3]:
def split_to_train_and_test(X, y, test_ratio, rand_state):
    """Partition features and labels into a training set and a test set.

    Delegates to ``train_test_split``; ``test_ratio`` is forwarded as
    ``test_size`` and ``rand_state`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X,
        y,
        test_size=test_ratio,
        random_state=rand_state,
    )
    features_train, features_test, labels_train, labels_test = parts
    return features_train, features_test, labels_train, labels_test

In [4]:

def scale_features(X_train, scale_type):
    """Fit a feature scaler on the training data and return it with the result.

    Parameters
    ----------
    X_train : array-like
        Training features the scaler is fitted on.
    scale_type : str
        ``'minmax'`` selects ``MinMaxScaler(feature_range=(0, 1))``;
        ``'standard'`` selects ``StandardScaler()``.

    Returns
    -------
    scaler
        The fitted scaler, to be reused on the test split.
    X_train_scaled
        The output of ``scaler.fit_transform(X_train)``.

    Raises
    ------
    ValueError
        If ``scale_type`` is neither ``'minmax'`` nor ``'standard'``.
    """
    if scale_type == 'minmax':
        scaler = MinMaxScaler(feature_range=(0, 1))
    elif scale_type == 'standard':
        scaler = StandardScaler()
    else:
        # Without this branch an unknown value left ``scaler`` unbound and the
        # call below failed with an opaque UnboundLocalError.
        raise ValueError(
            f"Unknown scale_type {scale_type!r}; expected 'minmax' or 'standard'"
        )

    X_train_scaled = scaler.fit_transform(X_train)

    return scaler, X_train_scaled


In [5]:

def scale_test_features(X_test, scaler):
    """Apply an already-fitted scaler to the test features.

    Only ``scaler.transform`` is called, so the scaler is not re-fitted here.

    Returns
    -------
    Whatever ``scaler.transform(X_test)`` produces.
    """
    return scaler.transform(X_test)

In [6]:
def train_classifier(X_train, y_train):
    """Fit a logistic-regression classifier on the training split.

    The estimator is created with ``max_iter=9000`` and otherwise default
    settings.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    model = LogisticRegression(max_iter=9000)
    model.fit(X_train, y_train)
    return model


In [7]:

def predict(classifier, X_test, y_test=None):
    """Return the classifier's predictions for ``X_test``.

    Parameters
    ----------
    classifier
        Any fitted object exposing a ``predict(X)`` method.
    X_test : array-like
        Features to predict on.
    y_test : optional
        Unused. Kept (now optional) so existing three-argument calls keep
        working; prediction never needs the true labels.

    Returns
    -------
    The value returned by ``classifier.predict(X_test)``.
    """
    return classifier.predict(X_test)


In [8]:
def evaluate_performance(y_test, y_predicted):
    """Score predictions against the true labels with micro-averaged F1.

    NOTE(review): per the scikit-learn docs, micro-averaged F1 over all labels
    of a single-label multiclass problem equals plain accuracy; confirm this
    is the metric intended.

    Returns
    -------
    float
        ``metrics.f1_score(..., average='micro')`` for the given labels.
    """
    return metrics.f1_score(y_true=y_test, y_pred=y_predicted, average='micro')

In [10]:
## Train and Predict - All data ##
# Trains three logistic-regression models on the full dataset (raw, standard-
# scaled and min-max-scaled features) and prints the micro-F1 of each on the
# same held-out test split.

file_name = 'CSVs/clean_df(post EDA).csv'
category_col_name = 'Type'

# --- Load and split -------------------------------------------------------
X, y = load_dataset(file_name, category_col_name)

# test_ratio=0.3, rand_state=14 (fixed seed keeps the split reproducible).
X_train, X_test, y_train, y_test = split_to_train_and_test(X, y, 0.3, 14)

# --- Scale: fit on the training split only, then reuse on the test split ---
standard_scaler, X_train_standard_scaled = scale_features(X_train, 'standard')
minmax_scaler, X_train_minmax_scaled = scale_features(X_train, 'minmax')
X_test_standard_scaled = scale_test_features(X_test, standard_scaler)
X_test_minmax_scaled = scale_test_features(X_test, minmax_scaler)

# --- Train one model per feature representation ---------------------------
classification_model = train_classifier(X_train, y_train)
classification_standard_model = train_classifier(X_train_standard_scaled, y_train)
classification_minmax_model = train_classifier(X_train_minmax_scaled, y_train)

# --- Predict --------------------------------------------------------------
df_res = predict(classification_model, X_test, y_test)
df_standard_res = predict(classification_standard_model, X_test_standard_scaled, y_test)
df_minmax_res = predict(classification_minmax_model, X_test_minmax_scaled, y_test)

# --- Evaluate -------------------------------------------------------------
# Predictions are re-wrapped as Series carrying X_test's index so they line
# up with y_test row by row.
y_pred_1st = pd.Series(df_res, index=X_test.index)
eval_res_1st = evaluate_performance(y_test, y_pred_1st)

y_pred_1st_standard = pd.Series(df_standard_res, index=X_test.index)
eval_res_1st_standard = evaluate_performance(y_test, y_pred_1st_standard)

y_pred_1st_minmax = pd.Series(df_minmax_res, index=X_test.index)
eval_res_1st_minmax = evaluate_performance(y_test, y_pred_1st_minmax)

# --- Report (label typo "sclaing" corrected) ------------------------------
print(f"No scaling (F1 Score): {eval_res_1st}")
print(f"Standard scaling (F1 Score): {eval_res_1st_standard}")
print(f"MinMax scaling (F1 Score): {eval_res_1st_minmax}")



No sclaing (F1 Score): 0.6483870967741936
Standard scaling (F1 Score): 0.6435483870967742
MinMax scaling (F1 Score): 0.6435483870967742


In [11]:
## Train and Predict - No Hybrid ##
# Same experiment as the all-data run, but rows whose Type is 'Hybrid' are
# removed by load_dataset(no_hybrid=True) before splitting.

file_name = 'CSVs/clean_df(post EDA).csv'
category_col_name = 'Type'

# --- Load (without Hybrid rows) and split ---------------------------------
X, y = load_dataset(file_name, category_col_name, no_hybrid=True)

# test_ratio=0.3, rand_state=14 (fixed seed keeps the split reproducible).
X_train, X_test, y_train, y_test = split_to_train_and_test(X, y, 0.3, 14)

# --- Scale: fit on the training split only, then reuse on the test split ---
standard_scaler, X_train_standard_scaled = scale_features(X_train, 'standard')
minmax_scaler, X_train_minmax_scaled = scale_features(X_train, 'minmax')
X_test_standard_scaled = scale_test_features(X_test, standard_scaler)
X_test_minmax_scaled = scale_test_features(X_test, minmax_scaler)

# --- Train one model per feature representation ---------------------------
classification_model = train_classifier(X_train, y_train)
classification_standard_model = train_classifier(X_train_standard_scaled, y_train)
classification_minmax_model = train_classifier(X_train_minmax_scaled, y_train)

# --- Predict --------------------------------------------------------------
df_res = predict(classification_model, X_test, y_test)
df_standard_res = predict(classification_standard_model, X_test_standard_scaled, y_test)
df_minmax_res = predict(classification_minmax_model, X_test_minmax_scaled, y_test)

# --- Evaluate -------------------------------------------------------------
# Predictions are re-wrapped as Series carrying X_test's index so they line
# up with y_test row by row.
y_pred_1st = pd.Series(df_res, index=X_test.index)
eval_res_1st = evaluate_performance(y_test, y_pred_1st)

y_pred_1st_standard = pd.Series(df_standard_res, index=X_test.index)
eval_res_1st_standard = evaluate_performance(y_test, y_pred_1st_standard)

y_pred_1st_minmax = pd.Series(df_minmax_res, index=X_test.index)
eval_res_1st_minmax = evaluate_performance(y_test, y_pred_1st_minmax)

# --- Report (label typo "sclaing" corrected) ------------------------------
print(f"No Hybrid - No scaling (F1 Score): {eval_res_1st}")
print(f"No Hybrid - Standard scaling (F1 Score): {eval_res_1st_standard}")
print(f"No Hybrid - MinMax scaling (F1 Score): {eval_res_1st_minmax}")



No Hybrid - No sclaing (F1 Score): 0.8469750889679716
No Hybrid - Standard scaling (F1 Score): 0.8469750889679716
No Hybrid - MinMax scaling (F1 Score): 0.8362989323843416
