In [195]:
import ast
import bs4
import time
import json
import requests
import pandas as pd
import scipy as scipy
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
import matplotlib.pyplot as plt

import sklearn
from sklearn import preprocessing, linear_model, model_selection, metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, f1_score

%matplotlib inline

In [251]:
def load_dataset(file_name, label_column, categorical_cols=None, drop_cols=('Strain Name',)):
    """Read a CSV file and split it into a feature frame and a label series.

    Steps, in order:
      1. Read ``file_name`` with ``pd.read_csv``.
      2. Discard every column whose name starts with ``"Unnamed"``.
      3. Replace each column in ``categorical_cols`` with the integer codes
         produced by a fresh ``LabelEncoder``.
      4. Drop the columns listed in ``drop_cols``.
      5. Return everything except ``label_column`` as ``X`` and
         ``label_column`` itself as ``y``.

    Parameters
    ----------
    file_name : str or path-like
        CSV file to load.
    label_column : str
        Name of the column returned as ``y``.
    categorical_cols : iterable of str, optional
        Columns to label-encode. ``None`` (the default) selects the built-in
        list of terpene / feeling / negative / helps-with / flavor columns.
    drop_cols : iterable of str, optional
        Columns removed before the X / y split. Defaults to ``('Strain Name',)``.

    Returns
    -------
    (X, y) : tuple
        ``X`` is a copy of the remaining feature columns, ``y`` is the label column.

    Raises
    ------
    KeyError
        If ``label_column``, any categorical column or any column to drop is
        absent from the file, or if ``label_column`` is itself listed in
        ``drop_cols``.
    """
    default_categorical_cols = [
        'Top Terpene',
        'Feeling_1', 'Feeling_2', 'Feeling_3', 'Feeling_4', 'Feeling_5',
        'Negative_1', 'Negative_2', 'Negative_3', 'Negative_4', 'Negative_5',
        'Helps with_1', 'Helps with_2', 'Helps with_3', 'Helps with_4', 'Helps with_5',
        'Flavor_1', 'Flavor_2', 'Flavor_3',
    ]

    df = pd.read_csv(file_name)
    df = df.loc[:, ~df.columns.str.match("Unnamed")]

    if categorical_cols is None:
        categorical_cols = default_categorical_cols
    categorical_cols = list(categorical_cols)
    drop_cols = list(drop_cols)

    # Fail early, naming every absent column, instead of surfacing a bare
    # KeyError from somewhere in the middle of the encoding loop.
    required = [label_column, *categorical_cols, *drop_cols]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"columns not found in {file_name!r}: {missing}")
    if label_column in drop_cols:
        raise KeyError(f"label column {label_column!r} cannot also be dropped")

    for col in categorical_cols:
        # A separate encoder per column: codes are only meaningful within a column.
        df[col] = LabelEncoder().fit_transform(df[col])

    df = df.drop(columns=drop_cols)

    X = df.loc[:, df.columns != label_column].copy()
    y = df[label_column]

    return X, y

In [207]:
def split_to_train_and_test(X, y, test_ratio, rand_state):
    """Partition features and labels into train and test subsets.

    Delegates to ``train_test_split`` with ``test_size=test_ratio`` and
    ``random_state=rand_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` in that order.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_ratio,
        random_state=rand_state,
    )
    train_features, test_features, train_labels, test_labels = partitions
    return train_features, test_features, train_labels, test_labels

In [208]:

def scale_features(X_train, scale_type):
    """Fit a scaler on the training features and return it with the scaled data.

    Parameters
    ----------
    X_train : array-like
        Training features the scaler is fitted on.
    scale_type : str
        ``'minmax'`` selects ``MinMaxScaler(feature_range=(0, 1))``;
        ``'standard'`` selects ``StandardScaler()``.

    Returns
    -------
    (scaler, X_train_scaled) : tuple
        The fitted scaler and the result of ``scaler.fit_transform(X_train)``.

    Raises
    ------
    ValueError
        If ``scale_type`` is neither ``'minmax'`` nor ``'standard'``.
    """
    if scale_type == 'minmax':
        scaler = MinMaxScaler(feature_range=(0, 1))
    elif scale_type == 'standard':
        scaler = StandardScaler()
    else:
        # Without this branch an unknown value left `scaler` unassigned and
        # surfaced as an opaque UnboundLocalError on the next line.
        raise ValueError(
            f"unsupported scale_type {scale_type!r}; expected 'minmax' or 'standard'"
        )

    X_train_scaled = scaler.fit_transform(X_train)

    return scaler, X_train_scaled


In [209]:

def scale_test_features(X_test, scaler):
    """Transform held-out features with an already-fitted scaler.

    Only ``scaler.transform`` is called, so the scaler is not re-fitted here.

    Returns
    -------
    The value returned by ``scaler.transform(X_test)``.
    """
    return scaler.transform(X_test)

In [210]:
def train_classifier(X_train, y_train, max_iter=8000):
    """Fit a ``LogisticRegression`` model on the training data.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    max_iter : int, optional
        Iteration cap forwarded to ``LogisticRegression``. Defaults to 8000,
        the value this notebook previously hard-coded.
        NOTE(review): presumably raised so the solver converges on unscaled
        features — confirm before lowering.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    classification_model = LogisticRegression(max_iter=max_iter)
    classification_model.fit(X_train, y_train)
    return classification_model


In [211]:

def predict(classifier, X_test, y_test):
    """Return the classifier's predictions for ``X_test``.

    ``y_test`` is accepted but not used; it stays in the signature so
    existing callers keep working.

    Returns
    -------
    The value returned by ``classifier.predict(X_test)``.
    """
    return classifier.predict(X_test)


In [212]:
def evaluate_performance(y_test, y_predicted):
    """Score predictions against the true labels with micro-averaged F1.

    Returns
    -------
    The value of ``metrics.f1_score(y_test, y_predicted, average='micro')``.
    """
    return metrics.f1_score(y_test, y_predicted, average='micro')

In [252]:
# End-to-end run: load the data, split it once, then train and score the same
# classifier on three views of the features (raw, standard-scaled, min-max-scaled).
file_name = 'clean_df.csv'
category_col_name = 'Type'

X, y = load_dataset(file_name, category_col_name)
X_train, X_test, y_train, y_test = split_to_train_and_test(X, y, 0.3, 11)

# --- Variant 1: raw (unscaled) features ---
classification_model = train_classifier(X_train, y_train)
df_res = predict(classification_model, X_test, y_test)
# Re-attach the test index so predictions line up with y_test row by row.
y_pred_1st = pd.Series(df_res, index=X_test.index)
eval_res_1st = evaluate_performance(y_test, y_pred_1st)

# --- Variant 2: standard-scaled features (scaler fitted on the training split only) ---
standard_scaler, X_train_standard_scaled = scale_features(X_train, 'standard')
X_test_standard_scaled = scale_test_features(X_test, standard_scaler)
classification_standard_model = train_classifier(X_train_standard_scaled, y_train)
df_standard_res = predict(classification_standard_model, X_test_standard_scaled, y_test)
y_pred_1st_standard = pd.Series(df_standard_res, index=X_test.index)
eval_res_1st_standard = evaluate_performance(y_test, y_pred_1st_standard)

# --- Variant 3: min-max-scaled features (scaler fitted on the training split only) ---
minmax_scaler, X_train_minmax_scaled = scale_features(X_train, 'minmax')
X_test_minmax_scaled = scale_test_features(X_test, minmax_scaler)
classification_minmax_model = train_classifier(X_train_minmax_scaled, y_train)
df_minmax_res = predict(classification_minmax_model, X_test_minmax_scaled, y_test)
y_pred_1st_minmax = pd.Series(df_minmax_res, index=X_test.index)
eval_res_1st_minmax = evaluate_performance(y_test, y_pred_1st_minmax)

# Scores are printed in the order: raw, standard, min-max.
for score in (eval_res_1st, eval_res_1st_standard, eval_res_1st_minmax):
    print(score)



0.5854838709677419
0.5790322580645161
0.5935483870967742


In [68]:
# `predict` returns the raw prediction array, which has no `.head`; rebuild the
# Actual-vs-Predicted table from the test labels and the index-aligned predictions.
pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_1st}).head(20)

Unnamed: 0,Actual,Predicted
2706,Hybrid,Hybrid
1320,Hybrid,Hybrid
147,Hybrid,Hybrid
2324,Hybrid,Hybrid
1001,Hybrid,Hybrid
2438,Hybrid,Hybrid
999,Hybrid,Hybrid
2076,Indica,Indica
2380,Indica,Hybrid
2222,Sativa,Hybrid


In [69]:
# `df_res_standard` is never defined in this notebook (the pipeline cell names its
# result `df_standard_res`); build the Actual-vs-Predicted table for the
# standard-scaled model from the test labels and the index-aligned predictions.
pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_1st_standard}).head(20)

Unnamed: 0,Actual,Predicted
2706,Hybrid,Hybrid
1320,Hybrid,Hybrid
147,Hybrid,Hybrid
2324,Hybrid,Hybrid
1001,Hybrid,Hybrid
2438,Hybrid,Hybrid
999,Hybrid,Hybrid
2076,Indica,Indica
2380,Indica,Hybrid
2222,Sativa,Hybrid
