<a href="https://colab.research.google.com/github/Denko-Sekka/Colab/blob/master/Interactive%20Charts/interactive_charts_SEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
## Install utilities
# !pip install pandas
# !pip install numpy
# !pip install tensorflow
# !pip install matplotlib
# !pip install altair
# !pip install sklearn

# # Update utilities
# !pip install -q pandas
# !pip install -q numpy
# !pip install -q tensorflow
# !pip install -q matplotlib
# !pip install -q altair
# !pip install -q sklearn

In [2]:
import pandas as pd
import numpy as np
print(pd.__version__)

## allows intellisense-like feel to the notebook
%config IPCompleter.greedy=True
##  whitespace cleaning
import re
# import webcolors ## package not in colab, need to check if supportable
from IPython.display import Image
##  histogram plotting
import matplotlib.pyplot as plt
##  3d plotting
from mpl_toolkits.mplot3d import Axes3D
## Used by matplotlib to remove duplicate legend handles
from collections import OrderedDict

## used for loading files
from google.colab import files

## used for loading files from google drive
from google.colab import drive


## used for interactive charts
import altair as alt

## converts the dictionary from google.colab's upload file into a stream
import io

0.25.3


# Choose File upload to Colab. 1. Upload file from computer. 2. Upload file from google drive.

In [3]:
# # 1 Upload from file. Uncomment the code
# uploaded = files.upload() ## prompts csv upload
# print(type(uploaded[list(uploaded.keys())[0]]))

# # the files are store as single entry dictionary
# # to convert the file to a csv file and fed into read_csv, the values must be read.
# # python dictionary keys/items does not allow indexing normally, you need import iteritems or convert it.
# # Take the latter option so we convert it to a list and take the first and only entry.
# # This value is class 'byte' it needs to be converted into an object that is readable by csv_read
# # we convert it from bytes class to object using io.BytesIO and then fed to csv to be converted into a dataframe
# frame = pd.read_csv(io.BytesIO(uploaded[list(uploaded.keys())[0]]))

# ***********************************************************************************************************

# 2 Upload file from google drive. Uncomment the code
## follow the instructions and authenticate the google drive
drive.mount('/content/gdrive')

## From the left side tab of google colab, obtain the path to the csv
## Simply right click the csv file and copy path. Paste into csv_path
csv_path = ['/content/gdrive/My Drive/BlueberryProject/csv files/SoftSpotCamposol MachineA SOFT_Whole.csv']
frame = pd.DataFrame()
for path in csv_path:
  temp_frame = pd.read_csv(path)
  frame = frame.append(temp_frame)
# frame = pd.read_csv(csv_path)
## if everything works, it should show the content
print(frame.head())
##  change the indexing to include column1(filenames)
# frame = frame.set_index([frame.index,frame.columns[0]])

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
              FileName  Red_Mean  ...  SOFT2_Max  Blueberry Has Split Truth
0  180830 100911 0.gsi    25.333  ...    113.639                      False
1  180830 100920 8.gsi    23.021  ...    117.250                      False
2  180830 100930 5.gsi    26.378  ...    125.146                      False
3  180830 100933 6.gsi    23.953  ...    121.310                      False
4  180830 100937 1.gsi    24.384  ...    119.091                      False

[5 rows x 38 columns]


In [4]:
## verify that it works
frame.head(1)

Unnamed: 0,FileName,Red_Mean,Green_Mean,Blue_Mean,Hue_Mean,Sat_Mean,Lum_Mean,IR_Mean,SOFT_Mean,SOFT2_Mean,Red_Median,Green_Median,Blue_Median,Hue_Median,Sat_Median,Lum_Median,IR_Median,SOFT_Median,SOFT2_Median,Red_Min,Green_Min,Blue_Min,Hue_Min,Sat_Min,Lum_Min,IR_Min,SOFT_Min,SOFT2_Min,Red_Max,Green_Max,Blue_Max,Hue_Max,Sat_Max,Lum_Max,IR_Max,SOFT_Max,SOFT2_Max,Blueberry Has Split Truth
0,180830 100911 0.gsi,25.333,24.755,30.148,171.014,30.243,27.518,72.95,103.799,109.702,25.658,24.702,30.516,170.452,30.607,27.754,73.091,104.243,111.615,21.804,20.834,24.981,168.616,26.497,23.075,68.335,97.991,102.214,27.641,26.713,32.781,174.592,32.574,29.739,77.95,108.083,113.639,False


# HELPER FUNCTIONS

In [0]:
def complexCharts(frame: pd.DataFrame, col : 'List[string]') -> list:
  '''
  What it does: It feeds the selected(samples inside the selected x-interval) samples from chart1 to chart2,
  chart 2 to chart 3, ... so on and so forth, this effectively culls to how the user wishes.

  What it's for: Threshold based classification modeling -> you have a data set is already classified as class 1, 2, 3, 4...,etc
  presumambly the class is the last column.

  How this works: First, the brushes can be passed to the charts as boolean operations
  For example:
  
  '''
  if(frame is None):
    print('Brushcount is 0 or frame is null, returning')
    return

  _maxlen = len(col)
  brush_lst = []
  chart_lst = []
  _classification = frame.columns[-1]
  # Base chart for Matrix
  fp = alt.Chart(frame[frame[_classification] == 0]).mark_text().encode(
    y=alt.Y('count:O',axis=None)
  )

  tp = alt.Chart(frame[frame[_classification] == 1]).mark_text().encode(
    y=alt.Y('count:O',axis=None)
  )
    
  for i in range(_maxlen):
    brush_lst.append(alt.selection_interval(encodings=['x']))
  chart_lst.append(alt.Chart(frame).mark_circle(opacity = 0.2, size = 100).encode(
    x = col[0] + ':Q',
    y = alt.Y(_classification + ':N', title = 'SoftSpot Truth'),
    color = alt.condition(brush_lst[0], _classification+':N', alt.value('lightgray'), legend = None),
  ))
  chart_lst[0] = chart_lst[0].add_selection(
    brush_lst[0]
  )

  pbrush_lst = [brush_lst[0]]
  for i in range(1, _maxlen):
    pbrush_lst.append(pbrush_lst[i-1] & brush_lst[i])
    chart_lst.append(alt.Chart(frame).mark_circle(opacity = 0.2, size = 100).encode(
    x = col[i] + ':Q',
    y = alt.Y(_classification+':N', title = 'SoftSpot Truth'),
    color = alt.condition(pbrush_lst[i], _classification+':N', alt.value('lightgray'), legend = None),
  ).add_selection(
    brush_lst[i]
  ).transform_filter(
      pbrush_lst[i-1]
  ))
  
  for i in range(_maxlen):
    chart_lst[i] = chart_lst[i] | (fp.encode(text = 'count():N', key = _classification).properties(title='False Count').transform_filter(pbrush_lst[i])
  & tp.encode(text = 'count():N', key = _classification).properties(title='True Count').transform_filter(pbrush_lst[i])
  )
  
  return chart_lst

def removeColumn(frame : pd.DataFrame, columnsToRemove : 'List[string]') -> pd.DataFrame:
  '''
  Peturns a dataframe without the listed columns.
  '''
  if frame is None:
    print('Frame is null')
    return None
  if columnsToRemove is None or len(columnsToRemove) == 0:
    return frame
  for c in columnsToRemove:
      if c in frame.columns:
          frame = frame.drop(c, 1)
  return frame

In [6]:
import math as math
## Add column names here to remove them
columnsToRemove = []

frame.head(2)
# append new features to the end of the frame
frame['Blueberry Has Split Truth'] = frame['Blueberry Has Split Truth']*1 # Convert T/F to 1/0
frame['SOFT2_MinMaxMean'] = (frame['SOFT2_Max'].subtract(frame['SOFT2_Min']))
frame['SOFT2_MinMaxRatio'] = frame['SOFT2_Mean'].multiply(frame['SOFT2_Max'].divide(frame['SOFT2_Min']))
frame['SOFT2_MeanXMin'] = frame['SOFT2_Mean'].add(frame['SOFT2_Min'])
# frame['SatLogSOFT2'] = [math.log(frame['SOFT2_Max'][i],frame['Sat_Max'][i]) for i in range(len(frame['Blueberry Has Split Truth']))]
frame['SatLogSOFT2'] = frame['SOFT2_Mean'].divide(frame['Sat_Median'])
# frame['Soft2IRRatio'] = [math.log(frame['SOFT2_Mean'][i],frame['IR_Mean'][i]) for i in range(len(frame['Blueberry Has Split Truth']))]
frame['Soft2IRRatio'] = frame['SOFT2_Mean'].divide(frame['IR_Mean'])
# frame['SOFT2_SatLogSOFT2'] = pd.DataFrame())
frame['Soft2RedRatio'] = frame['SOFT2_Mean'].divide(frame['Red_Mean'])
frame['Soft-Soft2abs'] = frame['SOFT_Mean'].subtract(frame['SOFT2_Mean'])

# Has Split Truth needs to be put back to the end of the columns, we pop it out and append it to the back end
frame['Blueberry Has Split Truth'] = frame.pop('Blueberry Has Split Truth')

## remove columns that were unused in the data
frame = removeColumn(frame, columnsToRemove)
frame.head()

Unnamed: 0,FileName,Red_Mean,Green_Mean,Blue_Mean,Hue_Mean,Sat_Mean,Lum_Mean,IR_Mean,SOFT_Mean,SOFT2_Mean,Red_Median,Green_Median,Blue_Median,Hue_Median,Sat_Median,Lum_Median,IR_Median,SOFT_Median,SOFT2_Median,Red_Min,Green_Min,Blue_Min,Hue_Min,Sat_Min,Lum_Min,IR_Min,SOFT_Min,SOFT2_Min,Red_Max,Green_Max,Blue_Max,Hue_Max,Sat_Max,Lum_Max,IR_Max,SOFT_Max,SOFT2_Max,SOFT2_MinMaxMean,SOFT2_MinMaxRatio,SOFT2_MeanXMin,SatLogSOFT2,Soft2IRRatio,Soft2RedRatio,Soft-Soft2abs,Blueberry Has Split Truth
0,180830 100911 0.gsi,25.333,24.755,30.148,171.014,30.243,27.518,72.95,103.799,109.702,25.658,24.702,30.516,170.452,30.607,27.754,73.091,104.243,111.615,21.804,20.834,24.981,168.616,26.497,23.075,68.335,97.991,102.214,27.641,26.713,32.781,174.592,32.574,29.739,77.95,108.083,113.639,11.425,121.963973,211.916,3.584213,1.503797,4.330399,-5.903,0
1,180830 100920 8.gsi,23.021,23.009,27.695,166.117,29.386,25.283,49.048,91.313,111.522,22.537,22.016,26.213,165.767,29.469,24.243,48.345,91.189,110.816,20.106,20.028,24.407,161.639,27.824,22.197,45.326,84.882,102.383,26.254,26.7,32.069,169.935,30.48,29.16,52.592,96.704,117.25,14.867,127.716071,213.905,3.784384,2.273732,4.844359,-20.209,0
2,180830 100930 5.gsi,26.378,24.485,29.689,181.0,28.833,27.411,93.908,117.089,113.831,25.253,23.099,28.197,180.862,28.756,25.874,92.283,115.099,113.162,23.956,21.755,26.343,175.028,27.708,24.478,86.357,105.203,100.464,30.319,28.62,34.471,187.385,30.698,31.853,104.208,127.625,125.146,24.682,141.797005,214.295,3.958513,1.212154,4.315376,3.258,0
3,180830 100933 6.gsi,23.953,22.663,26.924,171.362,27.83,25.111,75.283,104.683,110.388,23.767,22.423,26.938,171.16,27.228,25.026,74.92,103.086,109.419,21.431,19.632,23.08,166.242,25.532,21.868,70.703,95.191,98.992,26.384,25.373,29.824,176.114,30.276,27.848,79.462,112.574,121.31,22.318,135.275257,209.38,4.054209,1.466307,4.608525,-5.705,0
4,180830 100937 1.gsi,24.384,23.988,29.239,170.543,30.453,26.623,70.709,106.869,114.455,24.311,23.863,29.065,170.582,30.589,26.495,70.168,106.593,115.017,20.654,20.07,24.362,168.167,28.714,22.301,64.973,100.302,106.87,27.059,27.004,32.994,172.638,31.788,29.913,76.398,112.293,119.091,12.221,127.543374,221.325,3.741705,1.618677,4.693857,-7.586,0


In [7]:
select_n = frame.columns[1:len(frame.columns) - 1] ## select all the columns except if defect is true
print(select_n)
frame_n = frame[select_n] / frame[select_n].max(axis=0) ## normalize the values of every column
frame_n['Blueberry Has Split Truth'] = frame['Blueberry Has Split Truth']
print(frame_n)

Index(['Red_Mean', 'Green_Mean', 'Blue_Mean', 'Hue_Mean', 'Sat_Mean',
       'Lum_Mean', 'IR_Mean', 'SOFT_Mean', 'SOFT2_Mean', 'Red_Median',
       'Green_Median', 'Blue_Median', 'Hue_Median', 'Sat_Median', 'Lum_Median',
       'IR_Median', 'SOFT_Median', 'SOFT2_Median', 'Red_Min', 'Green_Min',
       'Blue_Min', 'Hue_Min', 'Sat_Min', 'Lum_Min', 'IR_Min', 'SOFT_Min',
       'SOFT2_Min', 'Red_Max', 'Green_Max', 'Blue_Max', 'Hue_Max', 'Sat_Max',
       'Lum_Max', 'IR_Max', 'SOFT_Max', 'SOFT2_Max', 'SOFT2_MinMaxMean',
       'SOFT2_MinMaxRatio', 'SOFT2_MeanXMin', 'SatLogSOFT2', 'Soft2IRRatio',
       'Soft2RedRatio', 'Soft-Soft2abs'],
      dtype='object')
     Red_Mean  Green_Mean  ...  Soft-Soft2abs  Blueberry Has Split Truth
0    0.913889    0.868901  ...      -1.186294                          0
1    0.830483    0.807617  ...      -4.061294                          0
2    0.951587    0.859424  ...       0.654743                          0
3    0.864105    0.795472  ...      -1.146503 

# The data frame is split into their respective categories: Mean, Median, Min, Max. To be used for visualization purposes.

In [0]:
## Separate each of the features into its smaller data frames for better visualization
Mean = pd.DataFrame(frame[['Red_Mean','Blue_Mean', 'Green_Mean', 'Hue_Mean', 'Sat_Mean', 'Lum_Mean', 'IR_Mean', 'SOFT_Mean', 'SOFT2_Mean', 'Blueberry Has Split Truth']], index = frame.index)
Median = pd.DataFrame(frame[['Red_Median', 'Blue_Median', 'Green_Median','Hue_Median','Sat_Median','Lum_Median','IR_Median','SOFT_Median','SOFT2_Median','Blueberry Has Split Truth']], index = frame.index)
Min = pd.DataFrame(frame[['Red_Min', 'Blue_Min', 'Green_Min', 'Hue_Min','Sat_Min','Lum_Min','IR_Min','SOFT_Min','SOFT2_Min', 'Blueberry Has Split Truth']], index = frame.index)
Max = pd.DataFrame(frame[['Red_Max', 'Blue_Max', 'Green_Max', 'Hue_Max','Sat_Max','Lum_Max','IR_Max','SOFT_Max','SOFT2_Max', 'Blueberry Has Split Truth']], index = frame.index)
Misc = pd.DataFrame(frame[['SOFT2_MeanXMin','SOFT2_MinMaxMean', 'SatLogSOFT2', 'Soft2IRRatio','Soft2RedRatio', 'Soft-Soft2abs',  'Blueberry Has Split Truth']], index = frame.index)

# Correlation statistics

In [0]:
# maskMean = np.triu(np.ones(Mean.corr().shape).astype(np.bool))
# corrMean = Mean.corr()

# maskMedian = np.triu(np.ones(Median.corr().shape).astype(np.bool))
# corrMedian = Median.corr()

# maskMin = np.triu(np.ones(Min.corr().shape).astype(np.bool))
# corrMin = Min.corr()

# maskMax = np.triu(np.ones(Max.corr().shape).astype(np.bool))
# corrMax = Max.corr()

# maskMisc = np.triu(np.ones(Misc.corr().shape).astype(np.bool))
# corrMisc = Misc.corr()

sorted_corr_mean = pd.DataFrame(Mean.corr()['Blueberry Has Split Truth'].iloc[Mean.corr()['Blueberry Has Split Truth'].abs().argsort()][::-1])
corrMean = sorted_corr_mean[1:].transpose()

sorted_corr_median = pd.DataFrame(Median.corr()['Blueberry Has Split Truth'].iloc[Median.corr()['Blueberry Has Split Truth'].abs().argsort()][::-1])
corrMedian = sorted_corr_median[1:].transpose()

sorted_corr_min = pd.DataFrame(Min.corr()['Blueberry Has Split Truth'].iloc[Min.corr()['Blueberry Has Split Truth'].abs().argsort()][::-1])
corrMin = sorted_corr_min[1:].transpose()

sorted_corr_max = pd.DataFrame(Max.corr()['Blueberry Has Split Truth'].iloc[Max.corr()['Blueberry Has Split Truth'].abs().argsort()][::-1])
corrMax = sorted_corr_max[1:].transpose()

sorted_corr_misc = pd.DataFrame(Misc.corr()['Blueberry Has Split Truth'].iloc[Misc.corr()['Blueberry Has Split Truth'].abs().argsort()][::-1])
corrMisc = sorted_corr_misc[1:].transpose()

sorted_corr_frame = pd.DataFrame(frame.corr()['Blueberry Has Split Truth'].iloc[frame.corr()['Blueberry Has Split Truth'].abs().argsort()][::-1])
corrframe = sorted_corr_frame[1:].transpose()

# Ordered Correlation Tables

## Mean Correlation

In [10]:
# corrMean.mask(maskMean)
corrMean

Unnamed: 0,SOFT2_Mean,SOFT_Mean,IR_Mean,Sat_Mean,Blue_Mean,Lum_Mean,Red_Mean,Green_Mean,Hue_Mean
Blueberry Has Split Truth,-0.812992,-0.686413,-0.461738,-0.330108,-0.304003,-0.297898,-0.289756,-0.283953,-0.162661


## Median Correlation

In [11]:
# corrMedian.mask(maskMedian)
corrMedian

Unnamed: 0,SOFT2_Median,SOFT_Median,IR_Median,Sat_Median,Blue_Median,Lum_Median,Red_Median,Green_Median,Hue_Median
Blueberry Has Split Truth,-0.809216,-0.686425,-0.455767,-0.340532,-0.28537,-0.269906,-0.266253,-0.257023,-0.172359


## Min Correlation

In [12]:
# corrMin.mask(maskMin)
corrMin

Unnamed: 0,SOFT2_Min,SOFT_Min,IR_Min,Sat_Min,Blue_Min,Lum_Min,Green_Min,Red_Min,Hue_Min
Blueberry Has Split Truth,-0.789174,-0.674509,-0.442048,-0.435382,-0.271045,-0.254402,-0.237438,-0.224676,-0.212819


## Max Correlation

In [13]:
# corrMax.mask(maskMax)
corrMax

Unnamed: 0,SOFT2_Max,SOFT_Max,IR_Max,Red_Max,Lum_Max,Blue_Max,Green_Max,Sat_Max,Hue_Max
Blueberry Has Split Truth,-0.80154,-0.69107,-0.474863,-0.313648,-0.302258,-0.298228,-0.292941,-0.186543,-0.118139


## Misc Correlation

In [14]:
# corrMisc.mask(maskMisc)
corrMisc

Unnamed: 0,SOFT2_MeanXMin,SatLogSOFT2,Soft2RedRatio,SOFT2_MinMaxMean,Soft2IRRatio,Soft-Soft2abs
Blueberry Has Split Truth,-0.803717,-0.697123,-0.507796,-0.104791,0.089664,-0.036407


# Thresholding Chart

In [15]:
## set type interval
# choice = frame[frame_n['Lum_Mean'] <= frame_n['Blue_Mean']]
choice = frame

brush = alt.selection_interval()
n = len(choice[choice['Blueberry Has Split Truth'] == 1])
t = len(choice[choice['Blueberry Has Split Truth'] == 0])

chart = alt.Chart(choice).mark_circle(opacity = 0.2, size = 100).encode(
    x = 'Sat_Max',
    y = 'SOFT2_Mean',
    color = alt.condition(brush, 'Blueberry Has Split Truth:N', alt.value('lightgray'))
).add_selection(
    brush
)

# Base chart for Matrix
fp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 0]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush
)

tp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 1]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush
)

# Data Tables
test = fp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='False_Count')
test2 = tp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='True_Count')
chart | (test & test2)


### Manual labor

To calculate the confusion matrix, input the numbers below for <i>True</i> and <i>False</i>

In [16]:
TP, FP = 48, 1 # Replace these two numbers with the numbers calculated below
total_T, total_F = len(frame_n[frame_n['Blueberry Has Split Truth'] == 1]), len(frame_n[frame_n['Blueberry Has Split Truth'] == 0])

if(TP == 0 and FP == 0):
  print('TP and FP are set to 0, cannot continue')
else:
  rounding = 2
  FN, TN = total_T - TP, total_F - FP

  # Recall Good: How many of the good is classified as good?
  # TP/total_T
  r_bad = round(TP/total_T*100, rounding)

  # Recall Bad: How many of the bad is classified as bad?
  # TN/total_N
  r_good = round(TN/total_F*100, rounding)

  # Precision Good: When it predicts yes, how often is it correct? 
  # TP / (TP + FP)
  p_good = round(TP/(TP+FP)*100, rounding)

  # Precision Bad: When it predicts no, how often is it correct? 
  # TN / (TN + FN)
  p_bad = round(TN/(TN+FN)*100, rounding)

  # Misclassification Rate: Percentage of misclassified
  # (FN + FP)/ (total_T + total_F)
  m_rate = round((FN + FP)/(total_T + total_F), rounding)

  print('recall good: ' + str(r_good) + '%')
  print('recall bad: ' + str(r_bad)+ '%')
  print('Precision Good: ' + str(p_good)+ '%')
  print('Precision Bad: ' + str(p_bad)+ '%')
  print('Misclassification rate: ' + str(m_rate)+ '%')

recall good: 98.36%
recall bad: 87.27%
Precision Good: 97.96%
Precision Bad: 89.55%
Misclassification rate: 0.07%


#  <font color='gray'>Classification Plots</font>

## Mean Plot

In [17]:
## set type interval

choice = frame

brush = alt.selection_interval(encodings=['x'])

chart = alt.Chart(choice).mark_circle(opacity = 0.2, size = 100).encode(
    alt.Y('Blueberry Has Split Truth:N', title = 'SoftSpot Truth'),
    color = alt.condition(brush, 'Blueberry Has Split Truth:N', alt.value('lightgray'), legend = None),

).add_selection(
    brush
)

# Base chart for Matrix
fp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 0]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush
)

tp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 1]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush
)

print('Correlation')
rounding = 3
print([round(corrMean['Red_Mean']['Blueberry Has Split Truth'], rounding),round(corrMean['Blue_Mean']['Blueberry Has Split Truth'], rounding),round(corrMean['Green_Mean']['Blueberry Has Split Truth'], rounding)])
print([round(corrMean['Hue_Mean']['Blueberry Has Split Truth'], rounding),round(corrMean['Sat_Mean']['Blueberry Has Split Truth'], rounding),round(corrMean['Lum_Mean']['Blueberry Has Split Truth'], rounding)])
print([round(corrMean['IR_Mean']['Blueberry Has Split Truth'], rounding),round(corrMean['SOFT_Mean']['Blueberry Has Split Truth'], rounding),round(corrMean['SOFT2_Mean']['Blueberry Has Split Truth'], rounding)])

m_0 = fp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='False Count')
m_1 = tp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='True Count')

(chart.encode(x='Red_Mean:Q') | chart.encode(x='Blue_Mean:Q') | chart.encode(x='Green_Mean:Q')) & (chart.encode(x='Hue_Mean:Q') | chart.encode(x='Sat_Mean:Q') | chart.encode(x='Lum_Mean:Q'))  & (chart.encode(x='IR_Mean:Q') | chart.encode(x='SOFT_Mean:Q') | chart.encode(x='SOFT2_Mean:Q')) | (m_0 & m_1)

Correlation
[-0.29, -0.304, -0.284]
[-0.163, -0.33, -0.298]
[-0.462, -0.686, -0.813]


In [18]:
TP, FP = 24, 0 # Replace these two numbers with the numbers calculated below
total_T, total_F = len(frame_n[frame_n['Blueberry Has Split Truth'] == 1]), len(frame_n[frame_n['Blueberry Has Split Truth'] == 0])

if(TP == 0 and FP == 0):
  print('TP and FP are set to 0, cannot continue')
else:
  rounding = 2
  FN, TN = total_T - TP, total_F - FP

  # Recall Good: How many of the good is classified as good?
  # TP/total_T
  r_bad = round(TP/total_T*100, rounding)

  # Recall Bad: How many of the bad is classified as bad?
  # TN/total_N
  r_good = round(TN/total_F*100, rounding)

  # Precision Good: When it predicts yes, how often is it correct? 
  # TP / (TP + FP)
  p_good = round(TP/(TP+FP)*100, rounding)

  # Precision Bad: When it predicts no, how often is it correct? 
  # TN / (TN + FN)
  p_bad = round(TN/(TN+FN)*100, rounding)

  # Misclassification Rate: Percentage of misclassified
  # (FN + FP)/ (total_T + total_F)
  m_rate = round((FN + FP)/(total_T + total_F), rounding)

  print('recall good: ' + str(r_good) + '%')
  print('recall bad: ' + str(r_bad)+ '%')
  print('Precision Good: ' + str(p_good)+ '%')
  print('Precision Bad: ' + str(p_bad)+ '%')
  print('Misclassification rate: ' + str(m_rate)+ '%')

recall good: 100.0%
recall bad: 43.64%
Precision Good: 100.0%
Precision Bad: 66.3%
Misclassification rate: 0.27%


## Median Plot

In [19]:
## set type interval
choice = frame

brush2 = alt.selection_interval(encodings=['x'])

chart = alt.Chart(choice).mark_circle(opacity = 0.2, size = 100).encode(
    alt.Y('Blueberry Has Split Truth:N', title = 'Softspot Truth'),
    color = alt.condition(brush2, 'Blueberry Has Split Truth:N', alt.value('lightgray'), legend = None)
).add_selection(
    brush2
)

# Base chart for Matrix
fp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 0]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush2
)

tp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 1]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush2
)

print('Correlation')
rounding = 3
print([round(corrMedian['Red_Median']['Blueberry Has Split Truth'], rounding),round(corrMedian['Blue_Median']['Blueberry Has Split Truth'], rounding),round(corrMedian['Green_Median']['Blueberry Has Split Truth'], rounding)])
print([round(corrMedian['Hue_Median']['Blueberry Has Split Truth'], rounding),round(corrMedian['Sat_Median']['Blueberry Has Split Truth'], rounding),round(corrMedian['Lum_Median']['Blueberry Has Split Truth'], rounding)])
print([round(corrMedian['IR_Median']['Blueberry Has Split Truth'], rounding),round(corrMedian['SOFT_Median']['Blueberry Has Split Truth'], rounding),round(corrMedian['SOFT2_Median']['Blueberry Has Split Truth'], rounding)])

m_0 = fp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='False Count')
m_1 = tp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='True Count')

(chart.encode(x='Red_Median:Q') | chart.encode(x='Blue_Median:Q') | chart.encode(x='Green_Median:Q')) & (chart.encode(x='Hue_Median:Q') | chart.encode(x='Sat_Median:Q') | chart.encode(x='Lum_Median:Q'))  & (chart.encode(x='IR_Median:Q') | chart.encode(x='SOFT_Median:Q') | chart.encode(x='SOFT2_Median:Q')) | (m_0 & m_1)

Correlation
[-0.266, -0.285, -0.257]
[-0.172, -0.341, -0.27]
[-0.456, -0.686, -0.809]


In [20]:
TP, FP = 96, 11 # Replace these two numbers with the numbers calculated below
total_T, total_F = len(frame_n[frame_n['Blueberry Has Split Truth'] == 1]), len(frame_n[frame_n['Blueberry Has Split Truth'] == 0])

if(TP == 0 and FP == 0):
  print('TP and FP are set to 0, cannot continue')
else:
  rounding = 2
  FN, TN = total_T - TP, total_F - FP

  # Recall Good: How many of the good is classified as good?
  # TP/total_T
  r_bad = round(TP/total_T*100, rounding)

  # Recall Bad: How many of the bad is classified as bad?
  # TN/total_N
  r_good = round(TN/total_F*100, rounding)

  # Precision Good: When it predicts yes, how often is it correct? 
  # TP / (TP + FP)
  p_good = round(TP/(TP+FP)*100, rounding)

  # Precision Bad: When it predicts no, how often is it correct? 
  # TN / (TN + FN)
  p_bad = round(TN/(TN+FN)*100, rounding)

  # Misclassification Rate: Percentage of misclassified
  # (FN + FP)/ (total_T + total_F)
  m_rate = round((FN + FP)/(total_T + total_F), rounding)

  print('recall good: ' + str(r_good) + '%')
  print('recall bad: ' + str(r_bad)+ '%')
  print('Precision Good: ' + str(p_good)+ '%')
  print('Precision Bad: ' + str(p_bad)+ '%')
  print('Misclassification rate: ' + str(m_rate)+ '%')

recall good: 81.97%
recall bad: 174.55%
Precision Good: 89.72%
Precision Bad: 555.56%
Misclassification rate: -0.26%


# Min Plot

In [21]:
## set type interval
choice = frame
brush3 = alt.selection_interval(encodings=['x'])

chart = alt.Chart(choice).mark_circle(opacity = 0.2, size = 100).encode(
    alt.Y('Blueberry Has Split Truth:N', title = 'SoftSpot Truth'),
    color = alt.condition(brush3, 'Blueberry Has Split Truth:N', alt.value('lightgray'), legend = None)
).add_selection(
    brush3
)

# Base chart for Matrix
fp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 0]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush3
)

tp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 1]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush3
)

print('Correlation')
rounding = 3
print([round(corrMin['Red_Min']['Blueberry Has Split Truth'], rounding),round(corrMin['Blue_Min']['Blueberry Has Split Truth'], rounding),round(corrMin['Green_Min']['Blueberry Has Split Truth'], rounding)])
print([round(corrMin['Hue_Min']['Blueberry Has Split Truth'], rounding),round(corrMin['Sat_Min']['Blueberry Has Split Truth'], rounding),round(corrMin['Lum_Min']['Blueberry Has Split Truth'], rounding)])
print([round(corrMin['IR_Min']['Blueberry Has Split Truth'], rounding),round(corrMin['SOFT_Min']['Blueberry Has Split Truth'], rounding),round(corrMin['SOFT2_Min']['Blueberry Has Split Truth'], rounding)])

m_0 = fp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='False Count')
m_1 = tp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='True Count')

(chart.encode(x='Red_Min:Q') | chart.encode(x='Blue_Min:Q') | chart.encode(x='Green_Min:Q')) & (chart.encode(x='Hue_Min:Q') | chart.encode(x='Sat_Min:Q') | chart.encode(x='Lum_Min:Q'))  & (chart.encode(x='IR_Min:Q') | chart.encode(x='SOFT_Min:Q') | chart.encode(x='SOFT2_Min:Q')) | (m_0 & m_1)

Correlation
[-0.225, -0.271, -0.237]
[-0.213, -0.435, -0.254]
[-0.442, -0.675, -0.789]


In [22]:
TP, FP = 39, 0 # Replace these two numbers with the numbers calculated below
total_T, total_F = len(frame_n[frame_n['Blueberry Has Split Truth'] == 1]), len(frame_n[frame_n['Blueberry Has Split Truth'] == 0])

if(TP == 0 and FP == 0):
  print('TP and FP are set to 0, cannot continue')
else:
  rounding = 2
  FN, TN = total_T - TP, total_F - FP

  # Recall Good: How many of the good is classified as good?
  # TP/total_T
  r_bad = round(TP/total_T*100, rounding)

  # Recall Bad: How many of the bad is classified as bad?
  # TN/total_N
  r_good = round(TN/total_F*100, rounding)

  # Precision Good: When it predicts yes, how often is it correct? 
  # TP / (TP + FP)
  p_good = round(TP/(TP+FP)*100, rounding)

  # Precision Bad: When it predicts no, how often is it correct? 
  # TN / (TN + FN)
  p_bad = round(TN/(TN+FN)*100, rounding)

  # Misclassification Rate: Percentage of misclassified
  # (FN + FP)/ (total_T + total_F)
  m_rate = round((FN + FP)/(total_T + total_F), rounding)

  print('recall good: ' + str(r_good) + '%')
  print('recall bad: ' + str(r_bad)+ '%')
  print('Precision Good: ' + str(p_good)+ '%')
  print('Precision Bad: ' + str(p_bad)+ '%')
  print('Misclassification rate: ' + str(m_rate)+ '%')

recall good: 100.0%
recall bad: 70.91%
Precision Good: 100.0%
Precision Bad: 79.22%
Misclassification rate: 0.14%


## Max Plot

In [23]:
## set type interval

choice = frame
brush4 = alt.selection_interval(encodings=['x'])

chart = alt.Chart(choice).mark_circle(opacity = 0.2, size = 100).encode(
   alt.Y('Blueberry Has Split Truth:N', title = 'SoftSpot Truth'),
    color = alt.condition(brush4, 'Blueberry Has Split Truth:N', alt.value('lightgray'), legend = None)
).add_selection(
    brush4
)

# Base chart for Matrix
fp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 0]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush4
)

tp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 1]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush4
)

print('Correlation')
rounding = 3
print([round(corrMax['Red_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['Blue_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['Green_Max']['Blueberry Has Split Truth'], rounding)])
print([round(corrMax['Hue_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['Sat_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['Lum_Max']['Blueberry Has Split Truth'], rounding)])
print([round(corrMax['IR_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['SOFT_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['SOFT2_Max']['Blueberry Has Split Truth'], rounding)])

m_0 = fp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='False Count')
m_1 = tp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='True Count')

(chart.encode(x='Red_Max:Q') | chart.encode(x='Blue_Max:Q') | chart.encode(x='Green_Max:Q')) & (chart.encode(x='Hue_Max:Q') | chart.encode(x='Sat_Max:Q') | chart.encode(x='Lum_Max:Q'))  & (chart.encode(x='IR_Max:Q') | chart.encode(x='SOFT_Max:Q') | chart.encode(x='SOFT2_Max:Q')) | (m_0 & m_1)

Correlation
[-0.314, -0.298, -0.293]
[-0.118, -0.187, -0.302]
[-0.475, -0.691, -0.802]


In [24]:
TP, FP = 56, 21 # Replace these two numbers with the numbers calculated below
total_T, total_F = len(frame_n[frame_n['Blueberry Has Split Truth'] == 1]), len(frame_n[frame_n['Blueberry Has Split Truth'] == 0])

if(TP == 0 and FP == 0):
  print('TP and FP are set to 0, cannot continue')
else:
  rounding = 2
  FN, TN = total_T - TP, total_F - FP

  # Recall Good: How many of the good is classified as good?
  # TP/total_T
  r_bad = round(TP/total_T*100, rounding)

  # Recall Bad: How many of the bad is classified as bad?
  # TN/total_N
  r_good = round(TN/total_F*100, rounding)

  # Precision Good: When it predicts yes, how often is it correct? 
  # TP / (TP + FP)
  p_good = round(TP/(TP+FP)*100, rounding)

  # Precision Bad: When it predicts no, how often is it correct? 
  # TN / (TN + FN)
  p_bad = round(TN/(TN+FN)*100, rounding)

  # Misclassification Rate: Percentage of misclassified
  # (FN + FP)/ (total_T + total_F)
  m_rate = round((FN + FP)/(total_T + total_F), rounding)

  print('recall good: ' + str(r_good) + '%')
  print('recall bad: ' + str(r_bad)+ '%')
  print('Precision Good: ' + str(p_good)+ '%')
  print('Precision Bad: ' + str(p_bad)+ '%')
  print('Misclassification rate: ' + str(m_rate)+ '%')

recall good: 65.57%
recall bad: 101.82%
Precision Good: 72.73%
Precision Bad: 102.56%
Misclassification rate: 0.17%


# Misc Plot

In [25]:
## set type interval

choice = frame
brush5 = alt.selection_interval(encodings=['x'])

chart = alt.Chart(choice).mark_circle(opacity = 0.2, size = 100).encode(
    alt.Y('Blueberry Has Split Truth:N', title = 'SoftSpot Truth'),
    color = alt.condition(brush5, 'Blueberry Has Split Truth:N', alt.value('lightgray'), legend = None),
).add_selection(
    brush5
)

# Base chart for Matrix
fp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 0]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush5
)

tp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 1]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(   
    brush5
)

print('Correlation')
rounding = 3
print([round(corrMisc['SatLogSOFT2']['Blueberry Has Split Truth'], rounding),round(corrMisc['Soft2IRRatio']['Blueberry Has Split Truth'], rounding)])
print([round(corrMisc['SOFT2_MeanXMin']['Blueberry Has Split Truth'], rounding)])

m_0 = fp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='False Count')
m_1 = tp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='True Count')

(chart.encode(x='SatLogSOFT2:Q') | chart.encode(x='Soft2IRRatio:Q') |  chart.encode(x='Soft2RedRatio:Q')) & chart.encode(x='SOFT2_MeanXMin:Q') | (m_0 & m_1)

Correlation
[-0.697, 0.09]
[-0.804]


In [26]:
TP, FP = 97, 11 # Replace these two numbers with the numbers calculated below
total_T, total_F = len(frame_n[frame_n['Blueberry Has Split Truth'] == 1]), len(frame_n[frame_n['Blueberry Has Split Truth'] == 0])

if(TP == 0 and FP == 0):
  print('TP and FP are set to 0, cannot continue')
else:
  rounding = 2
  FN, TN = total_T - TP, total_F - FP

  # Recall Good: How many of the good is classified as good?
  # TP/total_T
  r_bad = round(TP/total_T*100, rounding)

  # Recall Bad: How many of the bad is classified as bad?
  # TN/total_N
  r_good = round(TN/total_F*100, rounding)

  # Precision Good: When it predicts yes, how often is it correct? 
  # TP / (TP + FP)
  p_good = round(TP/(TP+FP)*100, rounding)

  # Precision Bad: When it predicts no, how often is it correct? 
  # TN / (TN + FN)
  p_bad = round(TN/(TN+FN)*100, rounding)

  # Misclassification Rate: Percentage of misclassified
  # (FN + FP)/ (total_T + total_F)
  m_rate = round((FN + FP)/(total_T + total_F), rounding)

  print('recall good: ' + str(r_good) + '%')
  print('recall bad: ' + str(r_bad)+ '%')
  print('Precision Good: ' + str(p_good)+ '%')
  print('Precision Bad: ' + str(p_bad)+ '%')
  print('Misclassification rate: ' + str(m_rate)+ '%')

recall good: 81.97%
recall bad: 176.36%
Precision Good: 89.81%
Precision Bad: 625.0%
Misclassification rate: -0.27%


# Threshold Chart

In [52]:
# col = ['SOFT2_Median', 'SOFT2_Min', 'Sat_Median', 'Sat_Max']
# col = ['SOFT2_Median', 'SOFT2_Max', 'SOFT2_Min', 'Sat_Max', 'Sat_Median', 'Sat_Min']
# col = ['SOFT2_Max', 'SOFT_Max', 'IR_Max', 'Hue_Max', 'Sat_Max', 'Lum_Max']
# col = ['SOFT2_Max', 'SOFT2_Min', 'Sat_Max', 'SatLogSOFT2', 'SOFT2_MeanXMin']
# col = corrMean.columns.to_list() + corrMax.columns.to_list()
col = corrframe.columns[0:9].to_list()

test = complexCharts(frame, col)
t = test[0]
for i in range(1, len(col)):
  t &= test[i]
t

In [28]:
TP, FP = 49, 0 # Replace these two numbers with the numbers calculated below
total_T, total_F = len(frame_n[frame_n['Blueberry Has Split Truth'] == 1]), len(frame_n[frame_n['Blueberry Has Split Truth'] == 0])

if(TP == 0 and FP == 0):
  print('TP and FP are set to 0, cannot continue')
else:
  rounding = 2
  FN, TN = total_T - TP, total_F - FP

  # Recall Good: How many of the good is classified as good?
  # TP/total_T
  r_bad = round(TP/total_T*100, rounding)

  # Recall Bad: How many of the bad is classified as bad?
  # TN/total_N
  r_good = round(TN/total_F*100, rounding)

  # Precision Good: When it predicts yes, how often is it correct? 
  # TP / (TP + FP)
  p_good = round(TP/(TP+FP)*100, rounding)

  # Precision Bad: When it predicts no, how often is it correct? 
  # TN / (TN + FN)
  p_bad = round(TN/(TN+FN)*100, rounding)

  # Misclassification Rate: Percentage of misclassified
  # (FN + FP)/ (total_T + total_F)
  m_rate = round((FN + FP)/(total_T + total_F), rounding)

  print('recall good: ' + str(r_good) + '%')
  print('recall bad: ' + str(r_bad)+ '%')
  print('Precision Good: ' + str(p_good)+ '%')
  print('Precision Bad: ' + str(p_bad)+ '%')
  print('Misclassification rate: ' + str(m_rate)+ '%')

recall good: 100.0%
recall bad: 89.09%
Precision Good: 100.0%
Precision Bad: 91.04%
Misclassification rate: 0.05%


In [29]:
## set type interval

choice = frame
brush1 = alt.selection_interval(
    on="[mousedown[event.altKey], mouseup] > mousemove",
    name='brush1',
    encodings = ['x']
)
brush2 = alt.selection_interval(
    on="[mousedown[event.shiftKey], mouseup] > mousemove",
    mark=alt.BrushConfig(fill="#fdbb84", fillOpacity=0.5, stroke="#e34a33"),
    name='brush2',
    encodings = ['x']
)
brush3 = alt.selection_interval(
    on="[mousedown[event.ctrlKey], mouseup] > mousemove",
    encodings=['x'],
    mark=alt.BrushConfig(fill="#757575", fillOpacity=0.5, stroke="#e34a33")
  )
# brush4 = alt.selection_interval(encodings=['x'])
# brush5 = alt.selection_interval(encodings=['x'])

chart = alt.Chart(choice).mark_circle(opacity = 0.2, size = 100).encode(
    alt.Y('Blueberry Has Split Truth:N', title = 'SoftSpot Truth'),
    color = alt.condition(brush1 & brush2 & brush3, 'Blueberry Has Split Truth:N', alt.value('lightgray'))
).add_selection(
    brush1, brush2, brush3
)

# chart = alt.Chart(choice).mark_circle(opacity = 0.2, size = 100).encode(
#     y = 'Blueberry Has Split Truth:N'
# )

tbrush1 = alt.selection_interval(encodings = ['x'])
tbrush2 = alt.selection_interval(encodings = ['x'])
tbrush3 = alt.selection_interval(encodings = ['x'])

# Base chart for Matrix
fp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 0]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush1 & brush2 & brush3
)

tp = alt.Chart(choice[choice['Blueberry Has Split Truth'] == 1]).mark_text().encode(
  y=alt.Y('count:O',axis=None)
).transform_filter(
    brush1 & brush2 & brush3
)

# print('Correlation')
# rounding = 3
# print([round(corrMax['Red_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['Blue_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['Green_Max']['Blueberry Has Split Truth'], rounding)])
# print([round(corrMax['Hue_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['Sat_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['Lum_Max']['Blueberry Has Split Truth'], rounding)])
# print([round(corrMax['IR_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['SOFT_Max']['Blueberry Has Split Truth'], rounding),round(corrMax['SOFT2_Max']['Blueberry Has Split Truth'], rounding)])

m_0 = fp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='False Count')
m_1 = tp.encode(text = 'count():N', key = 'Blueberry Has Split Truth').properties(title='True Count')

cmax = (chart.encode(x='Red_Max:Q') | chart.encode(x='Blue_Max:Q') | chart.encode(x='Green_Max:Q')) & (chart.encode(x='Hue_Max:Q') | chart.encode(x='Sat_Max:Q') | chart.encode(x='Lum_Max:Q'))  & (chart.encode(x='IR_Max:Q') | chart.encode(x='SOFT_Max:Q') | chart.encode(x='SOFT2_Max:Q'))
cmin = (chart.encode(x='Red_Min:Q') | chart.encode(x='Blue_Min:Q') | chart.encode(x='Green_Min:Q')) & (chart.encode(x='Hue_Min:Q') | chart.encode(x='Sat_Min:Q') | chart.encode(x='Lum_Min:Q'))  & (chart.encode(x='IR_Min:Q') | chart.encode(x='SOFT_Min:Q') | chart.encode(x='SOFT2_Min:Q'))
cmedian = (chart.encode(x='Red_Median:Q') | chart.encode(x='Blue_Median:Q') | chart.encode(x='Green_Median:Q')) & (chart.encode(x='Hue_Median:Q') | chart.encode(x='Sat_Median:Q') | chart.encode(x='Lum_Median:Q'))  & (chart.encode(x='IR_Median:Q') | chart.encode(x='SOFT_Median:Q') | chart.encode(x='SOFT2_Median:Q'))
cmean = (chart.encode(x='Red_Mean:Q') | chart.encode(x='Blue_Mean:Q') | chart.encode(x='Green_Mean:Q')) & (chart.encode(x='Hue_Mean:Q') | chart.encode(x='Sat_Mean:Q') | chart.encode(x='Lum_Mean:Q'))  & (chart.encode(x='IR_Mean:Q') | chart.encode(x='SOFT_Mean:Q') | chart.encode(x='SOFT2_Mean:Q'))
cmisc = (chart.encode(x='SatLogSOFT2:Q') | chart.encode(x='Soft2IRRatio:Q') |  chart.encode(x='Soft2RedRatio:Q')) & chart.encode(x='SOFT2_MeanXMin:Q')
print(brush1, brush2, brush3)
(cmax & cmin & cmedian & cmean & cmisc & (m_0 & m_1)) | (m_0 & m_1)

# testmax = chart.encode(x='Red_Max:Q', color = alt.condition(tbrush1, 'Blueberry Has Split Truth:N', alt.value('lightgray'))).add_selection(tbrush1) | \
#   chart.encode(x='Blue_Max:Q', color = alt.condition(tbrush2, 'Blueberry Has Split Truth:N', alt.value('lightgray'))).add_selection(tbrush2)
# testmax & (m_0 | m_1)

Selection('brush1', SelectionDef({
  encodings: ['x'],
  on: '[mousedown[event.altKey], mouseup] > mousemove',
  type: 'interval'
})) Selection('brush2', SelectionDef({
  encodings: ['x'],
  mark: BrushConfig({
    fill: '#fdbb84',
    fillOpacity: 0.5,
    stroke: '#e34a33'
  }),
  on: '[mousedown[event.shiftKey], mouseup] > mousemove',
  type: 'interval'
})) Selection('selector011', SelectionDef({
  encodings: ['x'],
  mark: BrushConfig({
    fill: '#757575',
    fillOpacity: 0.5,
    stroke: '#e34a33'
  }),
  on: '[mousedown[event.ctrlKey], mouseup] > mousemove',
  type: 'interval'
}))


In [30]:
## 63 73 
TP, FP = 45, 0 # Replace these two numbers with the numbers calculated below
total_T, total_F = len(frame_n[frame_n['Blueberry Has Split Truth'] == 1]), len(frame_n[frame_n['Blueberry Has Split Truth'] == 0])

if(TP == 0 and FP == 0):
  print('TP and FP are set to 0, cannot continue')
else:
  rounding = 2
  FN, TN = total_T - TP, total_F - FP

  # Recall Good: How many of the good is classified as good?
  # TP/total_T
  r_bad = round(TP/total_T*100, rounding)

  # Recall Bad: How many of the bad is classified as bad?
  # TN/total_N
  r_good = round(TN/total_F*100, rounding)

  # Precision Good: When it predicts yes, how often is it correct? 
  # TP / (TP + FP)
  p_good = round(TP/(TP+FP)*100, rounding)

  # Precision Bad: When it predicts no, how often is it correct? 
  # TN / (TN + FN)
  p_bad = round(TN/(TN+FN)*100, rounding)

  # Misclassification Rate: Percentage of misclassified
  # (FN + FP)/ (total_T + total_F)
  m_rate = round((FN + FP)/(total_T + total_F), rounding)

  print('recall good: ' + str(r_good) + '%')
  print('recall bad: ' + str(r_bad)+ '%')
  print('Precision Good: ' + str(p_good)+ '%')
  print('Precision Bad: ' + str(p_bad)+ '%')
  print('Misclassification rate: ' + str(m_rate)+ '%')

recall good: 100.0%
recall bad: 81.82%
Precision Good: 100.0%
Precision Bad: 85.92%
Misclassification rate: 0.09%


# Example Altair

## Useful examples to get ideas from + code

In [0]:
import altair as alt
from vega_datasets import data

cars = data.cars.url

### Conditional variables with datum

In [32]:
import altair as alt
from vega_datasets import data

def plot_column(col_name: str) -> alt.Chart:
    source = data.us_employment()

    return alt.Chart(source).mark_bar().encode(
      x="month:T",
      y=f"{col_name}:Q", ## f is probably similar to C# version of $'{_variablename}'
      color=alt.condition(
          f"datum.{col_name} > 0", ## datum refers to the data source object given to chart
          alt.value("steelblue"),  # The positive color
          alt.value("orange")  # The negative color
      )
    ).properties(width=600)

plot_column("nonfarm_change")

### Custom Mouse Events

In [33]:
alex = alt.selection_interval(
    on="[mousedown[event.altKey], mouseup] > mousemove",
    name='alex'
)
morgan = alt.selection_interval(
    on="[mousedown[event.shiftKey], mouseup] > mousemove",
    mark=alt.BrushConfig(fill="#fdbb84", fillOpacity=0.5, stroke="#e34a33"),
    name='morgan'
)

alt.Chart(cars).mark_rect().encode(
    x='Cylinders:O',
    y='Origin:O',
    color=alt.condition(alex | morgan, 'count()', alt.ColorValue("grey"))
).add_selection(
    alex, morgan
).properties(
    width=300,
    height=180
)

# Rigid Line Plot

In [34]:
import altair as alt
from vega_datasets import data

source = data.seattle_weather.url

step = 20
overlap = 1

alt.Chart(source, height=step).transform_timeunit(
    Month='month(date)'
).transform_joinaggregate(
    mean_temp='mean(temp_max)', groupby=['Month']
).transform_bin(
    ['bin_max', 'bin_min'], 'temp_max'
).transform_aggregate(
    value='count()', groupby=['Month', 'mean_temp', 'bin_min', 'bin_max']
).transform_impute(
    impute='value', groupby=['Month', 'mean_temp'], key='bin_min', value=0
).mark_area(
    interpolate='monotone',
    fillOpacity=0.8,
    stroke='lightgray',
    strokeWidth=0.5
).encode(
    alt.X('bin_min:Q', bin='binned', title='Maximum Daily Temperature (C)'),
    alt.Y(
        'value:Q',
        scale=alt.Scale(range=[step, -step * overlap]),
        axis=None
    ),
    alt.Fill(
        'mean_temp:Q',
        legend=None,
        scale=alt.Scale(domain=[30, 5], scheme='redyellowblue')
    )
).facet(
    row=alt.Row(
        'Month:T',
        title=None,
        header=alt.Header(labelAngle=0, labelAlign='right', format='%B')
    )
).properties(
    title='Seattle Weather',
    bounds='flush'
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
).configure_title(
    anchor='end'
)

# Becker’s Barley Trellis Plot (wrapped facet)

In [35]:


import altair as alt
from vega_datasets import data

source = data.barley.url

alt.Chart(source).mark_point().encode(
    alt.X('median(yield):Q', scale=alt.Scale(zero=False)),
    y='variety:O',
    color='year:N',
    facet=alt.Facet('site:O', columns=2),
).properties(
    width=200,
    height=100,
)



Changing X/Y Title

In [36]:
import altair as alt
from vega_datasets import data

source = data.barley.url

alt.Chart(source).mark_point().encode(
    x = alt.X('median(yield):Q', scale=alt.Scale(zero=False), title = 'New X title'),
    y = alt.X('variety:O', scale=alt.Scale(zero=False), title = 'New Y title'),
    color='year:N',
    facet=alt.Facet('site:O', columns=2),
).properties(
    width=200,
    height=100,
)