Skip to content

Commit

Permalink
Fix bin analyzer
Browse files Browse the repository at this point in the history
  • Loading branch information
MircoT committed Jun 16, 2020
1 parent 58aee76 commit bccbb06
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 72 deletions.
5 changes: 5 additions & 0 deletions Probe/analyzer/__main__.py
Expand Up @@ -28,6 +28,10 @@ def main():
parser.add_argument('--output-folder', type=str,
default="analysis",
help='The output folder name [DEFAULT: "analysis"]')
parser.add_argument('--group-by', type=str,
choices=['d', 'w', 'm'],
default="d",
help='Group by day ("d"), week ("w") or month ("m") [DEFAULT: "d"]')
parser.add_argument('--feature-filename', type=str,
default="",
help='The feature JSON filename [DEFAULT: ""]')
Expand Down Expand Up @@ -80,6 +84,7 @@ def main():
region=args.region,
concatenated=args.concat,
output_folder=args.output_folder,
group_by=args.group_by,
)
print(f"{STATUS_ARROW}Analyze all bins...")
feature_bins = [elm for elm in args.feature_list.split(",") if elm]
Expand Down
192 changes: 120 additions & 72 deletions Probe/analyzer/features.py
Expand Up @@ -24,11 +24,14 @@ class Features(object):
def __init__(self, features: dict, df: 'pd.DataFrame',
concatenated: bool = True,
output_folder: str = "analysis",
region: str = 'all'):
self._df = df
self._concatenated = concatenated
self._region = region
region: str = 'all',
group_by: str = 'd'):
self._df: 'pd.DataFrame' = df
self._concatenated: bool = concatenated
self._region: str = region
self._group_by: str = group_by
self._filter_data(concatenated)
self._add_group_column()

self._output_folder = Path(output_folder)
self._output_folder.mkdir(parents=True, exist_ok=True)
Expand All @@ -44,6 +47,25 @@ def __init__(self, features: dict, df: 'pd.DataFrame',
self._features.append(key)
setattr(self, key, cur_values)

def _add_group_column(self):
    """Add a ``datetime`` column and the grouping column to the data.

    ``reqDay`` is parsed as a Unix timestamp in seconds and stored in a
    new ``datetime`` column. Depending on ``self._group_by`` one extra
    column is then derived from it:

    * ``'d'`` -> ``day``   (day of the month, 1-31)
    * ``'w'`` -> ``week``  (ISO 8601 week number, 1-53)
    * ``'m'`` -> ``month`` (month of the year, 1-12)

    Any other ``self._group_by`` value only adds the ``datetime`` column.

    When ``self._concatenated`` is true ``self._df`` is treated as one
    DataFrame; otherwise it is iterated as a collection of DataFrames and
    each one is updated in place.

    NOTE: the derived values are calendar components, not absolute
    periods, so the same week/month number from different years ends up
    with the same key.
    """
    # Handle both layouts with the same code path: a single frame is
    # simply wrapped in a one-element list.
    frames = [self._df] if self._concatenated else self._df
    for cur_df in frames:
        cur_df['datetime'] = pd.to_datetime(cur_df.reqDay, unit='s')
        if self._group_by == 'd':
            cur_df['day'] = cur_df.datetime.dt.day
        elif self._group_by == 'w':
            # ``Series.dt.week`` was deprecated in pandas 1.1 and removed
            # in pandas 2.0; ``isocalendar().week`` is the replacement and
            # yields the same ISO week numbers.
            cur_df['week'] = cur_df.datetime.dt.isocalendar().week
        elif self._group_by == 'm':
            cur_df['month'] = cur_df.datetime.dt.month

def _filter_data(self, concatenated: bool = True):
print(f"{STATUS_ARROW}Filter DataType data and mc")
if concatenated:
Expand Down Expand Up @@ -93,14 +115,13 @@ def check_all_features(self, features: List[str] = []):
cur_features.extend(self._features)
for feature in tqdm(cur_features,
desc=f"{STATUS_ARROW}Check features",
ascii=True, position=1):
ascii=True):
np_hist = self.check_bins_of(feature)
self.plot_bins_of(feature, np_hist)
self.plot_violin_of(feature, np_hist)

def check_bins_of(self, feature: str, n_bins: int = 6):
cur_bins = getattr(
self, feature) if feature in self._features else n_bins
all_data = None
if feature == 'size':
if self._concatenated:
sizes = (self._df['Size'] / 1024**2).astype(int).to_numpy()
Expand All @@ -109,56 +130,56 @@ def check_bins_of(self, feature: str, n_bins: int = 6):
for cur_df in tqdm(
self._df,
desc=f"{STATUS_ARROW}Calculate sizes x day",
ascii=True, position=0):
ascii=True):
sizes = np.concatenate([
sizes, (cur_df['Size'] / 1024 **
2).astype(int).to_numpy()
])
self._features_data[feature] = sizes
counts, bins = np.histogram(
sizes,
bins=cur_bins,
density=False
)
if feature in self._features:
return counts, bins
else:
return np.histogram(
sizes,
bins=bins.round(0),
density=False
)
all_data = sizes
elif feature == 'numReq':
files_x_day = None
groups = None
if self._concatenated:
files = self._df[['Filename', 'reqDay']]
files_x_day = files.groupby('reqDay')
if self._group_by == 'd':
groups = self._df.groupby('reqDay')
elif self._group_by == 'w':
groups = self._df.groupby('week')
elif self._group_by == 'm':
groups = self._df.groupby('month')
else:
files_x_day = [
(idx, cur_df[['Filename', 'reqDay']].copy())
for idx, cur_df in enumerate(self._df)
]
numReqXDay = np.array([])
for _, day in tqdm(files_x_day,
desc=f"{STATUS_ARROW}Calculate frequencies x day",
ascii=True, position=0):
numReqXDay = np.concatenate([
numReqXDay, day.Filename.value_counts().to_numpy()
if self._group_by == 'd':
groups = [
(idx, cur_df)
for idx, cur_df in enumerate(self._df)
]
else:
if self._group_by == 'w':
group_by = 'week'
elif self._group_by == 'm':
group_by = 'month'
groups = {}
for cur_df in self._df:
for week, cur_week in cur_df.groupby(group_by):
if week not in groups:
groups[week] = cur_week
else:
groups[week] = pd.concat([
groups[week],
cur_week,
], ignore_index=True)
groups = [
(group_key, groups[group_key])
for group_key in sorted(groups)
]
numReqXGroup = np.array([])
for _, group in tqdm(groups,
desc=f"{STATUS_ARROW}Calculate frequencies x day",
ascii=True):
numReqXGroup = np.concatenate([
numReqXGroup, group.Filename.value_counts().to_numpy()
])
self._features_data[feature] = numReqXDay
counts, bins = np.histogram(
numReqXDay,
bins=cur_bins,
density=False
)
if feature in self._features:
return counts, bins
else:
return np.histogram(
numReqXDay,
bins=bins.round(0),
density=False
)
self._features_data[feature] = numReqXGroup
all_data = numReqXGroup
elif feature == 'deltaLastRequest':
delta_files = []
files = {}
Expand All @@ -172,7 +193,7 @@ def check_bins_of(self, feature: str, n_bins: int = 6):
tot_files = len(all_files)
for idx, filename in tqdm(enumerate(all_files),
desc=f"{STATUS_ARROW}Calculate delta times",
ascii=True, position=0,
ascii=True,
total=tot_files):
if filename not in files:
files[filename] = idx
Expand All @@ -182,30 +203,45 @@ def check_bins_of(self, feature: str, n_bins: int = 6):
delta_files.append(cur_delta)
delta_files = np.array(delta_files)
self._features_data[feature] = delta_files
counts, bins = np.histogram(
delta_files,
bins=cur_bins,
density=False
)
if feature in self._features:
return counts, bins
else:
return np.histogram(
delta_files,
bins=bins.round(0),
density=False
)
all_data = delta_files
else:
raise Exception(
f"ERROR: feature {feature} can not be checked...")

if feature in self._features:
cur_bins = np.array(getattr(self, feature))
else:
_, cur_bins = np.histogram(
sizes,
bins=n_bins,
density=False
)

prev_bin = 0.
counts = []
for bin_idx, cur_bin in enumerate(cur_bins):
if bin_idx != cur_bins.shape[0] - 1:
cur_count = all_data[
(all_data > prev_bin) &
(all_data <= cur_bin)
].shape[0]
else:
cur_count = all_data[
(all_data > prev_bin)
].shape[0]
counts.append(cur_count)
prev_bin = cur_bin

counts = np.array(counts)
return counts, cur_bins

def plot_bins_of(self, feature: str, np_hist: tuple):
counts, bins = np_hist
# print(counts, bins)
percentages = (counts / counts.sum()) * 100.
percentages[np.isnan(percentages)] = 0.
fig = px.bar(
x=[str(cur_bin) for cur_bin in bins[1:]],
x=[str(cur_bin) for cur_bin in bins],
y=percentages,
title=f"Feature {feature}",
)
Expand Down Expand Up @@ -238,34 +274,46 @@ def plot_violin_of(self, feature: str, np_hist: tuple):
_, bins = np_hist
cur_feature_data = self._features_data[feature]
fig = go.Figure()
prev = bins[0]

fig.add_trace(
go.Violin(
y=cur_feature_data,
x0=-1,
x0=0,
name="global",
box_visible=True,
meanline_visible=True,
)
)
for cur_bin in bins[1:]:
cur_data = cur_feature_data[
(cur_feature_data >= prev) &
(cur_feature_data < cur_bin)
]
prev_bin = 0.
for bin_idx, cur_bin in enumerate(bins, 1):
if bin_idx != bins.shape[0]:
cur_data = cur_feature_data[
(cur_feature_data > prev_bin) &
(cur_feature_data <= cur_bin)
]
else:
cur_data = cur_feature_data[
(cur_feature_data > prev_bin)
]
fig.add_trace(
go.Violin(
y=cur_data,
x0=bin_idx,
name=str(cur_bin),
box_visible=True,
meanline_visible=True,
points="all",
# points="all",
)
)
prev = cur_bin
prev_bin = cur_bin
fig.update_layout(_LAYOUT)
fig.update_layout({
'title': f"Feature {feature}",
'xaxis': {
'tickmode': 'array',
'tickvals': list(range(len(bins)+1)),
'ticktext': ['global'] + [str(cur_bin) for cur_bin in bins]
}
})
# fig.show()
# print(f"{STATUS_ARROW}Save violin plot of {feature} as pnh")
Expand Down

0 comments on commit bccbb06

Please sign in to comment.