Fix bin analyzer

Cloud-PG · Jun 16, 2020 · bccbb06 · bccbb06
1 parent 58aee76
commit bccbb06
Show file tree

Hide file tree

Showing 2 changed files with 125 additions and 72 deletions.
diff --git a/Probe/analyzer/__main__.py b/Probe/analyzer/__main__.py
@@ -28,6 +28,10 @@ def main():
     parser.add_argument('--output-folder', type=str,
                         default="analysis",
                         help='The output folder name [DEFAULT: "analysis"]')
+    parser.add_argument('--group-by', type=str,
+                        choices=['d', 'w', 'm'],
+                        default="d",
+                        help='Group by day ("d"), week ("w") or month ("m") [DEFAULT: "d"]')
     parser.add_argument('--feature-filename', type=str,
                         default="",
                         help='The feature JSON filename [DEFAULT: ""]')
@@ -80,6 +84,7 @@ def main():
                 region=args.region,
                 concatenated=args.concat,
                 output_folder=args.output_folder,
+                group_by=args.group_by,
             )
             print(f"{STATUS_ARROW}Analyze all bins...")
             feature_bins = [elm for elm in args.feature_list.split(",") if elm]

diff --git a/Probe/analyzer/features.py b/Probe/analyzer/features.py
@@ -24,11 +24,14 @@ class Features(object):
     def __init__(self, features: dict, df: 'pd.DataFrame',
                  concatenated: bool = True,
                  output_folder: str = "analysis",
-                 region: str = 'all'):
-        self._df = df
-        self._concatenated = concatenated
-        self._region = region
+                 region: str = 'all',
+                 group_by: str = 'd'):
+        self._df: 'pd.DataFrame' = df
+        self._concatenated: bool = concatenated
+        self._region: str = region
+        self._group_by: str = group_by
         self._filter_data(concatenated)
+        self._add_group_column()
 
         self._output_folder = Path(output_folder)
         self._output_folder.mkdir(parents=True, exist_ok=True)
@@ -44,6 +47,25 @@ def __init__(self, features: dict, df: 'pd.DataFrame',
                 self._features.append(key)
                 setattr(self, key, cur_values)
 
+    def _add_group_column(self):  # adds 'datetime' plus one of 'day'/'week'/'month' chosen by self._group_by; other values add only 'datetime'
+        if self._concatenated:  # self._df is used as a single DataFrame
+            self._df['datetime'] = pd.to_datetime(self._df.reqDay, unit='s')  # reqDay is interpreted as epoch seconds
+            if self._group_by == 'd':
+                self._df['day'] = self._df.datetime.dt.day  # day of month, not a full date
+            elif self._group_by == 'w':
+                self._df['week'] = self._df.datetime.dt.week  # NOTE(review): dt.week is deprecated since pandas 1.1 and removed in 2.0; dt.isocalendar().week is the documented replacement
+            elif self._group_by == 'm':
+                self._df['month'] = self._df.datetime.dt.month  # month number only; the year is not kept
+        else:  # self._df is iterated as a collection of DataFrames
+            for cur_df in self._df:
+                cur_df['datetime'] = pd.to_datetime(cur_df.reqDay, unit='s')  # same conversion, written into each frame
+                if self._group_by == 'd':
+                    cur_df['day'] = cur_df.datetime.dt.day  # day of month, not a full date
+                elif self._group_by == 'w':
+                    cur_df['week'] = cur_df.datetime.dt.week  # NOTE(review): same dt.week deprecation as in the concatenated branch
+                elif self._group_by == 'm':
+                    cur_df['month'] = cur_df.datetime.dt.month  # month number only; the year is not kept
+
     def _filter_data(self, concatenated: bool = True):
         print(f"{STATUS_ARROW}Filter DataType data and mc")
         if concatenated:
@@ -93,14 +115,13 @@ def check_all_features(self, features: List[str] = []):
             cur_features.extend(self._features)
         for feature in tqdm(cur_features,
                             desc=f"{STATUS_ARROW}Check features",
-                            ascii=True, position=1):
+                            ascii=True):
             np_hist = self.check_bins_of(feature)
             self.plot_bins_of(feature, np_hist)
             self.plot_violin_of(feature, np_hist)
 
     def check_bins_of(self, feature: str, n_bins: int = 6):
-        cur_bins = getattr(
-            self, feature) if feature in self._features else n_bins
+        all_data = None
         if feature == 'size':
             if self._concatenated:
                 sizes = (self._df['Size'] / 1024**2).astype(int).to_numpy()
@@ -109,56 +130,56 @@ def check_bins_of(self, feature: str, n_bins: int = 6):
                 for cur_df in tqdm(
                         self._df,
                         desc=f"{STATUS_ARROW}Calculate sizes x day",
-                        ascii=True, position=0):
+                        ascii=True):
                     sizes = np.concatenate([
                         sizes, (cur_df['Size'] / 1024 **
                                 2).astype(int).to_numpy()
                     ])
             self._features_data[feature] = sizes
-            counts, bins = np.histogram(
-                sizes,
-                bins=cur_bins,
-                density=False
-            )
-            if feature in self._features:
-                return counts, bins
-            else:
-                return np.histogram(
-                    sizes,
-                    bins=bins.round(0),
-                    density=False
-                )
+            all_data = sizes
         elif feature == 'numReq':
-            files_x_day = None
+            groups = None
             if self._concatenated:
-                files = self._df[['Filename', 'reqDay']]
-                files_x_day = files.groupby('reqDay')
+                if self._group_by == 'd':
+                    groups = self._df.groupby('reqDay')
+                elif self._group_by == 'w':
+                    groups = self._df.groupby('week')
+                elif self._group_by == 'm':
+                    groups = self._df.groupby('month')
             else:
-                files_x_day = [
-                    (idx, cur_df[['Filename', 'reqDay']].copy())
-                    for idx, cur_df in enumerate(self._df)
-                ]
-            numReqXDay = np.array([])
-            for _, day in tqdm(files_x_day,
-                               desc=f"{STATUS_ARROW}Calculate frequencies x day",
-                               ascii=True, position=0):
-                numReqXDay = np.concatenate([
-                    numReqXDay, day.Filename.value_counts().to_numpy()
+                if self._group_by == 'd':
+                    groups = [
+                        (idx, cur_df)
+                        for idx, cur_df in enumerate(self._df)
+                    ]
+                else:
+                    if self._group_by == 'w':
+                        group_by = 'week'
+                    elif self._group_by == 'm':
+                        group_by = 'month'
+                    groups = {}
+                    for cur_df in self._df:
+                        for week, cur_week in cur_df.groupby(group_by):
+                            if week not in groups:
+                                groups[week] = cur_week
+                            else:
+                                groups[week] = pd.concat([
+                                    groups[week],
+                                    cur_week,
+                                ], ignore_index=True)
+                    groups = [
+                        (group_key, groups[group_key])
+                        for group_key in sorted(groups)
+                    ]
+            numReqXGroup = np.array([])
+            for _, group in tqdm(groups,
+                                 desc=f"{STATUS_ARROW}Calculate frequencies x day",
+                                 ascii=True):
+                numReqXGroup = np.concatenate([
+                    numReqXGroup, group.Filename.value_counts().to_numpy()
                 ])
-            self._features_data[feature] = numReqXDay
-            counts, bins = np.histogram(
-                numReqXDay,
-                bins=cur_bins,
-                density=False
-            )
-            if feature in self._features:
-                return counts, bins
-            else:
-                return np.histogram(
-                    numReqXDay,
-                    bins=bins.round(0),
-                    density=False
-                )
+            self._features_data[feature] = numReqXGroup
+            all_data = numReqXGroup
         elif feature == 'deltaLastRequest':
             delta_files = []
             files = {}
@@ -172,7 +193,7 @@ def check_bins_of(self, feature: str, n_bins: int = 6):
                 tot_files = len(all_files)
             for idx, filename in tqdm(enumerate(all_files),
                                       desc=f"{STATUS_ARROW}Calculate delta times",
-                                      ascii=True, position=0,
+                                      ascii=True,
                                       total=tot_files):
                 if filename not in files:
                     files[filename] = idx
@@ -182,30 +203,45 @@ def check_bins_of(self, feature: str, n_bins: int = 6):
                     delta_files.append(cur_delta)
             delta_files = np.array(delta_files)
             self._features_data[feature] = delta_files
-            counts, bins = np.histogram(
-                delta_files,
-                bins=cur_bins,
-                density=False
-            )
-            if feature in self._features:
-                return counts, bins
-            else:
-                return np.histogram(
-                    delta_files,
-                    bins=bins.round(0),
-                    density=False
-                )
+            all_data = delta_files
         else:
             raise Exception(
                 f"ERROR: feature {feature} can not be checked...")
 
+        if feature in self._features:
+            cur_bins = np.array(getattr(self, feature))
+        else:
+            _, cur_bins = np.histogram(
+                sizes,
+                bins=n_bins,
+                density=False
+            )
+
+        prev_bin = 0.
+        counts = []
+        for bin_idx, cur_bin in enumerate(cur_bins):
+            if bin_idx != cur_bins.shape[0] - 1:
+                cur_count = all_data[
+                    (all_data > prev_bin) &
+                    (all_data <= cur_bin)
+                ].shape[0]
+            else:
+                cur_count = all_data[
+                    (all_data > prev_bin)
+                ].shape[0]
+            counts.append(cur_count)
+            prev_bin = cur_bin
+
+        counts = np.array(counts)
+        return counts, cur_bins
+
     def plot_bins_of(self, feature: str, np_hist: tuple):
         counts, bins = np_hist
         # print(counts, bins)
         percentages = (counts / counts.sum()) * 100.
         percentages[np.isnan(percentages)] = 0.
         fig = px.bar(
-            x=[str(cur_bin) for cur_bin in bins[1:]],
+            x=[str(cur_bin) for cur_bin in bins],
             y=percentages,
             title=f"Feature {feature}",
         )
@@ -238,34 +274,46 @@ def plot_violin_of(self, feature: str, np_hist: tuple):
         _, bins = np_hist
         cur_feature_data = self._features_data[feature]
         fig = go.Figure()
-        prev = bins[0]
+
         fig.add_trace(
             go.Violin(
                 y=cur_feature_data,
-                x0=-1,
+                x0=0,
                 name="global",
                 box_visible=True,
                 meanline_visible=True,
             )
         )
-        for cur_bin in bins[1:]:
-            cur_data = cur_feature_data[
-                (cur_feature_data >= prev) &
-                (cur_feature_data < cur_bin)
-            ]
+        prev_bin = 0.
+        for bin_idx, cur_bin in enumerate(bins, 1):
+            if bin_idx != bins.shape[0]:
+                cur_data = cur_feature_data[
+                    (cur_feature_data > prev_bin) &
+                    (cur_feature_data <= cur_bin)
+                ]
+            else:
+                cur_data = cur_feature_data[
+                    (cur_feature_data > prev_bin)
+                ]
             fig.add_trace(
                 go.Violin(
                     y=cur_data,
+                    x0=bin_idx,
                     name=str(cur_bin),
                     box_visible=True,
                     meanline_visible=True,
-                    points="all",
+                    # points="all",
                 )
             )
-            prev = cur_bin
+            prev_bin = cur_bin
         fig.update_layout(_LAYOUT)
         fig.update_layout({
             'title': f"Feature {feature}",
+            'xaxis': {
+                'tickmode': 'array',
+                'tickvals': list(range(len(bins)+1)),
+                'ticktext': ['global'] + [str(cur_bin) for cur_bin in bins]
+            }
         })
         # fig.show()
         # print(f"{STATUS_ARROW}Save violin plot of {feature} as png")