<a href="https://colab.research.google.com/github/Srikanth635/COMNETS/blob/main/Source_Code/JSONExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import json
import re
import pickle
import shutil
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy as sp
from scipy import stats

### Segmenting Information from the JSON File

In [None]:
class json_segmentation:
    """Flatten the nested ``simulations`` records of the raw JSON frame."""

    def __init__(self):
        pass

    @staticmethod
    def _stats_to_frame(stats_column, label_of, value_of, index):
        """Turn a Series of stat-entry lists into one column per statistic.

        Args:
            stats_column: Series whose cells are lists of stat entries
                (each entry is itself an indexable sequence).
            label_of: callable mapping one entry to its column name.
            value_of: callable mapping one entry to its value.
            index: index to give the resulting frame so that it lines up
                row-for-row with the frame the stats were taken from.

        Returns:
            DataFrame with one row per cell of ``stats_column``. Column names
            come from the first cell only.

        Raises:
            ValueError: if a cell holds a different number of entries than
                the first one (raised by ``np.vstack``).
        """
        # Positional access: the first row defines the column names even
        # when the index does not contain the label 0.
        columns = [label_of(entry) for entry in stats_column.iloc[0]]
        rows = [[value_of(entry) for entry in entries] for entries in stats_column]
        # A float zeros row is stacked first and sliced off again so that the
        # dtype promotion stays what it has always been (numeric values come
        # out as float64). Stacking once avoids re-copying the whole matrix
        # for every row.
        stacked = np.vstack([np.zeros(len(columns))] + rows)[1:]
        return pd.DataFrame(data=stacked, columns=columns, index=index)

    def function1(self, df):
        """Expand ``df.simulations`` into a single flat DataFrame.

        Each cell of ``df.simulations`` is expanded with ``pd.Series``; its
        ``meta`` entry is expanded the same way. ``meta.scalar_stats`` becomes
        columns named ``<entry[0]>_<entry[1]>`` holding ``entry[2]``, and
        ``meta.sim_runtime_stats`` becomes columns named ``entry[0]`` holding
        ``entry[1]``.

        Args:
            df: DataFrame with a ``simulations`` column. Each expanded record
                must provide ``meta`` and ``sim_state_times``; ``meta`` must
                provide ``scalar_stats`` and ``sim_runtime_stats``.

        Returns:
            DataFrame indexed like ``df`` containing, in order: the remaining
            simulation fields, the remaining meta fields, the scalar-stat
            columns and the runtime-stat columns. ``sim_state_times`` is not
            part of the result.

        Raises:
            json.JSONDecodeError: if a ``sim_state_times`` value is not valid
                JSON.
        """
        # Expand the simulation records, then their nested meta records.
        sim_df = df.simulations.apply(pd.Series)
        sim_meta_df = sim_df.meta.apply(pd.Series)

        # sim_state_times is excluded from the result, but it is still parsed
        # so that malformed JSON keeps failing loudly instead of slipping by.
        for raw_state_times in sim_df.sim_state_times:
            json.loads(raw_state_times)

        # Reusing sim_df's index keeps pd.concat(axis=1) aligned row-for-row
        # even when the input frame does not carry a 0..n-1 index.
        meta_scalar_df = self._stats_to_frame(
            sim_meta_df.scalar_stats,
            lambda entry: entry[0] + '_' + entry[1],
            lambda entry: entry[2],
            sim_df.index,
        )
        meta_sim_timeStats_df = self._stats_to_frame(
            sim_meta_df.sim_runtime_stats,
            lambda entry: entry[0],
            lambda entry: entry[1],
            sim_df.index,
        )

        # The nested columns have been expanded (or deliberately left out).
        sim_meta_df = sim_meta_df.drop(['scalar_stats', 'sim_runtime_stats'], axis=1)
        sim_df = sim_df.drop(['sim_state_times', 'meta'], axis=1)

        segregated_df = pd.concat(
            [sim_df, sim_meta_df, meta_scalar_df, meta_sim_timeStats_df], axis=1
        )
        return segregated_df

### Information Extraction from segmented JSON

In [None]:
class ini_feature_extraction:
    """Extract selected parameter values from each row's ``omnetppini`` text."""

    # Filler allowed between the run-config name and the parameter name.
    # This variant also admits '[' and is lazy, so it stops at the first
    # occurrence of the parameter after the run-config name.
    _GAP_LAZY = r'[\]\n\w\s\=\-\/\"\$\}\[\{\.\*\d\)\(\#\+\,\\n]+?'
    # This variant excludes '[' and is greedy, so it cannot run across a
    # later '[' and prefers the last occurrence it can reach.
    _GAP_GREEDY_NO_OPEN_BRACKET = r'[\]\n\w\s\=\-\/\"\$\}\{\.\*\d\)\(\#\+\,\\n]+'

    # (output key, pattern appended to the escaped run-config name).
    # The order fixes the key order of the returned dict.
    _FEATURE_PATTERNS = (
        ('numNodes', _GAP_GREEDY_NO_OPEN_BRACKET + r'numNodes = (\d{1,})'),
        ('dataGenerationInterval', _GAP_LAZY + r'dataGenerationInterval = (\d{1,})'),
        ('dataSizeInBytes', _GAP_GREEDY_NO_OPEN_BRACKET + r'dataSizeInBytes = (\d{1,})'),
        ('constraintAreaMaxX', _GAP_LAZY + r'constraintAreaMaxX = (\d{1,})'),
        ('constraintAreaMaxY', _GAP_LAZY + r'constraintAreaMaxY = (\d{1,})'),
        ('noOfLocations', _GAP_LAZY + r'noOfLocations = (\d{1,})'),
        ('Hosts', _GAP_LAZY + r'Hosts = (\d{1,})'),
        ('speed', _GAP_LAZY + r'speed = (\d{1,}\.\d{1,}|\d{1,})'),
        ('forwardingLayer', _GAP_LAZY + r'forwardingLayer = \"([A-Za-z]+)\"'),
        ('maximumCacheSize', _GAP_GREEDY_NO_OPEN_BRACKET + r'maximumCacheSize = (\d+)'),
        ('app_layer', _GAP_GREEDY_NO_OPEN_BRACKET + r'applicationLayer = "([\w]+)"'),
    )

    # Used for numNodes only, when nothing is found after the run-config name.
    _NUM_NODES_ANYWHERE = r'numNodes = (\d+)'

    def __init__(self):
        pass

    def feature_separation(self, segmented_df):
        """Collect one value per feature for every row of ``segmented_df``.

        For each row the run-config name (``runconfig``) is located in the ini
        text (``omnetppini``) and every feature pattern is searched after it.

        Args:
            segmented_df: DataFrame with string columns ``omnetppini`` and
                ``runconfig``.

        Returns:
            dict mapping each feature name to a list with one entry per row.
            An entry is the first captured text (a ``str``) or the integer
            ``0`` when the feature was not found. ``numNodes`` additionally
            falls back to the first ``numNodes = <digits>`` anywhere in the
            ini text before giving up.
        """
        feat_dict = {name: [] for name, _ in self._FEATURE_PATTERNS}

        # Values come from the yielded row rather than a second label lookup,
        # so duplicate index labels cannot hand a Series to the regex engine.
        for _, row in segmented_df.iterrows():
            ini_text = row['omnetppini']
            # The run-config name is data, not a pattern: escape it so that
            # characters such as '+', '(' or '.' are matched literally.
            section = re.escape(row['runconfig'])

            for name, tail in self._FEATURE_PATTERNS:
                found = re.findall(section + tail, ini_text)
                if not found and name == 'numNodes':
                    found = re.findall(self._NUM_NODES_ANYWHERE, ini_text)
                feat_dict[name].append(found[0] if found else 0)

        return feat_dict

### Converting the extracted information into a pandas DataFrame

In [None]:
class dataSetGenerator:
    """Assemble the feature/target table from an already segmented frame."""

    def __init__(self):
        pass

    def dataSetGen(self, df_to_segment):
        """Build the final data set from ``df_to_segment``.

        Rows whose ``runconfig`` is in a fixed exclusion list are removed,
        ini features are extracted for the remaining rows and joined
        column-wise with the target columns. Rows with a zero in any of the
        checked columns are dropped, ``speed`` is removed and the numeric
        columns are converted with ``pd.to_numeric``.

        Args:
            df_to_segment: DataFrame providing ``runconfig``, ``omnetppini``
                and the four target columns selected below.

        Returns:
            The cleaned DataFrame. Two shape summaries are printed on the way.
        """
        # Run configurations that are left out of the data set.
        excluded_configs = [
            'General',
            'Config Benchmark-04-Conference-Scenario',
            'Config Benchmark-07-Office-Scenario',
            'Config Benchmark-05-University-Scenario',
            'Config Benchmark-06-Roller-Skate-Scenario',
            'Config Benchmark-02-Typhoon-Disaster-Scenario',
        ]
        target_columns = [
            'runconfig',
            'peak_disk_usage',
            'peak_sim_ram_usage',
            'peak_results_ram_usage',
            'totaljobclocktimesec',
        ]

        targets = df_to_segment[target_columns]
        kept_rows = df_to_segment[
            ~df_to_segment.runconfig.isin(excluded_configs)
        ].reset_index(drop=True)
        kept_targets = targets[
            ~targets.runconfig.isin(excluded_configs)
        ].reset_index(drop=True)
        print("Segments : ", kept_rows.shape, kept_targets.shape)

        # Features and targets share the same fresh 0..n-1 index, so they can
        # be placed side by side.
        extractor = ini_feature_extraction()
        features = pd.DataFrame(extractor.feature_separation(kept_rows))
        full_df = pd.concat([features, kept_targets], axis=1)

        # A row is discarded as soon as any one of these columns equals 0.
        zero_check_columns = [
            'Hosts',
            'peak_results_ram_usage',
            'peak_sim_ram_usage',
            'peak_disk_usage',
            'constraintAreaMaxX',
            'constraintAreaMaxY',
        ]
        has_zero = full_df[zero_check_columns[0]] == 0
        for column in zero_check_columns[1:]:
            has_zero = has_zero | (full_df[column] == 0)

        final_df = full_df.drop(full_df[has_zero].index)
        final_df = final_df.drop('speed', axis=1)

        numeric_columns = [
            'numNodes',
            'dataGenerationInterval',
            'dataSizeInBytes',
            'constraintAreaMaxX',
            'constraintAreaMaxY',
            'noOfLocations',
            'Hosts',
            'maximumCacheSize',
            'totaljobclocktimesec',
        ]
        final_df[numeric_columns] = final_df[numeric_columns].apply(pd.to_numeric)

        print("Final df shape : ", final_df.shape)
        return final_df