# Analytics

#### Date: 2020/02

#### SUMMARY:

- This notebook represents the project quality analysis of the date exposed right above. 

### TEAM:

##### Semester: 2020/02
##### Professor: Hilmer Neri

##### Members:

- Member x
- Member y

### LIBRARIES

In [2]:
# Deal with data
import pandas as pd
import numpy as np
import json
from glob import glob
import os

# Deal with API request
import urllib3
from urllib3 import request

# Deal with visualization
import seaborn as sns
import matplotlib.pyplot as plt

### GRAPH SETTINGS

In [3]:
%config InlineBackend.figure_format ='retina'
sns.set(font_scale=1.5)
sns.set_style('darkgrid',
              {'xtick.bottom' : True,
               'ytick.left': True,
               'grid.linestyle':'--',
               'font.monospace': ['Computer Modern Typewriter'],
               'axes.edgecolor' : 'white'})

### DATAFRAME SETTINGS

In [4]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

### SonarCloud

##### Path to the folder with all your jsons

In [5]:
jsons = glob('**/*.json') # add the path here

In [6]:
def read_json(json_path):
    
    with open(json_path) as json_file:
        json_obj = json.load(json_file)
        
    return json_obj

def create_base_component_df(json_list):
    
    df = pd.DataFrame()

    for i in json_list:

        base_component = read_json(i)

        base_component_data = base_component['baseComponent']['measures']

        base_component_df = pd.DataFrame(base_component_data)

        base_component_df['filename'] = os.path.basename(i)

        df = df.append(base_component_df, ignore_index=True)
        
    aux_df = df['filename'].str.split(r"fga-eps-mds-2020_2-(.*?)-(.*?).json", expand=True)
    
    df['repository'] = aux_df[1]
    
    df['version'] = aux_df[2]
    
    df = df.sort_values(by=['repository', 'version'])
        
    return df

#### Create base component dataframe

In [7]:
base_component_df = create_base_component_df(jsons)

In [9]:
base_component_df.head(10)

Unnamed: 0,metric,value,bestValue,filename,repository,version
611,duplicated_lines_density,0.0,True,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
612,functions,42.0,,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
613,test_execution_time,70.0,,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
614,security_rating,1.0,True,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
615,tests,21.0,,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
616,files,19.0,,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
617,complexity,60.0,,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
618,ncloc,626.0,,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
619,coverage,68.2,False,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
620,reliability_rating,1.0,True,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00


#### Create dataframe per file

In [10]:
metric_list = ['files',
               'functions',
               'complexity',
               'comment_lines_density',
               'duplicated_lines_density',
               'coverage',
               'ncloc',
               'security_rating',
               'tests',
               'test_success_density',
               'test_execution_time',
               'reliability_rating']

len(metric_list)

12

In [11]:
def metric_per_file(json):
    
    file_json = []
    
    for component in json['components']:
        if component['qualifier'] == 'FIL':
            file_json.append(component)
            
    return file_json

def generate_file_dataframe_per_release(metric_list, json, language_extension):
    
    df_columns = metric_list
    df = pd.DataFrame(columns = df_columns)
    
    for file in json:
        try:
            if file['language'] == language_extension:
                for measure in file['measures']:
                    df.at[file['path'], measure['metric']] = measure['value']
        except:
            pass
        
    df.reset_index(inplace = True)
    df = df.rename({'index': 'path'}, axis=1).drop(['files'], axis=1)

    return df

def create_file_df(json_list):
    
    df = pd.DataFrame()

    for i in json_list:

        file_component = read_json(i)
        
        file_component_data = metric_per_file(file_component)

        file_component_df = generate_file_dataframe_per_release(metric_list, file_component_data, language_extension = 'ts')

        file_component_df['filename'] = os.path.basename(i)

        df = df.append(file_component_df, ignore_index=True)
        
    # replace TeamName by yours.    
    aux_df = df['filename'].str.split(r"fga-eps-mds-2020_2-(.*?)-(.*?).json", expand=True)
    
    df['repository'] = aux_df[1]
    
    df['version'] = aux_df[2]
    
    df = df.sort_values(by=['repository', 'version'])
        
    return df

In [12]:
file_component_df = create_file_df(jsons)

In [13]:
file_component_df.head(10)

Unnamed: 0,path,functions,complexity,comment_lines_density,duplicated_lines_density,coverage,ncloc,security_rating,tests,test_success_density,test_execution_time,reliability_rating,filename,repository,version
724,src/migration/1618526567509-CreateComplaint.ts,2,2,0.0,0.0,0.0,76,1.0,,100.0,,1.0,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
725,src/migration/1618529825437-CreateVote.ts,2,2,0.0,0.0,0.0,38,1.0,,100.0,,1.0,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
726,src/utils/Category.ts,0,0,0.0,0.0,100.0,5,1.0,,100.0,,1.0,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
727,src/entity/Complaint.ts,0,0,0.0,0.0,100.0,36,1.0,,100.0,,1.0,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
728,src/repositories/ComplaintRepository.ts,7,7,0.0,0.0,61.3,80,1.0,,100.0,,1.0,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
729,src/utils/ComplaintUpvote.ts,1,3,0.0,0.0,63.6,16,1.0,,100.0,,1.0,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
730,src/utils/ComplaintVote.ts,0,0,0.0,0.0,,9,1.0,,100.0,,1.0,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
731,src/utils/ComplaintVoteConfirmed.ts,1,3,0.0,0.0,92.3,18,1.0,,100.0,,1.0,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
732,src/utils/ComplaintWithVote.ts,0,0,0.0,0.0,,18,1.0,,100.0,,1.0,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00
733,src/controllers/ControllerComplaint.ts,13,24,0.0,0.0,90.3,170,1.0,,100.0,,1.0,fga-eps-mds-2020_2-Eccoar_Complaint-01-05-2021...,Eccoar_Complaint,01-05-2021-00:00


In [14]:
file_component_df.to_excel('data/data.xlsx', index = False)

#### Create dataframe per repository

In [15]:
# Example

eccoar_complaint_df = file_component_df[file_component_df['repository'] == 'Eccoar_Complaint']
eccoar_gateway_df = file_component_df[file_component_df['repository'] == 'Eccoar_Gateway']
eccoar_frontend_df = file_component_df[file_component_df['repository'] == 'eccoar_frontend']
eccoar_reports_df = file_component_df[file_component_df['repository'] == 'Eccoar_Reports']
eccoar_mailer_df = file_component_df[file_component_df['repository'] == 'Eccoar_Mailer']
eccoar_users_df = file_component_df[file_component_df['repository'] == 'Eccoar_Users']
...

Ellipsis

### Metric calculations

##### COMPLEXITY

In [16]:
def m1(df):
    
    density_non_complex_files = len(df[(df['complexity'].astype(float)/df['functions'].astype(float)) < 10])/len(df)
    
    return density_non_complex_files

##### COMMENTS

In [17]:
def m2(df):
    
    density_comment_files = len(df[(df['comment_lines_density'].astype(float) > 10) & (df['comment_lines_density'].astype(float) < 30)])/len(df)
    
    return density_comment_files

##### DUPLICATIONS

In [18]:
def m3(df):
    
    duplication = len(df[(df['duplicated_lines_density'].astype(float) < 5)])/len(df)
    
    return duplication

In [19]:
##### RESOLVED ISSUES' THROUGHPUT

In [20]:
def m7(number_of_issues_resolved, number_of_issues):
    
    resolved_issues_throughput = round((number_of_issues_resolved / number_of_issues) * 100, 2)
    
    return resolved_issues_throughput

In [21]:
##### ISSUE TYPE IN A TIMEFRAME


In [22]:
def density(issue, number_of_issues):
    issue_density = round((issue / number_of_issues) * 100, 2)
    return issue_density

In [23]:
def m8(tag_dict, number_of_issues):
    
    issue_densities = {
        "hotfix": [density(tag_dict["HOTFIX"], number_of_issues)],
        "docs": [density(tag_dict["DOCS"], number_of_issues)],
        "feature": [density(tag_dict["FEATURE"], number_of_issues)],
        "arq": [density(tag_dict["ARQ"], number_of_issues)],
        "devops": [density(tag_dict["DEVOPS"], number_of_issues)],
        "analytics": [density(tag_dict["ANALYTICS"], number_of_issues)],
        "us": [density(tag_dict["US"], number_of_issues)],
        "easy": [density(tag_dict["EASY"], number_of_issues)],
        "medium": [density(tag_dict["MEDIUM"], number_of_issues)],
        "hard": [density(tag_dict["HARD"], number_of_issues)],
        "eps": [density(tag_dict["EPS"], number_of_issues)],
        "mds": [density(tag_dict["MDS"], number_of_issues)]
    }

    issue_densities = pd.DataFrame.from_dict(issue_densities).T.reset_index()
    
    issue_densities.columns = ['density' ,'percentage']
    
    return issue_densities

In [24]:
##### BUGS RATIO

In [25]:
def m9(tag_dict, number_of_issues):

    bugs_ratio = round(((tag_dict["DOCS"] + tag_dict["FEATURE"] + tag_dict["ARQ"] + tag_dict["DEVOPS"] + tag_dict["ANALYTICS"]) / number_of_issues) * 100, 2)
    
    return bugs_ratio

In [42]:
TAGS = {
    'HOTFIX': 15,
    'DOCS': 121,
    'FEATURE': 32,
    'ARQ': 15,
    'DEVOPS': 12,
    'ANALYTICS': 23,
    'US': 19,
    'EASY': 28,
    'MEDIUM': 22,
    'HARD': 7,
    'EPS': 61,
    'MDS': 41
}
NUMBER_OF_ISSUES_RESOLVED=201
NUMBER_OF_ISSUES=236

### Calculate m1, m2 and m3 for each repository

In [27]:
def create_metrics_df(df):
    
    version_vec = df['version'].unique()
    
    m1_list = []
    m2_list = []
    m3_list = []
    m7_list = []
    m8_list = []
    m9_list = []
    repository_list = []
    version_list = []
    
    metrics_df = pd.DataFrame()
    
    for version in version_vec:

        version_df = df[df['version'] == version]

        m1_list.append(m1(version_df))
        m2_list.append(m2(version_df))
        m3_list.append(m3(version_df))
        m7_list.append(m7(NUMBER_OF_ISSUES_RESOLVED, NUMBER_OF_ISSUES))
        m8_list.append(m8(TAGS, NUMBER_OF_ISSUES))
        m9_list.append(m9(TAGS, NUMBER_OF_ISSUES))
        repository_list.append(version_df['repository'].iloc[0])
        version_list.append(version)
        
    metrics_df = pd.DataFrame({'m1': m1_list,
                               'm2': m2_list,
                               'm3': m3_list,
                               'm7': m7_list,
                               'm8': m8_list,
                               'm9': m9_list,
                               'repository': repository_list, 
                               'version': version_list})
        
    return metrics_df

In [28]:
eccoar_gateway_metrics = create_metrics_df(eccoar_gateway_df)
eccoar_complaint_metrics = create_metrics_df(eccoar_complaint_df)
eccoar_frontend_metrics = create_metrics_df(eccoar_frontend_df)
eccoar_mailer_metrics = create_metrics_df(eccoar_mailer_df)
eccoar_reports_metrics = create_metrics_df(eccoar_reports_df)
eccoar_users_metrics = create_metrics_df(eccoar_users_df)
file_component_metrics = create_metrics_df(file_component_df)
...

Ellipsis

### Data visualization

- You must do this for each of your repositories

In [29]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1['m1'], linewidth=3, marker='o', markersize=10)

NameError: name 'repo1' is not defined

<Figure size 1440x720 with 0 Axes>

In [None]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1['m2'], linewidth=3, marker='o', markersize=10)

In [None]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1['m3'], linewidth=3, marker='o', markersize=10)

In [None]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1['m1'], linewidth=3, marker='o', markersize=10)
plt.plot(repo1['m2'], linewidth=3, marker='o', markersize=10)
plt.plot(repo1['m3'], linewidth=3, marker='o', markersize=10)

### Sub characteristic aggregation

- You must do this for each of your repositories

In [30]:
psc1 = 1
pm1 = 0.33
pm2 = 0.33
pm3 = 0.33

repo1['asc1'] = ((repo1['m1']*pm1)+(repo1['m2']*pm2)+(repo1['m3']*pm3))*psc1
repo2['asc1'] = ((repo2['m1']*pm1)+(repo2['m2']*pm2)+(repo2['m3']*pm3))*psc1
...

NameError: name 'repo1' is not defined

In [None]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo1['asc1'], linewidth=3, marker='o', markersize=10)

In [None]:
fig = plt.figure(figsize=(20, 10))

plt.plot(repo2['asc1'], linewidth=3, marker='o', markersize=10)

In [31]:
fig = plt.figure(figsize=(20, 10))


plt.plot(repo1['asc1'], linewidth=3, marker='o', markersize=5)
plt.plot(repo2['asc1'], linewidth=3, marker='o', markersize=5)
...

NameError: name 'repo1' is not defined

<Figure size 1440x720 with 0 Axes>

In [32]:
metrics_df = pd.concat([repo1_metrics, repo2_metrics, ...], ignore_index=True)

metrics_df['ac1'] = metrics_df['asc1'] * 1
metrics_df['total'] = metrics_df['asc1'] * 1

NameError: name 'repo1_metrics' is not defined

In [None]:
metrics_df

In [33]:
metrics_df.to_excel('data/metrics_df.xlsx', index = False)

NameError: name 'metrics_df' is not defined

# Descriptive Statistic Analysis

- Realizes the calculation of statistic data such as mean, median, mode, min, max, standard deviation and variance.

In [34]:
def descriptive_statistics(df):
    
    metrics = df.describe()
    variance = df.var()
    variance_df = pd.DataFrame(variance, columns=["var"])
    variance_df = variance_df.T
    return metrics.append(variance_df).T

In [35]:
descriptive_statistics(eccoar_complaint_metrics)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var
m1,10.0,0.610169,0.066453,0.533333,0.561404,0.583591,0.666667,0.7,0.004416
m2,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m3,10.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [36]:
descriptive_statistics(eccoar_gateway_metrics)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var
m1,10.0,0.928409,0.062483,0.875,0.875,0.892045,1.0,1.0,0.003904
m2,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m3,10.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [37]:
descriptive_statistics(eccoar_frontend_metrics)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var
m1,11.0,0.889958,0.019513,0.85,0.875,0.888889,0.90625,0.911765,0.000381
m2,11.0,0.036681,0.006504,0.029412,0.03125,0.037037,0.041667,0.05,4.2e-05
m3,11.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [38]:
descriptive_statistics(eccoar_reports_metrics)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var
m1,6.0,0.775926,0.116198,0.7,0.7,0.738889,0.777778,1.0,0.013502
m2,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m3,6.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [39]:
descriptive_statistics(eccoar_users_metrics)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var
m1,7.0,0.761905,0.16265,0.666667,0.666667,0.666667,0.833333,1.0,0.026455
m2,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m3,7.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [40]:
descriptive_statistics(eccoar_mailer_metrics)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var
m1,8.0,0.647619,0.042592,0.6,0.6,0.666667,0.666667,0.714286,0.001814
m2,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m3,8.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [41]:
descriptive_statistics(eccoar_reports_metrics)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var
m1,6.0,0.775926,0.116198,0.7,0.7,0.738889,0.777778,1.0,0.013502
m2,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
m3,6.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [38]:
descriptive_statistics(file_component_metrics)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var
m1,26.0,0.774633,0.109821,0.588235,0.666667,0.755,0.875,0.90625,0.012061
m2,26.0,0.014387,0.018948,0.0,0.0,0.0,0.03125,0.05,0.000359
m3,26.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
