# MGL869 - Lab

*MGL869 ETS Montreal - Production engineering*

## Abstract

## Authors
- **Léo FORNOFF**
- **William PHAN**
- **Yannis OUAKRIM**

---

## Part 1 : Data collection

In [None]:
from Jira import jira_download
from pandas import Index
from numpy import ndarray


### 1.1 - Download Jira data
The Jira data is downloaded only if it is not already present in the data folder.

`jira_download()` returns the data as a dataframe.

The query filter can be defined in `config.ini`.

In [None]:
# Jira issues as a dataframe, obtained through the project helper jira_download.
jira_dataframe = jira_download()

### 1.2 - Clean Jira data using pandas
In the previous step, we downloaded all the data from Jira. Now, we clean it using pandas:
we combine the multi-valued version columns into single columns and keep only the columns we need.

In [None]:
# Columns retained once the cleaning steps below have run; the two
# '... Combined' columns are created by the next cells before the selection.
keep: list[str] = ['Issue key', 'Status', 'Resolution', 'Created', 'Fix Versions Combined', 'Affects Versions Combined']

In [None]:
# Collect every column whose name starts with 'Affects Version/s'.
affects_version_columns: list[str] = [
    name for name in jira_dataframe.columns if name.startswith('Affects Version/s')
]

# For each issue (row), join the non-missing values of those columns into a
# single comma-separated string.
jira_dataframe['Affects Versions Combined'] = jira_dataframe[affects_version_columns].apply(
    lambda row: ', '.join(row.dropna().astype(str)),
    axis=1,
)

In [None]:
# Collect every column whose name starts with 'Fix Version/s'.
fix_version_columns: list[str] = [
    name for name in jira_dataframe.columns if name.startswith('Fix Version/s')
]

# For each issue (row), join the non-missing values of those columns into a
# single comma-separated string.
jira_dataframe['Fix Versions Combined'] = jira_dataframe[fix_version_columns].apply(
    lambda row: ', '.join(row.dropna().astype(str)),
    axis=1,
)

# Reduce the dataframe to the columns listed in `keep`.
jira_dataframe = jira_dataframe.loc[:, keep]

In [None]:
# Columns whose name contains 'Issue key'.
issue_key_columns: Index = jira_dataframe.columns[
    jira_dataframe.columns.str.contains('Issue key')
]

# Their values as a NumPy array, then flattened to one dimension.
issue_key_values: ndarray = jira_dataframe[issue_key_columns].values
flattened_issue_keys: ndarray = issue_key_values.flatten()

# A set keeps each issue key once.
ids: set = set(flattened_issue_keys)

---


## Part 2 : Repository analysis


In [None]:
from Hive import git_download, commit_analysis, update_commit_dataframe, filter_versions_by_min
from git import Repo, Tag
from pandas import DataFrame
from configparser import ConfigParser
from re import compile
from packaging import version  

### 2.1 - Clone repository

In [None]:
# Git repository handle obtained through the project helper git_download;
# its tags are read further down to list the released versions.
repo: Repo = git_download()

In [None]:
# Run the project's commit analysis on the unique Jira issue keys. The result
# is loaded below as rows of (Issue key, File, Commit).
all_couples = commit_analysis(ids)

### 2.2 - Filter data

In [None]:
# Tabulate the analysis result with one column each for the issue key, the
# file and the commit.
commit_dataframe: DataFrame = DataFrame(all_couples, columns=["Issue key", "File", "Commit"])

In [None]:
# Load the project configuration. ConfigParser.read silently skips files it
# cannot find, so fail early with a clear message instead of a later KeyError.
config: ConfigParser = ConfigParser()
if not config.read("config.ini"):
    raise FileNotFoundError("config.ini not found in the current working directory")

# Comma-separated file suffixes, stripped of surrounding whitespace. Blank
# entries (e.g. left by a trailing comma) are dropped: every string ends with
# '', so a single empty suffix would let every file through the filter.
languages: list[str] = [
    suffix.strip()
    for suffix in config["GENERAL"]["Languages"].split(",")
    if suffix.strip()
]

# Keep only the rows whose file name ends with one of the configured suffixes.
commit_dataframe: DataFrame = commit_dataframe[commit_dataframe['File'].str.endswith(tuple(languages))]

In [None]:
# Hand the filtered commit rows and the cleaned Jira data to the project
# helper update_commit_dataframe.
couples = update_commit_dataframe(commit_dataframe, jira_dataframe)
# Bare expression so the notebook displays the result.
couples

### 2.3 - Extract and filter versions from git

In [None]:
# Release-name patterns from config.ini: comma-separated, whitespace-stripped
# and compiled (`compile` here is re.compile, imported above). Blank entries
# are skipped because an empty pattern matches any string with match/search.
# NOTE(review): a pattern that itself contains a comma (e.g. "{1,3}") would be
# split by this parsing - confirm the configured patterns avoid commas.
releases_regex: list = [
    compile(raw_pattern.strip())
    for raw_pattern in config["GIT"]["ReleasesRegex"].split(",")
    if raw_pattern.strip()
]

# Every git tag of the repository, mapped from tag name to tagged commit.
tags: list[Tag] = repo.tags
versions: dict = {tag.name: tag.commit for tag in tags}

In [None]:
# Select the tagged versions using the release patterns, with "2.0.0" passed
# as the minimum version.
# NOTE(review): exact matching/ordering rules live in Hive.filter_versions_by_min.
filtered_versions = filter_versions_by_min(versions, releases_regex, "2.0.0")

## Part 3 : Understand analysis

In [None]:
from Understand.commands import und_create_command, und_purge_command
from Understand.metrics import metrics
from Understand.label import label_all_metrics
from os import path
from Understand import merge_static_metrics
from Understand.enrich import enrich_metrics
from Understand.update import merge_all_metrics

### 3.1 - Create the Understand project


In [None]:
# Configuration entries that locate the Understand project on disk.
hive_git_directory: str = config["GIT"]["HiveGitDirectory"]
data_directory: str = config["GENERAL"]["DataDirectory"]
understand_project_name: str = config["UNDERSTAND"]["UnderstandProjectName"]

# <DataDirectory>/<HiveGitDirectory>/<UnderstandProjectName>
understand_project_path: str = path.join(
    data_directory,
    hive_git_directory,
    understand_project_name,
)

# Create the Understand project only when that path does not exist yet.
if not path.exists(understand_project_path):
    und_create_command()

In [None]:
# Run the project's Understand purge command (see Understand.commands).
und_purge_command()

### 3.2 - Metrics extraction


In [None]:
# Run the metrics extraction (Understand.metrics) for the filtered versions.
metrics(filtered_versions)

### 3.3 - Labeling


In [None]:
# Labeling step: pass the issue/file/commit couples to label_all_metrics.
label_all_metrics(couples)

In [None]:
# Pass the same couples to the enrichment step (see Understand.enrich).
enrich_metrics(couples)

In [None]:
# Versions whose metrics are merged.
# NOTE(review): this list is hard-coded independently of `filtered_versions`
# computed in Part 2 - confirm the two stay in sync when the release set changes.
v: list[str] = [
    "2.0.0", "2.0.1", "2.1.0", "2.1.1", "2.2.0", "2.3.0", "2.3.1", "2.3.2",
    "2.3.3", "2.3.4", "2.3.5", "2.3.6", "2.3.7", "2.3.8", "2.3.9", "2.3.10",
    "3.0.0", "3.1.0", "3.1.1", "3.1.2", "3.1.3", "4.0.0", "4.0.1"
]
merge_all_metrics(v)

In [None]:
# Merge the static metrics via the project helper; the modelling cells below
# read a merged static-metrics file whose location comes from config.ini.
merge_static_metrics()

In [None]:
from AI import run_pipeline
import os
from configparser import ConfigParser
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Reload config.ini into a fresh parser (presumably so this section can run on
# its own). ConfigParser.read silently skips files it cannot find, so fail
# early with a clear message instead of a KeyError in the next cell.
config: ConfigParser = ConfigParser()
if not config.read("config.ini"):
    raise FileNotFoundError("config.ini not found in the current working directory")

In [None]:
# Location of the merged static-metrics file:
# <DataDirectory>/<FullStaticMetricsOutputDirectory>/<MergedStaticMetricsFileName>
data_directory: str = config["GENERAL"]["DataDirectory"]
output_dir: str = config["UNDERSTAND"]["FullStaticMetricsOutputDirectory"]
file_name: str = config["UNDERSTAND"]["MergedStaticMetricsFileName"]
file_path: str = os.path.join(data_directory, output_dir, file_name)

In [None]:
# Logistic regression with balanced class weights and at most 1000 iterations.
# No solver or normalization is configured here; any scaling would have to
# happen inside run_pipeline. The model is passed as a zero-argument factory,
# presumably so run_pipeline can build a fresh estimator whenever it needs one.
metrics_logistic_balanced = run_pipeline(
    file_path=file_path,
    model=lambda: LogisticRegression(max_iter=1000, class_weight="balanced"),
    config_section="VERSION_ALL_LAB"
)

# Random forest with balanced class weights and a fixed seed (random_state=42),
# run on the same file and the same "VERSION_ALL_LAB" config section.
metrics_rf_balanced = run_pipeline(
    file_path=file_path,
    model=lambda: RandomForestClassifier(random_state=42, class_weight='balanced'),
    config_section="VERSION_ALL_LAB"
)

In [None]:
# Display what run_pipeline returned for the logistic-regression model.
metrics_logistic_balanced

In [None]:
# Display what run_pipeline returned for the random-forest model.
metrics_rf_balanced

## Part 5 : Dynamic Metrics

In [None]:
from Dynamic import convert_json_to_csv, merge_static_and_dynamic_csv, build_dependencies, display_hierarchy, collect_dynamic_metrics_v2
from Hive import filter_versions_by_min

In [None]:
# Version selection with the release patterns again, this time passing '1.0'
# as the minimum version (Part 2 used "2.0.0").
all_versions = filter_versions_by_min(versions, releases_regex,'1.0')
# Build the dependency structure for those versions and show its hierarchy.
version_json = build_dependencies(all_versions)
display_hierarchy(version_json)

# Bare expression so the notebook also displays the raw structure.
version_json

In [None]:
# Collect the dynamic metrics from the version dependency structure.
dynamic_metrics = collect_dynamic_metrics_v2(version_json)

In [None]:
# JSON-to-CSV conversion step of the Dynamic module; it takes no arguments here,
# so its inputs and outputs are resolved inside the helper.
convert_json_to_csv()

In [None]:
# Merge the static and dynamic CSV data via the Dynamic module helper; it takes
# no arguments here, so file locations are resolved inside the helper.
merge_static_and_dynamic_csv()