### The purpose of the code is to validate a list of projects under several circumstances, primarily those that arise from human errors:
 1. Duplicated projects: a unique ID may have been used more than once, which could lead to issues in the future.
 2. Missing projects: an ID may have been skipped, resulting in errors in the project sequence.
 3. Specific Type of Projects: the code also checks for projects indicated by specific strings in the project name.

#### Duplicates check:

In [10]:
import re
import warnings

import numpy as np
import pandas as pd

# NOTE(review): this silences every warning for the rest of the session, including
# pandas deprecation notices; consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", 100)

# Prefix shared by every project entry in the 'Summary' column; the project ID
# follows it directly. All slicing below is derived from this constant so the
# offset can never drift away from the prefix length.
PROJECT_PREFIX = 'Project-HO'

ho_num = pd.read_excel('../summary_projectnames.xls', header=2)
# Trim surrounding whitespace from every text cell. Column-wise Series.map is used
# because DataFrame.applymap is deprecated in recent pandas releases.
ho_num = ho_num.apply(
    lambda col: col.map(lambda x: x.strip() if isinstance(x, str) else x)
)

# Keep only rows whose summary is a project entry. The explicit copy makes
# `summary` an independent frame, so adding columns to it later is well-defined.
summary = ho_num[['Summary']]
summary = summary.loc[summary['Summary'].str.startswith(PROJECT_PREFIX, na=False)].copy()

# The project token is everything before the first "|" separator.
split = summary['Summary'].str.split("|", expand=True)

proj_ho = split[[0]].rename(columns={0: 'Project-HO'})
# Strip again: whitespace next to the separator ("ID | name" vs "ID| name")
# would otherwise make identical IDs compare as different.
proj_ho['Project-HO'] = proj_ho['Project-HO'].str.strip()

# Every row whose project token occurs more than once (all occurrences kept).
duplicates = (
    proj_ho[proj_ho.duplicated(['Project-HO'], keep=False)]
    .reset_index(drop=True)
)

# Entries of the 'X' series are excluded from the report: blank them out.
is_x_series = duplicates['Project-HO'].str.startswith(PROJECT_PREFIX + 'X', na=False)
duplicates['Project-HO'] = duplicates['Project-HO'].mask(is_x_series)

# One line per duplicated project: keep the first occurrence, leave the rest empty.
duplicates['Duplicated_projects'] = duplicates['Project-HO'].drop_duplicates()

# Show only the ID part (prefix removed) of each duplicated project.
(
    duplicates['Duplicated_projects']
    .str[len(PROJECT_PREFIX):]
    .dropna()
    .to_frame()
)

Unnamed: 0,Duplicated_projects
0,5252
2,5254


#### Missing projects:

In [11]:
# Length of the textual prefix that precedes the numeric ID in 'Project-HO' values.
id_offset = len('Project-HO')

# Numeric project IDs, built as a NEW object so that `proj_ho` is left untouched
# and this cell can be re-run safely.
project_numbers = (
    proj_ho['Project-HO']
    .str[id_offset:]                     # drop the textual prefix
    .str.replace('X', '', regex=False)   # 'X' markers are not part of the number
    .str.strip()
    .replace('', np.nan)
    .dropna()                            # entries without a number must not become ID 0
    .astype(int)
)
proj_ho_sequence = project_numbers.to_frame('Project-HO')

known_ids = set(proj_ho_sequence['Project-HO'])

if known_ids:
    min_val = int(min(known_ids))
    max_val = int(max(known_ids))
    # every integer the sequence should contain between its first and last ID
    all_integers = set(range(min_val, max_val + 1))
else:
    # no usable IDs at all: nothing can be reported as missing
    min_val = max_val = None
    all_integers = set()

# IDs expected in the sequence but absent from the list
missing_int = all_integers - known_ids
missing_int_list = sorted(missing_int)

result = pd.DataFrame({'Missing_from_list': missing_int_list})
result

Unnamed: 0,Missing_from_list
0,5265


#### Check on specific type of projects

In [12]:
lookup = [
    'Trigger_value1', 'Trigger_value2', 'Trigger_value3', 'Trigger_value4'
]
# One alternation pattern with a single capture group; each keyword is escaped so
# that characters such as "(", "." or "+" in a keyword are matched literally.
single_str = '(' + '|'.join(re.escape(keyword) for keyword in lookup) + ')'

# Position of the ID inside the summary text: right after the 'Project-HO' prefix.
# The width of four characters is kept from the original slice.
id_start = len('Project-HO')
id_end = id_start + 4

# Run the regex once; rows without any keyword get NaN.
keyword_hit = summary['Summary'].str.extract(single_str, expand=False)

# `assign` rebinds `summary` to a new frame instead of writing into a slice.
summary = summary.assign(
    On_reuse=summary['Summary'].str[id_start:id_end].where(keyword_hit.notna()),
    Keyword=keyword_hit,
)

summary[['On_reuse', 'Keyword']].dropna().reset_index(drop=True)

1,On_reuse,Keyword
0,5266,Trigger_value2
1,5267,Trigger_value1
2,5283,Trigger_value3
3,5300,Trigger_value1
