In [0]:
import pandas as pd
import re
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
path = '/List_of_Services.txt'
#File containing list of services hit by each individual test case in each line
Tr_file = open(path).readlines()

#List of services without '\n'
Tr_clean = []

for line in Tr_file:
    if line.strip()!='':
        Tr_clean.append(line.strip())

Tr_clean

['cbs-account-service cbs-address-service cbs-appointments-service cbs-assurance-service cbs-assurance-view-web cbs-assurance-web cbs-aurora-web',
 'cbs-cas-authentication-web cbs-case-management-service cbs-case-management-batch cbs-case-management-web cbs-catalogue-evaluation-service cbs-customer-history-service',
 'cbs-customer-management-web cbs-customer-service cbs-customer-timeline-web cbs-customer-web cbs-data-quality-service',
 'cbs-delivery-fulfilment-batch cbs-delivery-fulfilment-service cbs-device-service cbs-engineer-batch cbs-engineer-gateway-service cbs-engineer-service',
 'cbs-hss-portal-web cbs-inventory-service cbs-inventory-web cbs-loyalty-service cbs-order-management-batch cbs-order-management-service cbs-order-management-web',
 'cbs-payments-service cbs-payments-web cbs-properties-service cbs-refdata-service cbs-sales-batch cbs-sales-dialogue cbs-sales-interaction-service cbs-sales-web',
 'cbs-scms-gateway-batch cbs-scms-gateway-service cbs-spell-checker-web cbs-sto

In [0]:
df = pd.DataFrame(Tr_clean, columns=['Services'])
df['Text Case'] = [i for i in range(len(df))]

#Vectorising the bunch of cleaned lines of service names with n-gram count vectoriser(n=5) to check
#for a match for upto 5 services
n_vect = CountVectorizer(ngram_range=(1,5))
Tr_fit = n_vect.fit(df['Services'])
Tr_transformed = Tr_fit.transform(df['Services'])

#Following list contains a string of service names modified during latest depolyment
modified_services = ['cbs-account-service cbs-assurance-service cbs-customer-management-web cbs-payments-service cbs-transactional-comms-batch cbs-assurance-service']

#Vectorisation of modified_services
modified_transformed = Tr_fit.transform(modified_services)

#Our objective is to find out similarity score between this list of affected 
# services and each test case used in training set
similarity_score = []
for i in Tr_transformed:
    similarity_score.append(cosine_similarity(i, modified_transformed))
print('Similarity scores:',similarity_score)
#Modifying df dataframe to show similarity score for each test case
df['Similarity_score'] = [i[0][0] for i in similarity_score]

#Pull the top n test cases with high similarity score for impact regression
sorted_df = df.sort_values('Similarity_score', ascending = False)

#Fetching the top 5 test cases for impact regression
top_TC = sorted_df.iloc[:5,:]

Similarity scores: [array([[0.66397743]]), array([[0.44552106]]), array([[0.48556162]]), array([[0.43934968]]), array([[0.44607461]]), array([[0.52212212]]), array([[0.4324175]]), array([[0.52478975]]), array([[0.15625]]), array([[0.58846432]]), array([[0.53957256]]), array([[0.45077415]]), array([[0.4275814]]), array([[0.5463483]]), array([[0.27608814]]), array([[0.45287646]]), array([[0.54648931]]), array([[0.34020691]])]


In [0]:
#MEASURING TEST COVERAGE AND IDENTIFYING SERVICES NOT COVERED

In [0]:
#Remaining df
remaining_df = sorted_df.iloc[6:,:]

#Services covered by selected test cases
services = []
for i in top_TC['Services']:
    tc_services = i.split()
    for j in tc_services:
        services.append(j)
        
#list of distinct impacted services covered by selected test cases for impact regression
distinct_services = set(services)
distinct_services

{'cbs-account-service',
 'cbs-address-service',
 'cbs-appointments-service',
 'cbs-assurance-service',
 'cbs-assurance-view-web',
 'cbs-assurance-web',
 'cbs-aurora-web',
 'cbs-cas-authentication-web',
 'cbs-case-management-batch',
 'cbs-customer-history-service',
 'cbs-customer-management-web',
 'cbs-delivery-fulfilment-service',
 'cbs-inventory-service',
 'cbs-stock-allocation-service',
 'cbs-transactional-comms-batch',
 'cbs-usage-service'}

In [0]:
#service not covered
uncovered = [service for service in modified_services[0].split(' ') if service not in distinct_services]
uncovered

['cbs-payments-service']

In [0]:
#Funtion to return test cases containing uncovered services
def additional_tc(uncovered, remaining_df):
    tc_sr = []
    for service in uncovered:
        count = 0
        for line in remaining_df['Services']:
            if service not in line:
                count = count + 1
            elif service in line:
                tc_sr.append(count)
                break
    
    for i in tc_sr:
        print(remaining_df.iloc[i][0])
        
additional_tc(uncovered, remaining_df)


cbs-payments-service cbs-payments-web cbs-properties-service cbs-refdata-service cbs-sales-batch cbs-sales-dialogue cbs-sales-interaction-service cbs-sales-web


In [0]:
remaining_df

Unnamed: 0,Services,Text Case,Similarity_score
5,cbs-payments-service cbs-payments-web cbs-prop...,5,0.522122
2,cbs-customer-management-web cbs-customer-servi...,2,0.485562
15,cbs-delivery-fulfilment-batch cbs-inventory-we...,15,0.452876
11,cbs-hss-portal-web cbs-inventory-web cbs-trans...,11,0.450774
4,cbs-hss-portal-web cbs-inventory-service cbs-i...,4,0.446075
1,cbs-cas-authentication-web cbs-case-management...,1,0.445521
3,cbs-delivery-fulfilment-batch cbs-delivery-ful...,3,0.43935
6,cbs-scms-gateway-batch cbs-scms-gateway-servic...,6,0.432417
12,cbs-customer-timeline-web cbs-inventory-servic...,12,0.427581
17,cbs-validation-service cbs-viewing-card-manage...,17,0.340207
