In [1]:
import pandas as pd


In [2]:
# Load the questionnaire export into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks environment-specific
# (presumably a Colab upload location) — adjust when running elsewhere.
questions  = pd.read_csv('/content/questions.csv')

In [5]:
# Preview the first 10 rows to sanity-check what was loaded.
questions.head(10)

Unnamed: 0,Main Question
0,Are there information security requirements fo...
1,What is the password history before reuse?
2,What controls are in place for accessing produ...
3,Is this system developed and tested using secu...
4,What kind of AI is in use?\n- Deep Learning\n-...
5,Does the organization maintain criteria and co...
6,Does your organization ensure that no live dat...
7,Does the product include recommendations on th...
8,Has this client application/system been a part...
9,Are you a provider of health benefit services?


In [8]:
# Capture the column labels of the loaded frame and print them, to confirm
# which column holds the question text.
questions_column = questions.columns
print(questions_column)

Index(['Main Question'], dtype='object')


In [4]:
# Number of rows (questions) in the loaded frame.
len(questions)

212

In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from typing import List, Dict

class SecurityQuestionAnalyzer:
    """TF-IDF based helpers for exploring a list of questionnaire questions.

    The constructor fits a ``TfidfVectorizer`` (English stop words removed)
    on the questions once. Every method reuses that fitted vectorizer and
    the resulting document-term matrix, whose row ``i`` corresponds to
    ``self.questions[i]``.
    """

    def __init__(self, questions: List[str]):
        """Vectorize ``questions``.

        Args:
            questions: Question texts; at least two are required.

        Raises:
            ValueError: If fewer than two questions are supplied.
        """
        if len(questions) < 2:
            raise ValueError("Need at least 2 questions for analysis")

        # Keep a private copy so that later mutation of the caller's list
        # cannot desynchronize ``self.questions`` from the matrix rows.
        self.questions = list(questions)
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.vectorizer.fit_transform(self.questions)

    def find_similar_questions(self, threshold: float = 0.7) -> Dict[int, List[int]]:
        """Find pairs of questions whose cosine similarity exceeds ``threshold``.

        Args:
            threshold: Strict lower bound on the cosine similarity.

        Returns:
            Mapping from a question index to the indices of the other
            questions that are more similar than ``threshold``. Questions
            without any such match are omitted.
        """
        similarity_matrix = cosine_similarity(self.tfidf_matrix)
        similar_questions: Dict[int, List[int]] = {}

        for i, row in enumerate(similarity_matrix):
            # A question is trivially similar to itself, so skip i == j.
            matches = [
                j for j, score in enumerate(row)
                if j != i and score > threshold
            ]
            if matches:
                similar_questions[i] = matches

        return similar_questions

    def categorize_questions(self, n_clusters: int = 5) -> Dict[int, List[str]]:
        """Cluster the questions with K-means on their TF-IDF vectors.

        Args:
            n_clusters: Requested number of clusters. It is capped at half
                the number of questions.

        Returns:
            Mapping from cluster id to the questions assigned to it, in
            their original order. Every cluster id is present, even if the
            cluster ended up empty.

        Raises:
            ValueError: If ``n_clusters`` is smaller than 1.
        """
        if n_clusters < 1:
            raise ValueError("n_clusters must be a positive integer")

        # Adjust clusters based on available questions
        max_clusters = min(n_clusters, len(self.questions) // 2)

        # Fixed random_state keeps the assignment reproducible across runs.
        kmeans = KMeans(n_clusters=max_clusters, random_state=42)
        clusters = kmeans.fit_predict(self.tfidf_matrix)

        # Single pass over the labels instead of rescanning every question
        # once per cluster.
        categorized_questions: Dict[int, List[str]] = {
            cluster: [] for cluster in range(max_clusters)
        }
        for question, label in zip(self.questions, clusters):
            categorized_questions[int(label)].append(question)

        return categorized_questions

    def create_search_function(self):
        """Return a ``search_questions(query, top_k=5)`` closure over this analyzer."""
        def search_questions(query: str, top_k: int = 5) -> List[str]:
            """Return up to ``top_k`` questions ranked by similarity to ``query``.

            The ranking is by cosine similarity only; when fewer than
            ``top_k`` questions share vocabulary with the query, the
            remainder of the result has zero similarity.
            """
            # A slice of ``[-0:]`` selects the whole array, so a
            # non-positive top_k must be handled before slicing.
            if top_k <= 0:
                return []

            query_vector = self.vectorizer.transform([query])
            similarities = cosine_similarity(query_vector, self.tfidf_matrix)[0]
            top_indices = similarities.argsort()[::-1][:top_k]
            return [self.questions[i] for i in top_indices]

        return search_questions

    def generate_summary(self) -> Dict[str, object]:
        """Summarize the question set.

        Returns:
            A dict with:
              * ``'total_questions'``: number of questions.
              * ``'question_starts'``: how many questions start with each of
                ``'Are '``, ``'What '``, ``'Do '`` and ``'Is '``.
              * ``'top_keywords'``: array of up to 10 vocabulary terms with
                the highest total TF-IDF weight over all questions, highest
                first.
        """
        # Question structure analysis
        prefixes = ('Are', 'What', 'Do', 'Is')
        question_starts = {
            prefix: sum(1 for q in self.questions if q.startswith(prefix + ' '))
            for prefix in prefixes
        }

        # Rank terms by their summed TF-IDF weight. Taking the first entries
        # of the vocabulary would only yield the alphabetically smallest
        # terms, which says nothing about importance. The already-fitted
        # vectorizer and matrix are reused, so no second fit is needed.
        feature_names = self.vectorizer.get_feature_names_out()
        term_weights = np.asarray(self.tfidf_matrix.sum(axis=0)).ravel()
        # Stable sort keeps ties in vocabulary (alphabetical) order.
        ranked = np.argsort(-term_weights, kind='stable')[:10]

        return {
            'total_questions': len(self.questions),
            'question_starts': question_starts,
            'top_keywords': feature_names[ranked]
        }

# Example usage with CSV
def analyze_security_questions(csv_path, column='Main Question'):
    """Run the full question analysis on a CSV file and print the results.

    Args:
        csv_path: Path (or buffer) of the CSV file, passed to ``pd.read_csv``.
        column: Name of the column that holds the question text.

    Returns:
        The fitted ``SecurityQuestionAnalyzer``, or ``None`` if the analysis
        raised; in that case the error is printed. Errors while reading the
        CSV or selecting the column are not caught and propagate.
    """
    # Read questions from CSV
    questions_df = pd.read_csv(csv_path)

    # Blank cells are loaded as NaN (a float), which the text vectorizer
    # rejects; drop them and make sure every remaining entry is a string.
    questions = questions_df[column].dropna().astype(str).tolist()

    try:
        analyzer = SecurityQuestionAnalyzer(questions)

        # Find similar questions
        similar_questions = analyzer.find_similar_questions()
        print("Similar Questions:", similar_questions)

        # Categorize questions
        categories = analyzer.categorize_questions()
        print("Question Categories:", categories)

        # Create search function
        search_func = analyzer.create_search_function()
        print("Search Results:", search_func("security controls"))

        # Generate summary
        summary = analyzer.generate_summary()
        print("Analysis Summary:", summary)

        return analyzer

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # decide what to do with the None result.
        print(f"Error in analysis: {e}")
        return None

In [26]:
analyzer = analyze_security_questions('/content/questions.csv')

# analyze_security_questions() reports a failed analysis by printing the
# error and returning None. Stop here with a clear message instead of
# failing in a later cell with an AttributeError on None.
if analyzer is None:
    raise RuntimeError("Question analysis failed; see the error printed above")

Similar Questions: {}
Question Categories: {0: ['Do you have a process to maintain oversight on your third parties, subcontractors, labor hire and recruitment agencies, franchisees, etc., adhering to all relevant labor laws? Please share details of the process.', 'Regarding your General Liability insurance, please provide the following: 1) name of your carrier; 2) coverage level (both per occurrence and aggregate); and 3) confirmation that the policy is paid in full and currently in force.', 'Do you provide an automated defect-tracking process for System changes and Enhancements?', "If you have a business continuity plan, on which frequency it's exercised and can you provide us with a summary of the results of the latest test?", 'Please describe the company/user data you require to provide your service.', 'Do you provide a private cloud option, please explain how isolation is achieved for the OS instances running in this private cloud with respect to other OS instances running in your 

In [27]:
# Run the clustering again with the default n_clusters so the result is kept
# in a variable; the call inside analyze_security_questions() only printed it.
question_category = analyzer.categorize_questions()

In [28]:
# Lay the clusters out side by side: one column per category, one question
# per row. Wrapping each list in a Series lets the columns have different
# lengths; pandas aligns them on the row index and fills the gaps with NaN.
categories_df = pd.DataFrame(
    {
        f'Category_{cluster_id}': pd.Series(members)
        for cluster_id, members in question_category.items()
    }
)

In [29]:
# Preview the first 15 rows of the per-category table.
categories_df.head(15)

Unnamed: 0,Category_0,Category_1,Category_2,Category_3,Category_4
0,Do you have a process to maintain oversight on...,Does your organization ensure that no live dat...,Are there information security requirements fo...,What is the password history before reuse?,What kind of AI is in use?\n- Deep Learning\n-...
1,"Regarding your General Liability insurance, pl...",Do you comply with any request from the client...,What controls are in place for accessing produ...,Is this system developed and tested using secu...,Has this client application/system been a part...
2,Do you provide an automated defect-tracking pr...,"If access data is compromised, is it changed i...",Does your company have an individual or group ...,Does the organization maintain criteria and co...,Are you a provider of health benefit services?
3,"If you have a business continuity plan, on whi...",What backup media is supported and do you use ...,Which network and device security controls are...,Does the product include recommendations on th...,Do you conduct an annual assessment to determi...
4,Please describe the company/user data you requ...,Do you have data processing agreements with al...,Does the information security policy (or polic...,For which of these does your company establish...,Can the Non-Prod environment be fully integrat...
5,"Do you provide a private cloud option, please ...",What time and dates for doing backups (placed ...,Provide Security architecture landscape/diagra...,What User Interfaces are supported out-of-box?...,Does the supplier notify the entity of any vul...
6,Does the service provider ensure that its proc...,Are there firewall rules in place that manage ...,Is your organization required to undergo a Pay...,Is Ad-Hoc or batched report generation supported?,How will the use of AI be documented in the in...
7,Please provide details about the datacenter an...,Is access to client data limited only to speci...,Are the application security tests done by an ...,Is there any acceptable usage guidance in plac...,"Within the last 12 months, has the company con..."
8,Do you have an anti-virus & anti-malware solut...,Does your service delete data after a certain ...,Please provide the requirements for employees ...,Is centralized known good input validation in ...,How would you like the account management team...
9,Please provide Web Link to the Product Privacy...,What data integration options are available?,Is there a formal sign-off process in your org...,What are the authentication factors used to ac...,Regarding the aforementioned automated records...
