In [None]:
import pandas as pd
import re

# Load the question/answer dataset. The file is decoded as latin1
# (presumably because it is not valid UTF-8 — TODO confirm the real encoding).
data = pd.read_csv('/kaggle/input/keyphrases/dataset (1).csv', encoding='latin1')

# Clean text function
def clean_text(text):
    """Strip markup, URLs and unsupported characters from a raw text cell.

    Args:
        text: Any scalar value; it is converted with ``str``. Null values
            (as detected by ``pd.isnull``) yield an empty string.

    Returns:
        str: The text with ``<...>`` tags, ``http...`` URLs and ``**``/``__``
        emphasis markers removed, restricted to ASCII letters, digits, spaces
        and ``.,?!:;+=-*/()[]{}``, with whitespace runs collapsed to a single
        space and leading/trailing whitespace stripped.
    """
    if pd.isnull(text):
        return ""

    text = str(text)
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+\.(jpg|jpeg|png|gif|bmp|svg)', '', text, flags=re.IGNORECASE)  # Remove image URLs
    text = re.sub(r'http\S+', '', text)  # Remove any remaining URL
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # Unwrap **bold** markdown
    text = re.sub(r'__(.*?)__', r'\1', text)  # Unwrap __bold__ markdown

    # Turn newlines/tabs into spaces BEFORE the character filter. The filter
    # only allows the plain space character, so filtering first would delete
    # those separators and fuse the neighbouring words together.
    text = re.sub(r'\s+', ' ', text)

    allowed_chars_pattern = r'[^a-zA-Z0-9.,?!:;+=\-\*/()\[\]{} ]+'  # Remove unwanted characters
    text = re.sub(allowed_chars_pattern, '', text)

    # Removing characters can leave adjacent spaces behind; collapse them again
    return re.sub(r'\s+', ' ', text).strip()

# Clean the 'questions' and 'answers' columns when both are present
if 'questions' in data.columns and 'answers' in data.columns:
    data['questions'] = data['questions'].apply(clean_text)
    data['answers'] = data['answers'].apply(clean_text)
else:
    # Name the columns that are actually checked above so the message is actionable
    print("The dataset does not contain 'questions' and 'answers' columns.")

# Lift pandas' display limits so the preview below is not truncated
pd.set_option('display.max_rows', None)  # No limit on the number of rows shown
pd.set_option('display.max_columns', None)  # No limit on the number of columns shown
pd.set_option('display.width', None)  # Avoid line breaks in output
pd.set_option('display.max_colwidth', None)  # Show full column content without truncation

# Display the first rows of the DataFrame
from IPython.display import display
display(data.head())

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.util import bigrams
from nltk.tokenize import word_tokenize
import string
import pandas as pd

# Download the NLTK resources used below: the Punkt tokenizer data needed by
# word_tokenize and the stop-word lists.
nltk.download('punkt')
# Recent NLTK releases load word_tokenize's Punkt data from the separate
# 'punkt_tab' package; fetching both keeps this cell working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop words, stored as a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Function to clean and tokenize text
def clean_and_tokenize(text):
    """Tokenize ``text`` and drop stop words and punctuation-only tokens.

    Args:
        text: The text to tokenize. Anything that is not a ``str``
            (``None``, ``NaN``, numbers, ...) produces an empty list.

    Returns:
        list[str]: Lower-cased tokens from ``word_tokenize`` that are not in
        ``stop_words`` and do not consist solely of punctuation characters.
        Hyphenated words such as ``"well-known"`` are kept.
    """
    if not isinstance(text, str):
        return []

    punctuation_chars = set(string.punctuation)
    kept_tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        # Check every character instead of `token in string.punctuation`:
        # the latter is a substring test against one long string, so
        # multi-character punctuation tokens such as "..." or "--" passed it.
        if all(ch in punctuation_chars for ch in token):
            continue
        kept_tokens.append(lowered)

    return kept_tokens

# Function to generate bigrams or fall back to unigrams, and filter out duplicate consecutive words
def generate_bigrams_or_unigrams(tokens):
    """Pair up neighbouring tokens, falling back to the tokens themselves.

    With fewer than two tokens a new list containing the same tokens is
    returned. Otherwise the result is a list of ``[first, second]`` pairs of
    adjacent tokens, skipping every pair whose two words are identical.
    """
    if len(tokens) >= 2:
        # Keep only pairs made of two different words
        return [[left, right] for left, right in bigrams(tokens) if left != right]

    # Too short for a bigram: hand back the individual tokens
    return list(tokens)

# `data` is the DataFrame loaded and cleaned in the first cell.
# Add a 'question_tokens' column holding each question's bigrams (or its
# unigrams when fewer than two tokens remain); 'questions' itself is unchanged.
data['question_tokens'] = data['questions'].apply(lambda x: generate_bigrams_or_unigrams(clean_and_tokenize(x)))

# Preview the original questions next to their tokens and the answers
from IPython.display import display
display(data[['questions','question_tokens', 'answers']].head())

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(user_keyphrases, question_keyphrases):
    """Cosine similarity between two keyphrase lists using bag-of-words counts.

    Args:
        user_keyphrases: List of strings describing the user's context.
        question_keyphrases: List of strings describing a question.

    Returns:
        float: The cosine similarity of the two count vectors, or ``0.0``
        when either list is empty, either joined string is blank, or the
        vectorizer finds no usable token in the two strings.
    """
    # No similarity if either list is empty
    if not user_keyphrases or not question_keyphrases:
        return 0.0

    # Join keyphrases into a single string per side
    user_str = ' '.join(user_keyphrases)
    question_str = ' '.join(question_keyphrases)

    # Lists of empty/blank strings still give nothing to compare
    if not user_str.strip() or not question_str.strip():
        return 0.0

    try:
        vectors = CountVectorizer().fit_transform([user_str, question_str])
    except ValueError:
        # CountVectorizer raises ("empty vocabulary") when no token survives
        # its tokenization, e.g. when every keyphrase is a single character.
        return 0.0

    return cosine_similarity(vectors)[0][1]


In [None]:
# Ensure 'questions_keyphrases' holds a list of tokens per row.
# NOTE(review): this column is not created by the cells above; it is assumed
# to come from the CSV itself — confirm.
# Strings are split on whitespace; any other non-list value (e.g. NaN from an
# empty CSV cell) becomes an empty list instead of failing on `.split()`.
data['questions_keyphrases'] = data['questions_keyphrases'].apply(
    lambda x: x if isinstance(x, list) else (x.split() if isinstance(x, str) else [])
)

# Remove rows where 'questions_keyphrases' is empty. `.copy()` makes `data` an
# independent frame, so adding columns to it later (e.g. 'similarity') does not
# operate on a slice of the original frame.
data = data[data['questions_keyphrases'].apply(len) > 0].copy()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

def clean_text(text):
    """Lowercase text, keep only letters, digits and spaces, collapse whitespace.

    Note: this redefines the ``clean_text`` from the first cell; once this
    cell has run, later calls use this version.

    Args:
        text: The text to clean. ``None`` and ``NaN`` yield ``""``; other
            non-string values are converted with ``str``.

    Returns:
        str: Lower-cased text containing only ``a-z``, ``0-9`` and single
        spaces, without leading or trailing whitespace.
    """
    # Missing values (None, or float NaN which is the only float != itself)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()  # Lowercase the text
    text = re.sub(r'\s+', ' ', text)  # Newlines/tabs -> spaces so they survive the filter as separators
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)  # Remove non-alphanumeric characters
    # Dropping punctuation that stood between two spaces ("a - b") leaves a
    # double space behind, so collapse runs of spaces after the filter too.
    text = re.sub(r' +', ' ', text)
    return text.strip()

def check_completeness(candidate_answer, key_phrases):
    """TF-IDF cosine similarity between an answer and its expected key phrases.

    Args:
        candidate_answer: The answer text to score.
        key_phrases: The expected key phrases, either as a list of strings or
            as one string that already contains all of them.

    Returns:
        float: Cosine similarity of the TF-IDF vectors of the cleaned answer
        and the cleaned key-phrase text; ``0.0`` when either side is missing
        or empty, or when no token is left to build a vocabulary from.
    """
    # Nothing to compare: avoid handing empty documents to the vectorizer
    if not isinstance(candidate_answer, str) or not candidate_answer.strip():
        return 0.0
    if not key_phrases:
        return 0.0

    # Joining a plain string would insert a space between every character,
    # so only join real collections of phrases.
    key_text = key_phrases if isinstance(key_phrases, str) else " ".join(key_phrases)

    candidate_answer_cleaned = clean_text(candidate_answer)  # Clean candidate answer
    key_text_cleaned = clean_text(key_text)  # Clean key phrases text
    if not candidate_answer_cleaned or not key_text_cleaned:
        return 0.0

    try:
        tfidf = TfidfVectorizer().fit_transform([candidate_answer_cleaned, key_text_cleaned])  # Apply TF-IDF
    except ValueError:
        # Raised ("empty vocabulary") when neither text has a usable token
        return 0.0

    similarity = cosine_similarity(tfidf[0:1], tfidf[1:2])  # Calculate cosine similarity
    return similarity[0][0]  # Return similarity score



In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model once at module level; evaluate_accuracy
# below reuses this single instance on every call.
model = SentenceTransformer('all-MiniLM-L6-v2')

def evaluate_accuracy(candidate_answer, correct_answer):
    """Score how close a candidate answer is to the correct answer.

    Both texts are embedded with the module-level ``model`` and compared
    with cosine similarity.

    Raises:
        ValueError: If either argument is not a ``str``.

    Returns:
        The cosine similarity of the two embeddings as a Python scalar.
    """
    inputs_are_text = isinstance(candidate_answer, str) and isinstance(correct_answer, str)
    if not inputs_are_text:
        raise ValueError("Both candidate_answer and correct_answer must be strings.")

    # One encode call produces the embeddings of both answers
    answer_pair = [candidate_answer, correct_answer]
    vectors = model.encode(answer_pair, convert_to_tensor=True)

    # Compare the first embedding with the second and unwrap the single value
    return util.pytorch_cos_sim(vectors[0], vectors[1]).item()



In [None]:
!pip install textstat
import textstat

def evaluate_clarity(answer):
    """Return textstat's Flesch Reading Ease score for ``answer``."""
    return textstat.flesch_reading_ease(answer)



In [None]:
!pip install sentence-transformers
import spacy

_SPACY_PIPELINE = None  # spaCy pipeline, loaded on first use and then reused


def evaluate_logical_flow(answer):
    """Report simple sentence statistics for ``answer``.

    Args:
        answer: The answer text to analyse.

    Returns:
        dict: ``{"sentence_count": int, "avg_sentence_length": float}`` where
        the length of a sentence is ``len()`` of its spaCy span. Both values
        are zero when the answer is empty or contains no sentence.
    """
    global _SPACY_PIPELINE

    no_sentences = {"sentence_count": 0, "avg_sentence_length": 0.0}
    if not answer:
        # Nothing to parse; also avoids dividing by a sentence count of zero
        return no_sentences

    # Load the model only once instead of on every call
    if _SPACY_PIPELINE is None:
        _SPACY_PIPELINE = spacy.load("en_core_web_sm")

    # Materialise the sentences once; they are needed for count and lengths
    sentences = list(_SPACY_PIPELINE(answer).sents)
    sentence_count = len(sentences)
    if sentence_count == 0:
        return no_sentences

    avg_sentence_length = sum(len(sent) for sent in sentences) / sentence_count
    return {"sentence_count": sentence_count, "avg_sentence_length": avg_sentence_length}


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Example DataFrame (replace with your actual dataset)
# Ensure `data` has 'questions', 'answers', 'questions_keyphrases', 'answers_keyphrases' columns
# Example loading:
# data = pd.read_csv("your_dataset.csv")

def compute_cosine_similarity(user_keyphrases, question_keyphrases):
    """Cosine similarity between two keyphrase lists using bag-of-words counts.

    Note: this redefines the ``compute_cosine_similarity`` from an earlier
    cell, so it keeps that version's empty-input guards.

    Args:
        user_keyphrases: List of strings describing the user's context.
        question_keyphrases: List of strings describing a question.

    Returns:
        float: The cosine similarity of the two count vectors, or ``0.0``
        when either list is empty, either joined string is blank, or the
        vectorizer finds no usable token in the two strings.
    """
    # No similarity if either list is empty
    if not user_keyphrases or not question_keyphrases:
        return 0.0

    user_str = ' '.join(user_keyphrases)
    question_str = ' '.join(question_keyphrases)

    # Lists of empty/blank strings still give nothing to compare
    if not user_str.strip() or not question_str.strip():
        return 0.0

    try:
        vectors = CountVectorizer().fit_transform([user_str, question_str])
    except ValueError:
        # CountVectorizer raises ("empty vocabulary") when no token survives
        # its tokenization, e.g. when every keyphrase is a single character.
        return 0.0

    similarity_matrix = cosine_similarity(vectors)
    return similarity_matrix[0][1]

# Ensure questions_keyphrases are in list format. Strings are split on
# whitespace; any other non-list value (e.g. NaN from an empty CSV cell)
# becomes an empty list instead of failing on `.split()`.
data['questions_keyphrases'] = data['questions_keyphrases'].apply(
    lambda x: x if isinstance(x, list) else (x.split() if isinstance(x, str) else [])
)

# User's input
user_intro = "supervised machine learning"
user_intro_keyphrases = user_intro.split()  # Replace with keyphrase extraction logic

# Compute similarities for all questions
similarities = data['questions_keyphrases'].apply(
    lambda x: compute_cosine_similarity(user_intro_keyphrases, x)
)

# Find the most similar question (None when there are no questions at all,
# because idxmax() raises on an empty Series)
most_similar_idx = similarities.idxmax() if not similarities.empty else None

def evaluate_depth_with_keyphrases(answer, extracted_keyphrases):
    """Measure how many expected keyphrases an answer mentions.

    A keyphrase counts as covered when its lower-cased text occurs anywhere
    in the lower-cased answer (substring match).

    Args:
        answer: The answer text.
        extracted_keyphrases: List of keyphrase strings. An empty value or
            anything that is not a ``list`` is treated as "no keyphrases".

    Returns:
        dict: Always contains ``"word_count"`` (whitespace-separated words),
        ``"keyphrase_coverage"`` (covered / total, 0 without keyphrases),
        ``"covered_keyphrases"`` and ``"missing_keyphrases"``.
    """
    word_count = len(answer.split())

    if not extracted_keyphrases or not isinstance(extracted_keyphrases, list):
        # Same keys as the regular result so callers can read
        # "covered_keyphrases" without checking which path was taken.
        return {
            "word_count": word_count,
            "keyphrase_coverage": 0,
            "covered_keyphrases": [],
            "missing_keyphrases": []
        }

    answer_lower = answer.lower()
    covered_keyphrases = []
    missing_keyphrases = []
    # Single pass: every keyphrase lands in exactly one of the two lists
    for kw in extracted_keyphrases:
        if kw.lower() in answer_lower:
            covered_keyphrases.append(kw)
        else:
            missing_keyphrases.append(kw)

    return {
        "word_count": word_count,
        "keyphrase_coverage": len(covered_keyphrases) / len(extracted_keyphrases),
        "covered_keyphrases": covered_keyphrases,
        "missing_keyphrases": missing_keyphrases
    }




In [None]:
import language_tool_python

_LANGUAGE_TOOL = None  # shared LanguageTool instance, created on first use


# Function to calculate Grammatical Accuracy
def calculate_grammatical_accuracy(text):
    """Estimate grammatical accuracy as ``1 - errors / words``.

    Args:
        text: The text to check.

    Returns:
        tuple[float, int]: ``(grammatical_accuracy, num_errors)``. Text
        without any word yields ``(1.0, 0)``. The accuracy is floored at
        ``0.0`` when the checker reports more matches than there are words.
    """
    global _LANGUAGE_TOOL

    total_words = len(text.split())
    if total_words == 0:
        # Avoid division by zero; same tuple shape as the regular result so
        # callers can always unpack two values.
        return 1.0, 0

    # Reuse one checker instead of constructing a new one for every call
    if _LANGUAGE_TOOL is None:
        _LANGUAGE_TOOL = language_tool_python.LanguageTool('en-US')

    num_errors = len(_LANGUAGE_TOOL.check(text))
    error_rate = num_errors / total_words
    grammatical_accuracy = max(0.0, 1 - error_rate)

    return grammatical_accuracy, num_errors


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_answer(user_answer, correct_answer, keyphrases):
    """Evaluate the answer based on various metrics.

    Args:
        user_answer: The answer given by the user.
        correct_answer: The reference answer to compare against.
        keyphrases: Expected key phrases as a list of strings. A single
            string is split on whitespace (assumes whitespace-separated
            phrases — TODO confirm against the dataset); any other
            non-collection value is treated as "no key phrases".

    Returns:
        dict: ``'accuracy'`` (TF-IDF cosine similarity of the two answers),
        ``'completeness'`` (share of key phrases found in the user's answer),
        and the placeholders ``'clarity'`` and ``'logical_flow'`` (fixed at
        1.0). All four are 0.0 when either answer is empty or not a string.
    """
    # Non-string inputs (None, NaN from the DataFrame) count as empty answers
    if not isinstance(user_answer, str):
        user_answer = ""
    if not isinstance(correct_answer, str):
        correct_answer = ""

    # Ensure the answers are not empty
    if not user_answer.strip() or not correct_answer.strip():
        print("One of the answers is empty. Returning default scores.")
        return {
            'accuracy': 0.0,
            'completeness': 0.0,
            'clarity': 0.0,
            'logical_flow': 0.0
        }

    # Calculate Accuracy using Cosine Similarity
    try:
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform([user_answer, correct_answer])
        accuracy = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    except ValueError:
        # Raised ("empty vocabulary") when both answers consist only of stop
        # words or tokens the vectorizer ignores.
        accuracy = 0.0

    # Calculate Completeness as the share of key phrases mentioned
    if isinstance(keyphrases, str):
        keyphrases = keyphrases.split()
    elif not isinstance(keyphrases, (list, tuple, set)):
        keyphrases = []

    edge_punctuation = '.,;:!?()[]{}"\''

    def _words(text):
        """Lower-cased words of ``text`` without surrounding punctuation."""
        stripped = (word.strip(edge_punctuation) for word in str(text).lower().split())
        return [word for word in stripped if word]

    # Pad with spaces so a phrase only matches on whole-word boundaries;
    # this is case-insensitive and also finds multi-word key phrases.
    padded_answer = " " + " ".join(_words(user_answer)) + " "
    phrases = [" ".join(_words(kp)) for kp in keyphrases]
    phrases = [phrase for phrase in phrases if phrase]
    keyphrases_count = sum(1 for phrase in phrases if f" {phrase} " in padded_answer)
    completeness = keyphrases_count / len(phrases) if phrases else 0

    # Calculate Clarity (placeholder until a real method is added)
    clarity = 1.0  # Assuming a perfect clarity score for simplicity

    # Logical Flow (placeholder for logical flow analysis)
    logical_flow = 1.0

    return {
        'accuracy': accuracy,
        'completeness': completeness,
        'clarity': clarity,
        'logical_flow': logical_flow
    }


In [None]:
def normalize_score(total_weighted_grade, max_possible_score):
    """
    Normalize the total weighted grade to a 0-100 scale.

    Parameters:
    - total_weighted_grade (float): The calculated weighted grade.
    - max_possible_score (float): The maximum possible score based on weights.

    Returns:
    - float: The normalized grade on a 0-100 scale, or 0 if
      max_possible_score is 0.
    """
    # Check the divisor explicitly instead of catching ZeroDivisionError:
    # NumPy scalars do not raise on division by zero, they produce inf/nan
    # with a RuntimeWarning, which the exception handler never saw.
    if max_possible_score == 0:
        return 0
    return (total_weighted_grade / max_possible_score) * 100


In [None]:
import random

    
# Calculate Grammatical Accuracy
#grammatical_accuracy, _ = calculate_grammatical_accuracy(answer_text)

def calculate_weighted_grade(evaluation, answer_text):
    """Calculate the weighted grade (0-100) from the evaluation metrics.

    Args:
        evaluation: Mapping of metric name to score. The metrics used are
            'accuracy', 'completeness', 'clarity' and 'grammatical_accuracy';
            each is assumed to lie between 0 and 1.
        answer_text: The answer the metrics belong to. Currently unused; kept
            so existing callers keep working.

    Returns:
        The weighted average of the metrics that are present in
        ``evaluation``, scaled to 0-100. Returns 0 when a score cannot be
        converted to ``float`` or when no known metric is present.
    """
    # Define weights for each metric (adjusted for grammatical accuracy)
    weights = {
        'accuracy': 0.45,   # Reduced to accommodate grammatical accuracy
        'completeness': 0.25,
        'clarity': 0.1,
        'grammatical_accuracy': 0.2  # Added grammatical accuracy with 0.2 weight
    }

    try:
        # Only metrics that were actually evaluated take part in the grade
        scores = {
            metric: float(evaluation[metric])
            for metric in weights
            if metric in evaluation
        }
    except (TypeError, ValueError) as e:
        print(f"Error in conversion: {e}")
        return 0  # Return a default value in case of conversion failure

    # Debug: Print the raw evaluation scores ('n/a' marks a missing metric)
    shown = {metric: scores.get(metric, 'n/a') for metric in weights}
    print(f"Raw Scores - Accuracy: {shown['accuracy']}, Completeness: {shown['completeness']}, "
          f"Clarity: {shown['clarity']}, Grammatical Accuracy: {shown['grammatical_accuracy']}")

    # Calculate total weighted grade
    total_weighted_grade = sum(scores[metric] * weights[metric] for metric in scores)

    # Debug: Print the weighted score before normalization
    print(f"Weighted Total Score: {total_weighted_grade}")

    # Normalize against the weights of the metrics that are present. Counting
    # the weight of a metric that was never evaluated (e.g. a missing
    # 'grammatical_accuracy') would silently cap the best possible grade
    # below 100.
    max_possible_score = sum(weights[metric] for metric in scores)
    normalized_grade = normalize_score(total_weighted_grade, max_possible_score)

    # Debug: Print the normalized score
    print(f"Normalized Grade: {normalized_grade}")

    return normalized_grade


In [None]:
def automatic_interview():
    """Run an interactive interview on the console and print the results.

    The interview starts with the three self-introduction questions in random
    order, then asks dataset questions until ``total_questions`` questions
    have been asked or no unvisited question is left. Each dataset question
    is the unvisited one whose 'questions_keyphrases' are most similar to
    everything the user has answered so far. Answers are read with
    ``input()``; dataset answers are evaluated with ``evaluate_answer`` and
    graded with ``calculate_weighted_grade``.

    Side effects: reads from stdin, prints to stdout, and (re)writes the
    'similarity' column of the global ``data`` frame.
    """
    visited = set()  # Keep track of visited questions
    # One (metrics, weighted_grade) pair per asked question, in asking order.
    # The grade is None for intro questions, which have no reference answer.
    question_results = []
    weighted_grades = []  # Grades of the dataset questions, for the overall score
    total_questions = 10
    user_intro = ""

    # Self-introduction questions, asked first in random order
    intro_questions = [
        "Tell me about yourself.",
        "What are your key strengths?",
        "What is your educational background?"
    ]

    random.shuffle(intro_questions)  # Shuffle the intro questions
    for i, question in enumerate(intro_questions):
        print(f"Q{i+1}: {question}")
        answer = input("Your Answer: ")
        visited.add(question)  # Mark question as visited
        # No reference answer exists, so this yields the default metrics
        question_results.append((evaluate_answer(answer, "", []), None))
        user_intro += f" {answer}"  # Build user introduction context

    # Tokenize user intro into keyphrases
    user_intro_keyphrases = user_intro.split()

    # Continue with dataset questions related to the previous responses
    for i in range(len(intro_questions), total_questions):
        # Score every question against the user's answers so far
        data['similarity'] = data['questions_keyphrases'].apply(
            lambda x: compute_cosine_similarity(user_intro_keyphrases, x)
        )

        # Select the most similar unvisited question (avoiding repetition)
        unvisited_data = data[~data['questions'].isin(visited)]
        if unvisited_data.empty:
            print("No more unvisited questions available.")
            break

        most_similar_idx = unvisited_data['similarity'].idxmax()
        selected_question = data.loc[most_similar_idx, 'questions']
        selected_keyphrases = data.loc[most_similar_idx, 'answers_keyphrases']
        correct_answer = data.loc[most_similar_idx, 'answers']

        print(f"Q{i+1}: {selected_question}")
        answer = input("Your Answer: ")

        # Mark the question as visited
        visited.add(selected_question)

        # Evaluate the answer
        evaluation = evaluate_answer(answer, correct_answer, selected_keyphrases)

        # Debug: Print evaluation result
        print(f"Evaluation Result for Question {i+1}: {evaluation}")

        # Calculate the weighted grade and keep it next to its own metrics
        weighted_grade = calculate_weighted_grade(evaluation, answer)
        question_results.append((evaluation, weighted_grade))
        weighted_grades.append(weighted_grade)

        # Add answer to user context (this steers the next question's selection)
        user_intro += f" {answer}"
        user_intro_keyphrases = user_intro.split()  # Update user intro keyphrases

    print("\nInterview Completed. Evaluation Results:")

    # Each grade is printed with the question it belongs to. Indexing a
    # separate grade list by question position would attach the grades of the
    # dataset questions to the ungraded intro questions.
    for i, (metrics, grade) in enumerate(question_results):
        print(f"Q{i+1} - Metrics: {metrics}")
        if grade is not None:
            print(f"Q{i+1} - Weighted Grade: {grade:.2f}")

    # Average the 0-100 grades; dividing by 10 below reports it on a 0-10 scale
    overall_score = sum(weighted_grades) / len(weighted_grades) if weighted_grades else 0
    overall_score_normalized = normalize_score(overall_score, 100)
    print(f"\nOverall Score: {overall_score_normalized/10:.2f}")


In [None]:
# Run the automatic interview (interactive: it waits for answers typed via input())
automatic_interview()
