In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import panel as pn
pn.extension()

In [4]:
df=pd.read_csv('data_science_question_answers.csv')

In [6]:
# Preprocessing function for text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text.lower())
    filtered_words = [word for word in word_tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_words)

# Function to ask questions based on experience level
def ask_questions(experience):
    qa_pairs = df[df['Level'] == experience]
    preprocessed_questions = [preprocess_text(question) for question in qa_pairs['Question']]
    answers = qa_pairs['Answer'].tolist()

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(preprocessed_questions)

    def data_analysis_chatbot():
        user_answers = []

        for idx, question in enumerate(qa_pairs.iterrows()):
            print("Bot:", question[1]['Question'])
            user_input = input("You: ")
            user_answers.append(user_input)
        
        return user_answers, answers, tfidf_matrix, vectorizer  # Return the vectorizer too

    return data_analysis_chatbot

# Scale the similarity score based on updated criteria
def scale_similarity_score(score):
    if score > 0.75:
        scaled_score = 20
    elif score > 0.6:
        scaled_score = 15
    elif score > 0.4:
        scaled_score = 10
    elif score > 0.3:
        scaled_score = 5
    else:
        scaled_score = 0
    return scaled_score

# Example usage
print("Welcome to the Data Analysis Interview Preparation Chatbot!")
print("Please select your experience level: (entry / mid / advanced)")

while True:
    user_experience = input("Your Experience Level: ").lower()
    if user_experience in ['entry', 'mid', 'advanced']:
        break
    else:
        print("Invalid input. Please enter 'entry', 'mid', or 'advanced'.")

chatbot_function = ask_questions(user_experience)
user_responses, expected_answers, tfidf_matrix, vectorizer = chatbot_function()  # Get vectorizer from the function

print("\nAssessing your answers...\n")

for idx, response in enumerate(user_responses):
    response = preprocess_text(response)
    response_vectorized = vectorizer.transform([response])  # Use the vectorizer here

    similarities = cosine_similarity(response_vectorized, tfidf_matrix)
    closest_answer_index = similarities.argmax()

    expected_answer = expected_answers[closest_answer_index]
    similarity_score = similarities[0, closest_answer_index]

    scaled_score = scale_similarity_score(similarity_score)

    print(f"Your Answer {idx + 1}: {user_responses[idx]}")
    print(f"Expected Answer: {expected_answer}")
    print(f"Similarity Score: {similarity_score}")
    print(f"Scaled Score: {scaled_score}\n")

Welcome to the Data Analysis Interview Preparation Chatbot!
Please select your experience level: (entry / mid / advanced)
Your Experience Level: mid
Bot: Explain the importance of feature selection in machine learning.
You: 
Bot: Differentiate between correlation and causation.
You: 
Bot: What is the difference between supervised and unsupervised learning?
You: 
Bot: What is the difference between classification and regression?
You: 
Bot: What is the difference between overfitting and underfitting?
You: 
Bot: What is the difference between bias and variance?
You: 
Bot: What are the different ways to evaluate the performance of a machine learning model?
You: 

Assessing your answers...

Your Answer 1: 
Expected Answer: Feature selection is crucial as it helps in reducing overfitting, improving accuracy, and reducing training time by selecting the most relevant features.
Similarity Score: 0.0
Scaled Score: 0

Your Answer 2: 
Expected Answer: Feature selection is crucial as it helps in re

In [14]:
# Preprocessing function for text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text.lower())
    filtered_words = [word for word in word_tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_words)

# Function to ask questions based on experience level
def ask_questions(experience):
    qa_pairs = df[df['Level'] == experience]
    preprocessed_questions = [preprocess_text(question) for question in qa_pairs['Question']]
    answers = qa_pairs['Answer'].tolist()

    behavioral_questions = df[df['Level'] == 'behavioral'].sample(1)  # Random behavioral question
    situational_questions = df[df['Level'] == 'situational'].sample(1)  # Random situational question

    random_level_questions = qa_pairs.sample(3)  # Random 3 questions from the specified level

    selected_questions = pd.concat([random_level_questions, behavioral_questions, situational_questions])
    selected_questions = selected_questions.sample(frac=1)  # Shuffle the selected questions

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([preprocess_text(q) for q in selected_questions['Question']])

    def data_analysis_chatbot():
        user_answers = []
        scores = []

        for idx, question in enumerate(selected_questions.iterrows()):
            print("Bot:", question[1]['Question'])
            user_input = input("You: ")
            user_answers.append(user_input)

            response = preprocess_text(user_input)
            response_vectorized = vectorizer.transform([response])

            similarities = cosine_similarity(response_vectorized, tfidf_matrix)
            closest_answer_index = similarities.argmax()

            expected_answer = selected_questions.iloc[idx]['Answer']
            similarity_score = similarities[0, closest_answer_index]

            scaled_score = scale_similarity_score(similarity_score)
            scores.append(scaled_score)

        total_score = sum(scores)
        return user_answers, selected_questions['Answer'].tolist(), total_score

    return data_analysis_chatbot

# Scale the similarity score based on updated criteria
def scale_similarity_score(score):
    if score > 0.75:
        scaled_score = 20
    elif score > 0.6:
        scaled_score = 15
    elif score > 0.4:
        scaled_score = 10
    elif score > 0.3:
        scaled_score = 5
    else:
        scaled_score = 0
    return scaled_score

# Example usage
print("Welcome to the Data Analysis Interview Preparation Chatbot!")
print("Please select your experience level: (entry / mid / advanced)")

while True:
    user_experience = input("Your Experience Level: ").lower()
    if user_experience in ['entry', 'mid', 'advanced']:
        break
    else:
        print("Invalid input. Please enter 'entry', 'mid', or 'advanced'.")

chatbot_function = ask_questions(user_experience)
user_responses, expected_answers, total_score = chatbot_function()

print("\nAssessing your answers...\n")

for idx, response in enumerate(user_responses):
    print(f"Your Answer {idx + 1}: {response}")
    print(f"Expected Answer: {expected_answers[idx]}")
    print()

print(f"Total Score: {total_score}")

Welcome to the Data Analysis Interview Preparation Chatbot!
Please select your experience level: (entry / mid / advanced)
Your Experience Level: mid
Bot: What is the difference between overfitting and underfitting?
You: 
Bot: What are the different ways to evaluate the performance of a machine learning model?
You: 
Bot: Tell me about a time when you had to give a presentation.
You: 
Bot: Differentiate between correlation and causation.
You: 
Bot: You are working on a team project and one of your teammates is not pulling their weight. What do you do?
You: 

Assessing your answers...

Your Answer 1: 
Expected Answer: Overfitting is a problem that occurs when a machine learning model is too complex and learns the training data too well, resulting in poor performance on new data. Underfitting is a problem that occurs when a machine learning model is not complex enough and does not learn the training data well enough, resulting in poor performance on both training and new data.

Your Answer

In [23]:
# Preprocessing function for text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text.lower())
    filtered_words = [word for word in word_tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_words)

# Function to ask questions based on experience level
def ask_questions(experience):
    qa_pairs = df[df['Level'] == experience]
    preprocessed_questions = [preprocess_text(question) for question in qa_pairs['Question']]
    answers = qa_pairs['Answer'].tolist()

    behavioral_question = df[df['Level'] == 'behavioral'].sample(1)
    situational_question = df[df['Level'] == 'situational'].sample(1)

    random_level_questions = qa_pairs.sample(3)

    selected_questions = pd.concat([random_level_questions, behavioral_question, situational_question])
    selected_questions = selected_questions.sample(frac=1)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([preprocess_text(q) for q in selected_questions['Question']])

    def data_analysis_chatbot():
        user_answers = []
        scores = []

        for idx, question in enumerate(selected_questions.iterrows()):
            print("Bot:", question[1]['Question'])
            user_input = input("You: ")
            user_answers.append(user_input)

            response = preprocess_text(user_input)
            response_vectorized = vectorizer.transform([response])

            similarities = cosine_similarity(response_vectorized, tfidf_matrix)
            closest_answer_index = similarities.argmax()

            expected_answer = selected_questions.iloc[idx]['Answer']
            similarity_score = similarities[0, closest_answer_index]

            scaled_score = scale_similarity_score(similarity_score)
            scores.append(scaled_score)

        total_score = sum(scores)
        return user_answers, selected_questions['Answer'].tolist(), total_score

    return data_analysis_chatbot

# Scale the similarity score based on updated criteria
def scale_similarity_score(score):
    if score > 0.75:
        scaled_score = 20
    elif score > 0.6:
        scaled_score = 15
    elif score > 0.4:
        scaled_score = 10
    elif score > 0.3:
        scaled_score = 5
    else:
        scaled_score = 0
    return scaled_score

# Function to provide feedback based on total score
def provide_feedback(total_score):
    if total_score >= 70:
        return f"Your performance was excellent across all categories with a score of {total_score}."
    elif 50 <= total_score < 70:
        return f"You displayed a good understanding of data science concepts, with a score of {total_score}."
    else:
        return f"Your performance indicates areas that need improvement with a score of {total_score}."

# Example usage
print("Welcome to the Data Analysis Interview Preparation Chatbot!")
print("Please select your experience level: (entry / mid / advanced)")

while True:
    user_experience = input("Your Experience Level: ").lower()
    if user_experience in ['entry', 'mid', 'advanced']:
        break
    else:
        print("Invalid input. Please enter 'entry', 'mid', or 'advanced'.")

chatbot_function = ask_questions(user_experience)
user_responses, expected_answers, total_score = chatbot_function()

print("\nAssessing your answers...\n")

#for idx, response in enumerate(user_responses):
#    print(f"Your Answer {idx + 1}: {response}")
#    print(f"Expected Answer: {expected_answers[idx]}")
#    print()

#print(f"Total Score: {total_score}")

feedback = provide_feedback(total_score)
print("\nFeedback Summary:")
print(feedback)

Welcome to the Data Analysis Interview Preparation Chatbot!
Please select your experience level: (entry / mid / advanced)
Your Experience Level: mid
Bot: You are working on a team project and one of your teammates is not pulling their weight. What do you do?
You: 
Bot: Tell me about a time when you had to give a presentation.
You: 
Bot: What is the difference between bias and variance?
You: 
Bot: What is the difference between classification and regression?
You: 
Bot: What are the different ways to evaluate the performance of a machine learning model?
You: 

Assessing your answers...


Feedback Summary:
Your performance indicates areas that need improvement with a score of 0.
