In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

# Load the data
# All inputs are read from the current working directory.
emotion_data = pd.read_csv('emotion_6.csv')
transcript_data = pd.read_csv('transcriptscores_6.csv',delimiter = ',')
# Tab delimiter + no header: each line of the text file becomes one row in a
# single 'text' column.
# NOTE(review): assumes the transcript contains no tab characters or stray
# double quotes that would change how rows are parsed -- TODO confirm.
transcript_text = pd.read_csv('transcripttext_6.txt', delimiter='\t', header=None, names=['text'])
gaze_data = pd.read_csv('gaze_6.csv')
metadata = pd.read_csv('metadata_6.csv')
# Load transcript text
# The same file is also kept as one raw string (used later for the word count).
# The encoding is pinned to UTF-8 so this read decodes the file the same way
# pandas does above, instead of depending on the platform's locale encoding.
with open('transcripttext_6.txt', 'r', encoding='utf-8') as file:
    transcript_text1= file.read()

# Combine transcript data
# Series assignment aligns on the index; both frames are read without an
# index column, so rows are paired by position.
transcript_data['text'] = transcript_text['text']

def analyze_emotions(emotion_data):
    """Summarise the per-row emotion scores.

    Args:
        emotion_data: DataFrame with the columns 'angry', 'disgust', 'fear',
            'happy', 'sad', 'surprise' and 'neutral'.

    Returns:
        A tuple ``(emotion_summary, dominant_emotion, emotional_range)``:
        the per-emotion column means (a Series), the label of the emotion
        with the highest mean, and the sum of the per-emotion variances.
    """
    emotion_columns = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']
    scores = emotion_data[emotion_columns]

    mean_by_emotion = scores.mean()
    strongest = mean_by_emotion.idxmax()

    # "Range" here is the total of the column variances, not max - min.
    total_variance = scores.var().sum()

    return mean_by_emotion, strongest, total_variance

def analyze_speech(transcript_data):
    """Average the speech-delivery columns of the transcript scores.

    Args:
        transcript_data: DataFrame with the columns 'speech_speed',
            'confident', 'concise' and 'enthusiastic'.

    Returns:
        A 4-tuple of column means, in the order
        ``(speech_speed, confidence, conciseness, enthusiasm)``.
    """
    # Order matters: callers unpack the tuple positionally.
    metric_columns = ('speech_speed', 'confident', 'concise', 'enthusiastic')
    return tuple(transcript_data[column].mean() for column in metric_columns)

def analyze_sentiment(transcript_data):
    """Average the sentiment columns and pick the strongest one.

    Args:
        transcript_data: DataFrame with the columns 'positive', 'negative'
            and 'neutral'.

    Returns:
        A tuple ``(sentiment_scores, overall_sentiment)``: the per-column
        means (a Series) and the label of the column with the highest mean.
    """
    sentiment_columns = ['positive', 'negative', 'neutral']
    mean_scores = transcript_data[sentiment_columns].mean()
    return mean_scores, mean_scores.idxmax()

def analyze_gaze(gaze_data):
    """Compute summary statistics from the gaze table.

    Args:
        gaze_data: DataFrame with the columns 'gaze', 'blink' and
            'eye_offset'.

    Returns:
        A tuple ``(gaze_percentage, blink_rate, avg_eye_offset)``: the
        percentage of rows whose 'gaze' value equals 1, the mean of the
        'blink' column, and the mean absolute value of 'eye_offset'.
    """
    on_target_share = gaze_data['gaze'].eq(1).mean()
    mean_blink = gaze_data['blink'].mean()
    # Absolute value so offsets of opposite sign do not cancel out.
    mean_abs_offset = gaze_data['eye_offset'].abs().mean()

    return on_target_share * 100, mean_blink, mean_abs_offset

def analyze_metadata(metadata):
    """Extract the session-level figures from the metadata table.

    Args:
        metadata: DataFrame with the columns 'elapsed_time' and 'distance'.

    Returns:
        A tuple ``(total_duration, avg_distance)``: the maximum of
        'elapsed_time' and the mean of 'distance'.
    """
    # The largest elapsed-time value is reported as the total duration.
    return metadata['elapsed_time'].max(), metadata['distance'].mean()

def generate_insights(emotion_data, transcript_data, gaze_data, metadata):
    """Run every analysis step and collect the headline numbers in one dict.

    Args:
        emotion_data: DataFrame passed to ``analyze_emotions``.
        transcript_data: DataFrame passed to ``analyze_speech`` and
            ``analyze_sentiment``.
        gaze_data: DataFrame passed to ``analyze_gaze``.
        metadata: DataFrame passed to ``analyze_metadata``.

    Returns:
        A dict mapping a human-readable label to each metric. The full
        per-emotion and per-sentiment mean Series are not included; only
        the dominant labels derived from them are.
    """
    _emotion_means, top_emotion, emotion_spread = analyze_emotions(emotion_data)
    pace, mean_confidence, mean_conciseness, mean_enthusiasm = analyze_speech(transcript_data)
    _sentiment_means, leading_sentiment = analyze_sentiment(transcript_data)
    gaze_pct, blinks, eye_offset = analyze_gaze(gaze_data)
    duration, distance = analyze_metadata(metadata)

    # Key order is the order in which the metrics are printed by the caller.
    return {
        "Dominant Emotion": top_emotion,
        "Emotional Range": emotion_spread,
        "Average Speech Speed": pace,
        "Average Confidence": mean_confidence,
        "Average Conciseness": mean_conciseness,
        "Average Enthusiasm": mean_enthusiasm,
        "Overall Sentiment": leading_sentiment,
        "Gaze Percentage": gaze_pct,
        "Blink Rate": blinks,
        "Average Eye Offset": eye_offset,
        "Total Duration": duration,
        "Average Distance": distance,
    }

def plot_emotion_timeline(emotion_data):
    """Draw one line per emotion against 'image_seq' and save it as a PNG.

    Args:
        emotion_data: DataFrame with an 'image_seq' column plus the columns
            'angry', 'disgust', 'fear', 'happy', 'sad', 'surprise' and
            'neutral'.

    Side effects:
        Writes 'emotion_timeline.png' to the current working directory and
        closes the figure afterwards.
    """
    tracked_emotions = ('angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral')

    plt.figure(figsize=(12, 6))
    for name in tracked_emotions:
        plt.plot(emotion_data['image_seq'], emotion_data[name], label=name)

    plt.title("Emotion Timeline")
    plt.xlabel("Video Timeline")
    plt.ylabel("Emotion Intensity")
    plt.legend()

    plt.savefig('emotion_timeline.png')
    # Close the figure so later plots start from a clean pyplot state.
    plt.close()

def plot_speech_characteristics(transcript_data):
    """Plot four speech metrics against 'start' in a 2x2 grid and save it.

    Args:
        transcript_data: DataFrame with a 'start' column plus the columns
            'confident', 'concise', 'enthusiastic' and 'speech_speed'.

    Side effects:
        Writes 'speech_characteristics.png' to the current working
        directory and closes the figure afterwards.
    """
    characteristics = ['confident', 'concise', 'enthusiastic', 'speech_speed']

    fig, axs = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle("Speech Characteristics Over Time")

    # axs.flat walks the grid row by row, so the panels are filled
    # left-to-right, top-to-bottom in the order listed above.
    for panel, column in zip(axs.flat, characteristics):
        panel.plot(transcript_data['start'], transcript_data[column])
        panel.set_title(column.capitalize())
        panel.set_xlabel("Time (seconds)")
        panel.set_ylabel("Score")

    plt.tight_layout()
    plt.savefig('speech_characteristics.png')
    plt.close()

def plot_gaze_analysis(gaze_data):
    """Plot 'gaze' and 'eye_offset' against 'image_seq' and save the chart.

    Args:
        gaze_data: DataFrame with the columns 'image_seq', 'gaze' and
            'eye_offset'.

    Side effects:
        Writes 'gaze_analysis.png' to the current working directory and
        closes the figure afterwards.
    """
    # (column, legend label) pairs, drawn in this order on shared axes.
    series_to_draw = (('gaze', 'Gaze'), ('eye_offset', 'Eye Offset'))

    plt.figure(figsize=(12, 6))
    for column, legend_label in series_to_draw:
        plt.plot(gaze_data['image_seq'], gaze_data[column], label=legend_label)

    plt.title("Gaze and Eye Offset Over Time")
    plt.xlabel("Video Timeline")
    plt.ylabel("Value")
    plt.legend()

    plt.savefig('gaze_analysis.png')
    plt.close()

def get_word_frequency(text, top_n=50):
    r"""Return the most frequent words in *text*.

    The text is lowercased and split into runs of ``\w`` characters, so
    punctuation and whitespace act as separators.

    Args:
        text: The string to analyse.
        top_n: Maximum number of entries to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    lowered = text.lower()
    tokens = re.findall(r'\w+', lowered)
    tally = Counter(tokens)
    return tally.most_common(top_n)

def identify_expertise_areas(text, keywords):
    """Score each expertise area by how often its keywords occur in *text*.

    Matching is case-insensitive on the text side (the text is lowercased;
    keywords are used as given) and counts non-overlapping substring
    occurrences, so a keyword also matches inside longer words.

    Args:
        text: The string to search.
        keywords: Mapping of area name to a list of keyword strings.

    Returns:
        A dict mapping each area name to the total number of keyword
        occurrences, in the same order as *keywords*.
    """
    haystack = text.lower()
    return {
        area: sum(map(haystack.count, terms))
        for area, terms in keywords.items()
    }

# Generate insights
# Collects the headline metrics from the four DataFrames loaded earlier into
# a single label -> value dict.
insights = generate_insights(emotion_data, transcript_data, gaze_data, metadata)

# Generate plots
# Each call saves one PNG into the current working directory.
plot_emotion_timeline(emotion_data)
plot_speech_characteristics(transcript_data)
plot_gaze_analysis(gaze_data)

# Print insights
# One "label: value" line per metric, in the dict's insertion order.
print("Candidate Analysis Insights:")
for key, value in insights.items():
    print(f"{key}: {value}")

print("\nPlots 'emotion_timeline.png', 'speech_characteristics.png', and 'gaze_analysis.png' have been generated.")
# Word frequency analysis

# Analyze transcript content
# Whitespace-delimited token count of the raw transcript string.
word_count = len(transcript_text1.split())
print(f"Word Count: {word_count}")

# List of stop words to remove
stop_words = ['i', 'and', 'to', 'the', 'a', 'in', 'of', 'is', 'this', 'my','you','at','am','an','have','be']
# Hashed copy for constant-time membership tests in the loop below; the list
# itself is kept so the `stop_words` name retains its original type.
_stop_word_lookup = frozenset(stop_words)

# `get_word_frequency` returns a list of (word, count) tuples for the most
# common words (top 50 by default), with every word already lowercased.
word_freq = get_word_frequency(transcript_text['text'].str.cat(sep=' '))

# Filter and print words that are not in stop_words.
# Filtering happens after the top-N cut, so fewer than N lines may be printed.
# No extra lowercasing is needed: the words arrive lowercased.
for word, count in word_freq:
    if word not in _stop_word_lookup:
        print(f"{word}: {count}")


# Keyword lists used to score each expertise area. Matching is by substring
# on the lowercased transcript (see identify_expertise_areas).
expertise_keywords = {
    "Regulatory Affairs": ["regulatory", "affairs", "pharmaceutical"],
    "Medical Writing": ["medical", "writer", "writing"],
    "Drug Safety": ["drug", "safety", "risk", "management"],
    "Research": ["research", "work", "patent", "publication"],
    "Biotechnology": ["biotechnology", "tech"],
    "Management": ["management", "postgraduate", "business"],
}

# Join every transcript row into one space-separated string before scoring.
full_transcript = transcript_text['text'].str.cat(sep=' ')
expertise_areas = identify_expertise_areas(full_transcript, expertise_keywords)

print("\nExpertise areas based on keyword frequency:")
# Highest score first; the sort is stable, so ties keep the dict's order.
ranked_areas = sorted(expertise_areas, key=expertise_areas.get, reverse=True)
for area in ranked_areas:
    score = expertise_areas[area]
    print(f"{area}: {score}")

Candidate Analysis Insights:
Dominant Emotion: neutral
Emotional Range: 363.7099264544094
Average Speech Speed: 2.583163463330654
Average Confidence: 0.6797547514239947
Average Conciseness: 0.36779167668686974
Average Enthusiasm: 0.48143330671721035
Overall Sentiment: positive
Gaze Percentage: 100.0
Blink Rate: 0.0
Average Eye Offset: 1.7071928571428572
Total Duration: 19.0
Average Distance: 0.0

Plots 'emotion_timeline.png', 'speech_characteristics.png', and 'gaze_analysis.png' have been generated.
Word Count: 226
for: 8
analytics: 6
college: 4
mba: 3
from: 2
iim: 2
experience: 2
deloitte: 2
end: 2
also: 2
graduation: 2
has: 2
me: 2
media: 2
love: 2
towards: 2
hi: 1
name: 1
nathan: 1
lewis: 1
m: 1
first: 1
year: 1
student: 1
kashipur: 1
having: 1
consulting: 1
three: 1
years: 1
after: 1
graduating: 1
engineering: 1
paved: 1
way: 1
degree: 1
quench: 1
thirst: 1
exploration: 1

Expertise areas based on keyword frequency:
Regulatory Affairs: 1
Medical Writing: 1
Research: 1
Management: 1