<a href="https://colab.research.google.com/github/AnanyaGarg51/IBY-Repo/blob/main/IBY_Transcript_Scores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EXPLORATORY DATA ANALYSIS: TRANSCRIPT_SCORES DATASET

**BASIC DATASET ANALYSIS**

In [None]:
# EDA on transcript_score dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the transcript-score CSV into a DataFrame.
# NOTE(review): the path points at an uploaded file under /content (the
# notebook carries an "Open In Colab" badge), so the file presumably has to be
# uploaded to the session before this cell runs — confirm for other setups.
DATA_PATH = '/content/1 (1).csv'
df = pd.read_csv(DATA_PATH)

# Columns whose means are reported at the end of this cell.
KEY_METRICS = ['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic', 'speech_speed']

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("DATASET INFORMATION:")
df.info()

print("\n")
# Show the first few rows of the data
print("FORMAT OF DATA PRESENT IN THE DATASET:")
print(df.head())

print("\n")
# Summary statistics as produced by DataFrame.describe()
print("SUMMARY STATISTICS: ")
print(df.describe())

print("\n")
# Column-wise mean of each key metric
avg_scores = df[KEY_METRICS].mean()
print("AVERAGE SCORES: ")
print(avg_scores)

print("\n")

**DISTRIBUTION OF KEY SENTIMENTS AND METRICS**

In [None]:
# Box plots comparing the spread of the sentiment and communication columns,
# drawn side by side on a single axes.
print("BOXPLOT ANALYSIS OF CANDIDATE'S SENTIMENT AND COMMUNICATION METRICS")

boxplot_columns = ['positive', 'negative', 'neutral', 'confident', 'hesitant', 'concise', 'enthusiastic']

box_fig, box_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df[boxplot_columns], ax=box_ax)
box_ax.set_title("Distribution of Key Metrics")

# Tilt the column names on the x-axis by 45 degrees.
plt.xticks(rotation=45)

box_fig.tight_layout()
plt.show()

**SENTIMENT ANALYSIS**

In [None]:
# Histograms (10 bins, with a KDE curve) of six score columns, laid out on a
# 2x3 grid of subplots.

print("SENTIMENT ANALYSIS: ")
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# (column, panel title) pairs, listed in the order the grid is filled:
# left to right along the top row, then along the bottom row.
histogram_panels = [
    ('positive', 'Positive Sentiment'),
    ('negative', 'Negative Sentiment'),
    ('neutral', 'Neutral Sentiment'),
    ('confident', 'Confidence'),
    ('enthusiastic', 'Enthusiasm'),
    ('concise', 'Conciseness'),
]

# One loop replaces six copy-pasted stanzas, so bin count and KDE settings are
# defined in exactly one place. axes.flat walks the 2x3 array row by row.
for panel_ax, (column, panel_title) in zip(axes.flat, histogram_panels):
    sns.histplot(df[column], bins=10, kde=True, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

**ANALYSIS OF 'POSITIVE', 'NEGATIVE', AND 'NEUTRAL' SENTIMENTS OVER THE SEQUENCE OF RESPONSES**

In [None]:
# Overlay the three sentiment columns as lines on one axes, with the 'id'
# column on the x-axis.
fig, ax = plt.subplots(figsize=(12, 6))

# Arguments common to all three line plots.
shared_kwargs = dict(data=df, x='id', ax=ax)

sns.lineplot(y='positive', label='Positive', **shared_kwargs)
sns.lineplot(y='negative', label='Negative', **shared_kwargs)
sns.lineplot(y='neutral', label='Neutral', **shared_kwargs)

# Title and axis labels in a single call.
ax.set(
    title='Sentiment Analysis Over Sequence of Responses',
    xlabel='Response ID',
    ylabel='Sentiment Value',
)

# Legend entries come from the label= given to each line above.
ax.legend()

plt.show()

**ANALYSIS OF 'ENTHUSIASTIC', 'CONCISE' AND 'CONFIDENT' ATTRIBUTES**

In [None]:
# Bar chart of the mean 'enthusiastic', 'concise' and 'confident' scores with
# standard-error bars, followed by a printout of the plotted numbers.

# Define the attributes we're interested in
attributes = ['enthusiastic', 'concise', 'confident']

# Calculate mean and standard error of the mean for each attribute
mean_values = df[attributes].mean()
se_values = df[attributes].sem()

# Create a bar plot; yerr draws the standard error as an error bar on each bar
plt.figure(figsize=(10, 6))
bars = plt.bar(attributes, mean_values, yerr=se_values, capsize=10, color=['#FFA07A', '#98FB98', '#87CEFA'])

# Customize the plot
plt.title('Average Levels of Key Attributes', fontsize=16)
plt.ylabel('Score (0-1 scale)', fontsize=12)
# NOTE(review): the fixed 0-1 range assumes every score lies in [0, 1];
# values outside it would be clipped from view — confirm against the data.
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Add value labels on top of each bar, centred horizontally on the bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.2f}',
             ha='center', va='bottom')

# Add a horizontal line at 0.5 to indicate the midpoint of the 0-1 axis
plt.axhline(y=0.5, color='r', linestyle='--', alpha=0.5)
plt.text(2.1, 0.51, 'Midpoint', color='r', va='bottom', ha='left')

plt.tight_layout()
plt.show()

# Print the exact values for verification.
# The leading "\n" puts a blank line before each heading. The earlier form,
# a backslash followed by a real line break inside the string, is a line
# continuation in Python and silently dropped that newline.
print("Mean Values:")
print(mean_values)
print("\nStandard Errors:")
print(se_values)

print("\nDone")

**SPEECH SPEED ANALYSIS**

In [None]:
# Mean of the 'speech_speed' column; drawn as a reference line below and
# reported at the end of the cell.
average_speech_speed = df['speech_speed'].mean()

# Line plot of 'speech_speed' against the 'start' column.
# NOTE(review): the x-axis label treats 'start' as seconds — confirm the units.
speed_fig, speed_ax = plt.subplots(figsize=(12, 6))
speed_ax.plot(df['start'], df['speech_speed'], label='Speech Speed')
speed_ax.axhline(
    y=average_speech_speed,
    color='r',
    linestyle='--',
    label=f'Avg Speech Speed: {average_speech_speed:.2f}',
)
speed_ax.set_title("Speech Speed Over Time")
speed_ax.set_xlabel("Time (seconds)")
speed_ax.set_ylabel("Speech Speed")
speed_ax.legend()
speed_fig.tight_layout()
plt.show()

# Round to two decimals. The bare expression on the last line is what the
# notebook displays as this cell's output value.
average_speech_speed_rounded = round(average_speech_speed, 2)
print("Candidate's Average Speech Speed: ")
average_speech_speed_rounded