# Part 3: Data Analysis and Visualization

**Name:** Brayden Uglione

**Date:** 10/10/24

**Exercise:** Project #1, Data Analysis and Visualization

**Purpose:** To analyze survey data from non-computing majors and provide insights that can help increase the number of students taking computing classes at CCM.

## Import Libraries and Load Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned survey results into `df`, the DataFrame that the
# analysis cells below read from.
# NOTE(review): the path is relative, so this presumably expects the CSV to
# sit in the notebook's working directory — confirm.
df = pd.read_csv('cleaned_survey_results.csv')

## Question 1: Which courses are students enrolled in?

The code counts and filters student enrollments in specific courses, visualizes the distribution with a bar chart, and prints the enrollment counts alongside course names.

In [None]:
# Course codes of interest, mapped to their full course names
course_key = {
    'CMP 135': 'Computer Concepts with Applications',
    'CMP 101': 'Computer Information Literacy',
    'CMP 126': 'Computer Technology and Applications',
}

# Tally how many respondents listed each course, then keep only the
# courses that appear in the course key
course_counts = df['which_course_are_you_currently_enrolled_in'].value_counts()
course_counts = course_counts.loc[course_counts.index.isin(list(course_key))]

# Bar chart of enrollment per course
plt.figure(figsize=(12, 6))
course_counts.plot.bar(color='violet')
plt.xlabel('Course')
plt.ylabel('Number of Students')
plt.title('Distribution of Enrolled Courses')
plt.xticks(rotation=0, ha='center')
plt.tight_layout()
plt.show()

# Text summary: course code, number of students, and full course name
print("Course Enrollment:")
for code, n_students in course_counts.items():
    course_name = course_key[code]
    print(f"{code}: {n_students} - {course_name}")

## Question 2: What percentage of students heard about CCM through each marketing channel?

The code identifies marketing channel columns, calculates the percentage of positive responses for each channel, formats channel names, creates a bar chart to visualize the results, and prints the effectiveness percentages for each marketing channel.

In [None]:
def calculate_percentage(series):
    """Return the percentage of non-missing entries in *series* equal to 'Yes'.

    Missing values are dropped before counting. If no entries remain after
    dropping them, the result is 0.
    """
    answered = series.dropna()
    total = len(answered)
    if total == 0:
        return 0
    yes_count = (answered == 'Yes').sum()
    return yes_count / total * 100

# Column-name prefix shared by the "how did you hear about CCM" questions
CHANNEL_PREFIX = 'how_did_you_hear_about_county_college_of_morris_'

# Cutoff, in percent: bars below it are drawn grey, bars at or above it blue.
# NOTE(review): the previous comment described this as a 5% threshold, but
# the code compares against 35.435 — confirm which cutoff is intended.
HIGHLIGHT_THRESHOLD = 35.435

# Extract the marketing channel columns, skipping any column whose name
# contains 'check_all_that_apply'
marketing_channels = [
    col for col in df.columns
    if col.startswith(CHANNEL_PREFIX) and 'check_all_that_apply' not in col
]

# Percentage of 'Yes' answers per channel, sorted ascending so the largest
# value ends up at the top of the horizontal bar chart
channel_percentages = df[marketing_channels].apply(calculate_percentage).sort_values(ascending=True)

# Turn column names into readable labels: drop the prefix, replace
# underscores with spaces, and title-case the result
formatted_labels = [
    channel.replace(CHANNEL_PREFIX, '').replace('_', ' ').title()
    for channel in channel_percentages.index
]

# Plot the percentage of students who heard about CCM
plt.figure(figsize=(12, 8))
bars = plt.barh(range(len(formatted_labels)), channel_percentages.values)

# Color each bar according to which side of the cutoff its value falls on
for bar, pct in zip(bars, channel_percentages.values):
    if pct < HIGHLIGHT_THRESHOLD:
        bar.set_color('#D3D3D3')  # Light grey: below the cutoff
    else:
        bar.set_color('#4682B4')  # Steel blue: at or above the cutoff

plt.title('Percentage of Students Who Heard About CCM Through Each Channel', fontsize=18)
plt.xlabel('Percentage of Students', fontsize=16)
plt.ylabel('Marketing Channel', fontsize=16)
plt.yticks(range(len(formatted_labels)), formatted_labels, fontsize=12)
plt.xticks(fontsize=12)
plt.tight_layout()
plt.show()

# Display the percentage of students who heard about CCM
print("Marketing Channel Effectiveness:")
for label, percentage in zip(formatted_labels, channel_percentages):
    print(f"{label}: {percentage:.2f}%")

## Question 3: Which prior computing experiences were most influential in students' decisions to enroll?

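This code calculates the percentage of students who answered "Yes" to each prior computing experience at County College of Morris, plots the results as a bar chart sorted from most to least common, and prints the percentage for each experience.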

In [None]:
# Column-name prefix shared by the prior-experience questions
EXPERIENCE_PREFIX = 'prior_to_applying_to_college_did_you_participate_in_any_of_the_following_events_or_activities_at_the_county_college_of_morris_andor_with_the_department_of_information_technologies_if_at_all_'

# Collect the prior-experience columns
experience_columns = [col for col in df.columns if col.startswith(EXPERIENCE_PREFIX)]

# Percentage of 'Yes' answers per experience, most common first
experience_percentages = df[experience_columns].apply(calculate_percentage).sort_values(ascending=False)

# Format the columns for plotting
formatted_labels = []
for col in experience_percentages.index:
    # Strip the shared prefix and keep the whole remainder. Taking only the
    # text after the last underscore would cut a multi-word experience name
    # down to its final word.
    label = col[len(EXPERIENCE_PREFIX):].replace('_', ' ').strip().title()
    if label == '':
        # Nothing is left after the prefix, so fall back to a generic label
        label = 'Other'
    formatted_labels.append(label)

# Plot the percentage of students who participated
plt.figure(figsize=(12, 6))
plt.bar(range(len(formatted_labels)), experience_percentages.values, color='lightgreen')
plt.title('Prior Computing Experiences of Students')
plt.xlabel('Experience Type')
plt.ylabel('Percentage of Students (Yes Responses)')
plt.xticks(range(len(formatted_labels)), formatted_labels, rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Display the percentage of students who participated
print("Prior Computing Experiences:")
for label, percentage in zip(formatted_labels, experience_percentages):
    print(f"{label}: {percentage:.2f}%")

## Question 4: Is there a correlation between gender and interest level in taking more computing courses?

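This code cross-tabulates gender against interest level (1–5) in taking more computing courses at County College of Morris, plots the percentage of each interest level within each gender as a stacked bar chart, and prints both the percentage table and the average interest level by gender.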

In [None]:
# Column holding the 1-to-5 interest rating for taking more computing courses
interest_column = 'on_a_scale_of_1_to_5_with_1_being_not_at_all_interested_and_5_being_extremely_interested_how_interested_are_you_taking_more_courses_in_computer_science_information_technology_or_game_development'

# Convert the ratings to numbers; values that cannot be parsed become NaN
# and are removed by the range filter below
df['interest_level'] = pd.to_numeric(df[interest_column], errors='coerce')

# Keep rows with a rating between 1 and 5 and a gender that is neither
# missing nor 'Unknown'.
# NOTE(review): this rebinds the shared `df`, so any cell run after this one
# sees only the filtered rows — confirm that is intended.
df = df[(df['interest_level'] >= 1) & (df['interest_level'] <= 5) & (df['gender'].notna()) & (df['gender'] != 'Unknown')]

# Cross-tabulate gender against interest level, then convert each gender's
# row to percentages so groups of different sizes can be compared
contingency_table = pd.crosstab(df['gender'], df['interest_level'])
contingency_table_percentage = contingency_table.div(contingency_table.sum(axis=1), axis=0) * 100

# Create a stacked bar plot to visualize the interest in computing.
# The size is passed to .plot() directly: DataFrame.plot creates its own
# figure when no `ax` is given, so a preceding plt.figure(figsize=...) call
# would only leave an empty extra figure and its size would not be applied.
contingency_table_percentage.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Interest in Taking More Computing Courses by Gender')
plt.xlabel('Gender')
plt.ylabel('Percentage')
plt.legend(title='Interest Level', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Print the contingency table percentages and average interest level
print("Contingency Table Percentages:")
print(contingency_table_percentage)

# Calculate the average interest level
average_interest = df.groupby('gender')['interest_level'].mean()
print("\nAverage Interest Level by Gender:")
print(average_interest)