# Part 3: Data Analysis and Visualization

**Name:** Brayden Uglione

**Date:** 10/10/24

**Exercise:** Project #1, Data Analysis and Visualization

**Purpose:** To analyze survey data from non-computing majors and provide insights that can help increase the number of students taking computing classes at CCM.

## Import Libraries and Load Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the cleaned survey responses into the DataFrame used by every question below
cleaned_survey_path = 'cleaned_non_majors_survey_results.csv'
df = pd.read_csv(cleaned_survey_path)

## Question 1: Which courses are students enrolled in?

The code counts and filters student enrollments in specific courses, visualizes the distribution with a bar chart, and prints the enrollment counts alongside course names.

In [None]:
# Full course titles keyed by course code; only these courses are reported
course_key = {
    'CMP 135': 'Computer Concepts with Applications',
    'CMP 101': 'Computer Information Literacy',
    'CMP 126': 'Computer Technology and Applications',
}

# Tally enrollments per course, then keep only the courses listed in course_key
course_counts = df['which_course_are_you_currently_enrolled_in'].value_counts()
course_counts = course_counts.loc[course_counts.index.isin(list(course_key))]

# Bar chart of the enrollment tallies (Series.plot draws on the figure created here)
plt.figure(figsize=(12, 6))
enrollment_axes = course_counts.plot.bar(color='violet')
enrollment_axes.set_title('Distribution of Enrolled Courses')
enrollment_axes.set_xlabel('Course')
enrollment_axes.set_ylabel('Number of Students')
plt.xticks(rotation=0, ha='center')
plt.show()

# Text summary: course code, number of students, and the full course title
print("Course Enrollment:")
for course_code, student_total in course_counts.items():
    course_title = course_key[course_code]
    print(f"{course_code}: {student_total} - {course_title}")

## Question 2: What percentage of students heard about CCM through each marketing channel?

The code identifies marketing channel columns, calculates the percentage of positive responses for each channel, formats channel names, creates a bar chart to visualize the results, and prints the effectiveness percentages for each marketing channel.

In [None]:
def calculate_percentage(series):
    """Return the share of 'Yes' answers, in percent, among the 'Yes'/'No' answers.

    Any value other than 'Yes' or 'No' (including missing values) is ignored.
    Returns 0 when the series contains no 'Yes'/'No' answers at all.
    """
    answered = series[series.isin(['Yes', 'No'])]
    if len(answered) == 0:
        return 0
    yes_total = (answered == 'Yes').sum()
    return yes_total / len(answered) * 100

def format_channel_name(channel):
    """Turn a marketing-channel column name into a title-cased display label.

    The shared survey-question prefix is removed, underscores become spaces,
    and each word is capitalized.
    """
    question_prefix = 'how_did_you_hear_about_county_college_of_morris_'
    words = channel.replace(question_prefix, '').split('_')
    return ' '.join(words).title()

# Marketing-channel columns share one prefix; the "check all that apply" summary column is skipped
marketing_channels = [
    column
    for column in df.columns
    if column.startswith('how_did_you_hear_about_county_college_of_morris_')
    and 'check_all_that_apply' not in column
]

# Percentage of 'Yes' answers per channel, highest first
channel_percentages = df[marketing_channels].apply(calculate_percentage)
channel_percentages_sorted = channel_percentages.sort_values(ascending=False)

# Human-readable labels in the same order as the sorted percentages
formatted_labels = [format_channel_name(name) for name in channel_percentages_sorted.index]

# Bar chart of the percentage of students reached by each channel
bar_positions = range(len(formatted_labels))
plt.figure(figsize=(12, 6))
plt.bar(bar_positions, channel_percentages_sorted.values, color='skyblue')
plt.title('Percentage of Students Who Heard About CCM Through Each Channel')
plt.xlabel('Marketing Channel')
plt.ylabel('Percentage of Students')
plt.xticks(bar_positions, formatted_labels, rotation=90, ha='center')
plt.tight_layout()
plt.show()

# Text summary of the same percentages, one channel per line
print("Marketing Channel Effectiveness:")
for position, label in enumerate(formatted_labels):
    percentage = channel_percentages_sorted.iloc[position]
    print(f"{label}: {percentage:.2f}%")

## Question 3: Which prior computing experiences were most influential in students' decisions to enroll?

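The code identifies the survey columns for events and activities students took part in before applying to County College of Morris, calculates the percentage of "Yes" responses for each one, formats the column names into readable labels, creates a bar chart sorted from most to least common, and prints the percentage for each experience.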

In [None]:
def calculate_yes_percentage(series):
    """Return the percentage of 'Yes' answers among the valid ('Yes'/'No') answers.

    Values other than 'Yes' or 'No' are excluded from both the numerator and
    the denominator. Returns 0 when there are no valid answers.
    """
    is_valid = series.isin(['Yes', 'No'])
    total_valid = len(series[is_valid])
    yes_count = (series[is_valid] == 'Yes').sum()
    if total_valid > 0:
        return yes_count / total_valid * 100
    return 0

def format_column_name(col):
    """Turn a prior-experience column name into a title-cased display label.

    The long survey-question prefix is removed, underscores become spaces,
    and each word is capitalized.
    """
    question_prefix = 'prior_to_applying_to_college_did_you_participate_in_any_of_the_following_events_or_activities_at_the_county_college_of_morris_andor_with_the_department_of_information_technologies_if_at_all_'
    words = col.replace(question_prefix, '').split('_')
    return ' '.join(words).title()

# Every "prior experience" survey column starts with this question text
experience_prefix = 'prior_to_applying_to_college_did_you_participate_in_any_of_the_following_events_or_activities_at_the_county_college_of_morris_andor_with_the_department_of_information_technologies_if_at_all_'

# Identify columns related to prior computing experiences
experience_columns = [column for column in df.columns if column.startswith(experience_prefix)]

# Calculate percentage of students with each prior computing experience
experience_percentages = df[experience_columns].apply(calculate_yes_percentage)

# Sort experiences by percentage in descending order
experience_percentages_sorted = experience_percentages.sort_values(ascending=False)

# Format experience names for display (same order as the sorted percentages)
formatted_labels = [format_column_name(column) for column in experience_percentages_sorted.index]

# Create a bar plot to visualize the distribution of prior computing experiences
bar_positions = range(len(formatted_labels))
plt.figure(figsize=(12, 6))
plt.bar(bar_positions, experience_percentages_sorted.values, color='lightgreen')
plt.title('Prior Computing Experiences of Students')
plt.xlabel('Experience Type')
plt.ylabel('Percentage of Students (Yes Responses)')
plt.xticks(bar_positions, formatted_labels, rotation=45, ha='right')
# Rotated tick labels extend below the axes; tight_layout keeps them inside the
# figure, matching the marketing-channel chart
plt.tight_layout()
plt.show()

# Print the percentage of students for each prior computing experience
print("Prior Computing Experiences:")
for label, percentage in zip(formatted_labels, experience_percentages_sorted):
    print(f"{label}: {percentage:.2f}%")

## Question 4: Is there a correlation between gender and interest level in taking more computing courses?

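The code cleans the interest-level responses (keeping only whole-number ratings from 1 to 5 and dropping rows with a missing or unknown gender), builds a contingency table of gender versus interest level expressed as percentages within each gender, visualizes it with a stacked bar chart, and prints the table along with the average interest level for each gender.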

In [None]:
# Rename the long column name to a more manageable 'interest_level'
df = df.rename(columns={'on_a_scale_of_1_to_5_with_1_being_not_at_all_interested_and_5_being_extremely_interested_how_interested_are_you_taking_more_courses_in_computer_science_information_technology_or_game_development': 'interest_level'})

# Convert 'interest_level' to numeric, replacing non-numeric values with NaN
interest_level = pd.to_numeric(df['interest_level'], errors='coerce')

# Blank out fractional ratings (e.g. 3.5) so that only whole-number answers remain.
# This is done with a vectorized check rather than calling x.is_integer() on each
# element: when the column parses as all integers the elements are Python ints,
# and int.is_integer() only exists on Python 3.12+.
df['interest_level'] = interest_level.where(interest_level % 1 == 0)

# Remove rows with missing values in 'interest_level' or 'gender'
df = df.dropna(subset=['interest_level', 'gender'])

# Filter out rows where interest_level is not between 1 and 5 (inclusive)
df = df[df['interest_level'].between(1, 5)]

# Remove rows where gender is 'Unknown'
df = df[df['gender'] != 'Unknown']

# All remaining ratings are whole numbers, so store them as integers; this makes
# the contingency table columns and legend read 1-5 instead of 1.0-5.0
df = df.assign(interest_level=df['interest_level'].astype(int))

# Create a contingency table of gender vs interest level
contingency_table = pd.crosstab(df['gender'], df['interest_level'])

# Calculate percentages for each gender (each row sums to 100)
contingency_table_percentage = contingency_table.div(contingency_table.sum(axis=1), axis=0) * 100

# Create a stacked bar plot to visualize interest levels by gender.
# DataFrame.plot opens its own figure unless it is given an axes, so the axes
# are created first and passed in; otherwise the figsize is ignored and an
# empty extra figure is shown.
fig, ax = plt.subplots(figsize=(10, 6))
contingency_table_percentage.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Interest in Taking More Computing Courses by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Percentage')
ax.legend(title='Interest Level', bbox_to_anchor=(1.05, 1), loc='upper left')
# Fit the axes and tick labels within the figure
plt.tight_layout()
plt.show()

# Print the contingency table percentages
print("Contingency Table Percentages:")
print(contingency_table_percentage)

# Calculate and print the average interest level by gender
average_interest = df.groupby('gender')['interest_level'].mean()
print("\nAverage Interest Level by Gender:")
print(average_interest)