# Student Behaviour Analysis for the Career Advancement and Engagement Department

## 1) - Library Imports

In [1]:
import pandas as pd
import numpy as np
import plotly as plt
import plotly.express as px
import matplotlib.pyplot as plt
import os
import dtale
import pandas_profiling
import sweetviz

# Show full cell contents in displayed frames instead of truncating long strings
pd.set_option('display.max_colwidth', None)

## 2) - Reading in Data

In [2]:
# Read the survey sheet with pandas.
# The workbook is opened explicitly in binary mode because reading it otherwise
# raised a 'utf-8' codec start-byte error. The context manager makes sure the
# file handle is closed once the sheet has been parsed.
with open('../data/assessment.xlsx', 'rb') as workbook:
    df = pd.read_excel(workbook,
                       sheet_name='Student_Behaviour')  # note British spelling of the sheet name
                   

In [3]:
# Column names, non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 17 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   Certification Course                                    235 non-null    object 
 1   Gender                                                  235 non-null    object 
 2   Department                                              235 non-null    object 
 3   10th Mark                                               235 non-null    float64
 4   12th Mark                                               235 non-null    float64
 5   college mark                                            235 non-null    float64
 6   hobbies                                                 235 non-null    object 
 7   daily studing time                                      235 non-null    object 
 8   prefer to study in                      

In [4]:
# Preview the first rows of the raw data (original column names and labels)
df.head()

Unnamed: 0,Certification Course,Gender,Department,10th Mark,12th Mark,college mark,hobbies,daily studing time,prefer to study in,salary expectation,Do you like your degree?,willingness to pursue a career based on their degree,social medai & video,Travelling Time,Stress Level,Financial Status,part-time job
0,No,Male,BCA,76.0,70.0,67.0,Sports,1 - 2 Hour,Morning,55100,No,0.5,1 - 1.30 hour,30 - 60 minutes,Good,good,False
1,Yes,Male,BCA,75.0,57.0,55.0,Sports,0 - 30 minute,Morning,55100,Yes,0.5,1.30 - 2 hour,0 - 30 minutes,Good,good,False
2,Yes,Male,B.com ISM,67.0,70.0,60.0,Sports,More Than 4 hour,Anytime,55100,Yes,1.0,1 - 30 Minute,0 - 30 minutes,Good,good,False
3,Yes,Male,BCA,89.0,69.0,80.0,Video game,3 - 4 hour,Morning,55100,Yes,0.75,1 - 1.30 hour,more than 3 hour,Good,good,True
4,No,Male,BCA,80.0,70.0,60.0,Video Games,0 - 30 minute,Anytime,55200,Yes,0.25,1 - 30 Minute,0 - 30 minutes,Good,good,False


## 3) - Cleaning Data

### 3a) -  Making column names consistent and concise.

In [5]:
# Map the raw survey headers to short snake_case names.
# rename() only matches a header character-for-character; a few keys below
# end in whitespace and must be kept exactly as written.
dictionary = {
    'Certification Course' : "cert_course",
    'Gender' : 'gender',
    'Department': 'department',
    '10th Mark': 'mark_10th',
    '12th Mark': 'mark_12th',
    'college mark': 'mark_college',
    'hobbies': 'hobbies',
    'daily studing time' : 'study_hr_day',
    'prefer to study in': 'study_window',
    'salary expectation': 'salary_expected',
    'Do you like your degree?' : 'degree_happy',
    'willingness to pursue a career based on their degree  ' : 'career_in_degree',
    'social medai & video' : 'social_video_hr_day',
    'Travelling Time ' : 'travel_hr_day',
    'Stress Level ' : 'stress_status',
    'Financial Status' : 'financial_status',
    'part-time job' : 'job_pt'}

# Reassign instead of mutating in place so the cell stays safe to re-run.
df = df.rename(columns=dictionary)

# rename() silently skips keys that match no header, so verify the result:
# every target name must now be a column of the frame.
missing_columns = sorted(set(dictionary.values()) - set(df.columns))
assert not missing_columns, f"rename did not produce: {missing_columns}"

### 3b) -  Ensuring Correct Data Types

In [6]:
# job_pt is read as a boolean; relabel it Yes/No before casting
df['job_pt'] = df['job_pt'].replace({True: 'Yes', False: 'No'})

# Columns to be stored with the pandas category dtype
cat_convert = [
    'gender', 'department', 'hobbies',
    'study_hr_day', 'study_window',
    'social_video_hr_day', 'travel_hr_day',
    'stress_status', 'financial_status',
    'job_pt', 'cert_course', 'degree_happy',
]

# Cast each listed column to category, one at a time
for col in cat_convert:
    df[col] = df[col].astype('category')

### 3c) -  Ensuring Correct Labeling of Categorical Data

**Inconsistencies in categorical labels**
- **dept** - `B.com Accounting & Finance ` and `B.com Accounting and Finance ` 
    - standardized and removed trailing spaces
- **hobbies** 
    - fixed casing of labels
- **`*`_daily** 
    - time buckets addressed to ensure uniformity
    - Relabeled all categories in terms of hours
    - **social_video_hr_day** - Combined `0 Minute` and `1 - 30 Minute` buckets into new `0 to 0.5` to maintain consistency across time buckets
        - 5 students reported 0 minutes
        - 47 students reported 1-30 minutes
- **`*_status`** - Fixed capitalization.

#### 3c_1) - Category Mapping Dictionaries

In [7]:
# Manual fix - social_video_hr_day: fold both sub-30-minute answers into a single label
short_social_labels = dict.fromkeys(['0 Minute', '1 - 30 Minute'], '0 - 30 minutes')
df['social_video_hr_day'] = df['social_video_hr_day'].replace(short_social_labels)

# Manual fix - hobbies: bring the two differently-cased labels in line with the rest
hobby_case_fixes = {
    'Video game': 'Video Games',
    'Reading books': 'Reading Books',
}
df['hobbies'] = df['hobbies'].replace(hobby_case_fixes)

In [8]:
# Label mapping dictionaries (raw label -> clean label).
# The insertion order of the bucket/level mappings is meaningful: their values
# are reused later as the category order.

# Both spellings (each with a trailing space) collapse to one department name
department_mapping = dict.fromkeys(
    ['B.com Accounting and Finance ', 'B.com Accounting & Finance '],
    'B.com Accounting & Finance',
)

hobby_mapping = {'Video Games': 'Gaming', 'Cinema': 'Movies', 'Reading Books': 'Reading'}

# Capitalise the lower-case sentiment labels
sentiment_mapping = {label: label.capitalize() for label in ('fabulous', 'good', 'bad', 'awful')}

# Time buckets, relabelled in hours
study_hr_day_mapping = {
    '0 - 30 minute':    '0 to 0.5',
    '30 - 60 minute':   '0.5 to 1',
    '1 - 2 Hour':       '1 to 2',
    '2 - 3 hour':       '2 to 3',
    '3 - 4 hour':       '3 to 4',
    'More Than 4 hour': '4+',
}

social_video_hr_day_mapping = {
    '0 - 30 minutes':   '0 to 0.5',
    '30 - 60 Minute':   '0.5 to 1',
    '1 - 1.30 hour':    '1 to 1.5',
    '1.30 - 2 hour':    '1.5 to 2',
    'More than 2 hour': '2+',
}

travel_hr_day_mapping = {
    '0 - 30 minutes':   '0 to 0.5',
    '30 - 60 minutes':  '0.5 to 1',
    '1 - 1.30 hour':    '1 to 1.5',
    '1.30 - 2 hour':    '1.5 to 2',
    '2 - 2.30 hour':    '2 to 2.5',
    '2.30 - 3 hour':    '2.5 to 3',
    'more than 3 hour': '3+',
}

# Numeric willingness score -> descriptive level
willingess_mapping = {
    1: 'Eager',
    0.75: 'Willing',
    0.5: 'Neutral',
    0.25: 'Reluctant',
    0: 'Unwilling',
}

# Apply each mapping to its column; labels without an entry are left unchanged
label_fixes = [
    ('department', department_mapping),
    ('stress_status', sentiment_mapping),
    ('financial_status', sentiment_mapping),
    ('study_hr_day', study_hr_day_mapping),
    ('social_video_hr_day', social_video_hr_day_mapping),
    ('travel_hr_day', travel_hr_day_mapping),
    ('career_in_degree', willingess_mapping),
    ('hobbies', hobby_mapping),
]
for column, mapping in label_fixes:
    df[column] = df[column].replace(mapping)

In [9]:
# Preview the first rows after renaming columns and relabelling categories
df.head()

Unnamed: 0,cert_course,gender,department,mark_10th,mark_12th,mark_college,hobbies,study_hr_day,study_window,salary_expected,degree_happy,career_in_degree,social_video_hr_day,travel_hr_day,stress_status,financial_status,job_pt
0,No,Male,BCA,76.0,70.0,67.0,Sports,1 to 2,Morning,55100,No,Neutral,1 to 1.5,0.5 to 1,Good,Good,No
1,Yes,Male,BCA,75.0,57.0,55.0,Sports,0 to 0.5,Morning,55100,Yes,Neutral,1.5 to 2,0 to 0.5,Good,Good,No
2,Yes,Male,B.com ISM,67.0,70.0,60.0,Sports,4+,Anytime,55100,Yes,Eager,0 to 0.5,0 to 0.5,Good,Good,No
3,Yes,Male,BCA,89.0,69.0,80.0,Gaming,3 to 4,Morning,55100,Yes,Willing,1 to 1.5,3+,Good,Good,Yes
4,No,Male,BCA,80.0,70.0,60.0,Gaming,0 to 0.5,Anytime,55200,Yes,Reluctant,0 to 0.5,0 to 0.5,Good,Good,No


#### 3c_2) - Ordering Categories

In [10]:
# Ordered categories: each column gets an explicit category order.
# For mapped columns the order is the insertion order of the mapping's values.

# Order used for the Yes/No columns (not backed by a mapping dictionary)
y_n = ('Yes', 'No')

ordered_cats = {
    column: list(labels)
    for column, labels in (
        ('stress_status', sentiment_mapping.values()),
        ('financial_status', sentiment_mapping.values()),
        ('study_hr_day', study_hr_day_mapping.values()),
        ('social_video_hr_day', social_video_hr_day_mapping.values()),
        ('travel_hr_day', travel_hr_day_mapping.values()),
        ('career_in_degree', willingess_mapping.values()),
        ('cert_course', y_n),
        ('degree_happy', y_n),
        ('job_pt', y_n),
    )
}

# Cast every listed column to an ordered categorical with its category list
for col, cats in ordered_cats.items():
    ordered_dtype = pd.CategoricalDtype(categories=cats, ordered=True)
    df[col] = df[col].astype(ordered_dtype)
 

#### 3c_3) - (This doesn't work for my purposes) Exporting ordered categories for PowerBI

In [11]:
# # Create and save ordering tables as CSVs
# dfs_to_save = {
#     'sentiment_order': sentiment_mapping,
#     'study_hr_day_order': study_hr_day_mapping,
#     'social_video_hr_day_order': social_video_hr_day_mapping,
#     'travel_hr_day_order': travel_hr_day_mapping,
#     'willingess_order': willingess_mapping
# }

# # Combining all _hr_day mappings into one table
# hr_day_mappings = {**study_hr_day_mapping, **social_video_hr_day_mapping, **travel_hr_day_mapping}
# dfs_to_save['combined_hr_day_order'] = hr_day_mappings

# # Creating DataFrames and saving to CSV
# saved_files = []
# for filename, mapping in dfs_to_save.items():
#     df_order = pd.DataFrame({
#         'Category': list(mapping.values()),
#         'Order': range(1, len(mapping) + 1)
#     })
#     filepath = f"../data/order_tables/{filename}.csv"
#     df_order.to_csv(filepath, index=False)
#     saved_files.append(filepath)

## 4) - Overview of Clean Data

### 4.1) - Data Types and Counts

In [12]:
# Per-column overview: dtype, non-null count, distinct values, and null count
column_overview = {
    "data_type": df.dtypes,
    "total_values": df.count(),
    "unique_values": df.nunique(),
    "missing_values": df.isnull().sum(),
}
df_info = pd.DataFrame(column_overview)
df_info

Unnamed: 0,data_type,total_values,unique_values,missing_values
cert_course,category,235,2,0
gender,category,235,2,0
department,category,235,4,0
mark_10th,float64,235,68,0
mark_12th,float64,235,67,0
mark_college,float64,235,39,0
hobbies,category,235,4,0
study_hr_day,category,235,6,0
study_window,category,235,3,0
salary_expected,int64,235,84,0


### 4.2) - Categories and Their Labels

In [13]:
# One row per categorical column, listing the labels that occur in it
label_rows = []
for col in df.select_dtypes('category').columns:
    label_rows.append((col, df[col].unique().tolist()))

qualitative = pd.DataFrame(label_rows, columns=['Category', 'Labels'])
qualitative

Unnamed: 0,Category,Labels
0,cert_course,"[No, Yes]"
1,gender,"[Male, Female]"
2,department,"[BCA, B.com ISM, Commerce, B.com Accounting & Finance]"
3,hobbies,"[Sports, Gaming, Movies, Reading]"
4,study_hr_day,"[1 to 2, 0 to 0.5, 4+, 3 to 4, 0.5 to 1, 2 to 3]"
5,study_window,"[Morning, Anytime, Night]"
6,degree_happy,"[No, Yes]"
7,career_in_degree,"[Neutral, Eager, Willing, Reluctant, Unwilling]"
8,social_video_hr_day,"[1 to 1.5, 1.5 to 2, 0 to 0.5, 0.5 to 1, 2+]"
9,travel_hr_day,"[0.5 to 1, 0 to 0.5, 3+, 1 to 1.5, 2 to 2.5, 1.5 to 2, 2.5 to 3]"


-----
# Code Questions
-----


## 1) - Salary Expectations:
> Challenge: Show Median and Average Salary expectations by department


**Notes:**
There is an extreme outlier in BCA that needs to be handled.
$$ Z = \frac{{x - \mu}}{{\sigma}} = \frac{{130000.0 - 60283.333333}}{{6708.228588}} = 10.39271 $$

### 1a) - Descriptive Statistics

> **Note** - The data seems to have a fairly tight distribution.  However, I notice an outlier in the `max` column of the BCA department.

In [14]:
# Descriptive statistics of expected salary within each department
salary_by_department = df.groupby('department')['salary_expected']
salary_by_department.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
B.com Accounting & Finance,15.0,60293.333333,2877.614753,55600.0,57950.0,60500.0,62500.0,64500.0
B.com ISM,28.0,59835.714286,2668.362556,55100.0,57975.0,59700.0,62075.0,64500.0
BCA,132.0,60283.333333,6708.228588,55100.0,57450.0,59750.0,62225.0,130000.0
Commerce,60.0,59863.333333,2399.997646,55500.0,57700.0,59650.0,61325.0,64300.0



### 1b) Salary Expectations by department (Histogram)


> **Note**: The mean and median are so close as to be indistinguishable from one another when shown as bars.  A better visual will be a box plot.

In [15]:
# Bar chart: median and mean expected salary per department
salary_stats = df.groupby('department')['salary_expected'].agg(['median', 'mean']).reset_index()

# Horizontal bars: departments on the y-axis, one bar per statistic on the x-axis.
fig_salary = px.bar(salary_stats, y='department', x=['median', 'mean'],
                    labels={'value':'Salary', 'variable':'Statistic'},
                    title='Median and Average Salary Expectations by Department')

# Place median and mean side by side ('group'); stacking would add the two
# statistics into a single bar whose length means nothing.
# Axis titles follow the orientation: salary along x, department along y.
fig_salary.update_layout(barmode='group', xaxis_title='Salary Expectation', yaxis_title='Department',
                         legend_title='Statistic')
fig_salary.show()

### 1c) Salary Expectations by department (Boxplot)

> **Notes:**
> The outlier is hidden in the visual (by the y-axis range) and excluded from the overall-mean line calculation.  However, it was not removed from the dataset.

In [16]:
# Salary Expectations Box Plot
fig = px.box(
    df,
    x='department',
    y='salary_expected',
    title='Distribution of Salary Expectations by Department',
    labels={'salary_expected': 'Salary Expected', 'department': 'Department'}
)

# Expectations above this value are left out of the reference mean
# (e.g. the 130,000 maximum reported for BCA in the descriptive statistics).
salary_outlier_cutoff = 70000
overall_mean = df.loc[df['salary_expected'] <= salary_outlier_cutoff, 'salary_expected'].mean()

# Update layout for better readability
fig.update_layout(
    xaxis_title='Department',
    yaxis_title='Expected Salary',
    margin=dict(l=20, r=20, t=60, b=20),
    yaxis=dict(range=[55000, 65000]),  # zoom in; points outside this window are not visible
    # Only position the title here; its text is already set in px.box above
    title={
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)'
)

# Reference line at the overall mean. add_hline spans the full plot width, so
# it does not depend on how many departments sit on the x-axis.
fig.add_hline(
    y=overall_mean,
    line=dict(
        color='red',
        width=0.8,
        dash='dash'
    )
)

# Export for the dashboard, then display
fig.write_html('../dashboard/plotly_html/q1_salary.html')
fig.show()

### 1d) Salary Expectations vs. Study Hours

In [17]:
# Display order for the study-time buckets, shortest to longest.
# study_hr_day_mapping is written in ascending order, so the insertion order of
# its values is the intended order. Sorting the raw keys alphabetically would
# place '30 - 60 minute' after '3 - 4 hour' and scramble the x-axis.
ordered_categories = list(study_hr_day_mapping.values())

df['study_hr_day'] = pd.Categorical(df['study_hr_day'], categories=ordered_categories, ordered=True)

# Visualization: Salary Expectations vs. Study Hours
fig_salary_study_hours = px.box(df, x='study_hr_day', y='salary_expected',
                                title='Salary Expectation Based on Daily Study Hours',
                                labels={'study_hr_day': 'Daily Study Hours',
                                        'salary_expected': 'Expected Salary'})

# Update layout and labels
fig_salary_study_hours.update_layout(
    title={
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    yaxis_title='Expected Salary',
    margin=dict(l=20, r=20, t=60, b=20),
    yaxis=dict(range=[55000, 65000]),  # Set y-axis limits
    # Force the x-axis to follow the bucket order defined above
    xaxis=dict(categoryorder='array', categoryarray=ordered_categories),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)'
)

fig_salary_study_hours.show()


## 2) - Degree Satisfaction by Department:
> Display the distribution of 'Do you like your degree?' across different departments


In [18]:
# Degree satisfaction within each department, as percentages

# Number of students for every (department, degree_happy) pair
degree_happiness_count = (
    df.groupby(['department', 'degree_happy'])
      .size()
      .reset_index(name='count')
)

# Headcount of each department, as a two-column frame ready for merging
total_count_per_department = df['department'].value_counts().reset_index()
total_count_per_department.columns = ['department', 'total_count']

# Attach the department headcount to every pair
degree_happiness_count = degree_happiness_count.merge(
    total_count_per_department,
    on='department',
)

# Share of the department in each answer, rounded to whole percent
pair_share = degree_happiness_count['count'] / degree_happiness_count['total_count']
degree_happiness_count['percentage'] = (pair_share * 100).round()

# Departments as rows, answers as columns
pivot_degree_happy = degree_happiness_count.pivot_table(
    index='department',
    columns='degree_happy',
    values='percentage',
)
pivot_degree_happy

degree_happy,Yes,No
department,Unnamed: 1_level_1,Unnamed: 2_level_1
B.com Accounting & Finance,93.0,7.0
B.com ISM,93.0,7.0
BCA,88.0,12.0
Commerce,98.0,2.0


In [38]:
# Degree Satisfaction Chart: stacked horizontal bars, one per department

# Order in which the departments are listed on the y-axis
satisfaction_department_order = ['Commerce','B.com Accounting & Finance', 'B.com ISM', 'BCA']

fig_degree_happiness = px.bar(
    degree_happiness_count,
    x='percentage',
    y='department',
    color='degree_happy',
    barmode='stack',
    title="Degree Satisfaction by Department",
    labels={
        'percentage': 'Percent',
        'degree_happy': 'Happy',
        'department': 'Department',
    },
)

# Titles, transparent backgrounds, centred heading, and the department order
fig_degree_happiness.update_layout(
    yaxis_title='Department',
    xaxis_title='Percent',
    legend_title='Satisfied?',
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    title=dict(y=0.95, x=0.5, xanchor='center', yanchor='top'),
    yaxis={
        'categoryorder': 'array',
        'categoryarray': satisfaction_department_order,
    },
)

fig_degree_happiness.show()

# Export the figure as HTML for the dashboard
fig_degree_happiness.write_html('../dashboard/plotly_html/q2_satisfaction_department.html')


**TODO:** Try a grouped bar chart by department, with one bar per happiness level (a stacked version is another option).  More EDA is needed.

In [20]:
# # Sentiment sunburst

# # Create a copy to avoid changing the original data
# df_sunburst = df

# # Map the boolean values and other categorical variables to more descriptive strings
# df_sunburst['degree_happy'] = df_sunburst['degree_happy'].map({
#     'Yes': 'Degree: Happy', 
#     'No': 'Degree: Unhappy'
# })

# # Mapping for stress_status and financial_status
# df_sunburst['stress_status'] = df_sunburst['stress_status'].map({
#     'Good': 'Stress: Good',
#     'Awful': 'Stress: Awful',
#     'Bad': 'Stress: Bad',
#     'Fabulous': 'Stress: Fabulous'
# })

# df_sunburst['financial_status'] = df_sunburst['financial_status'].map({
#     'Good': 'Finances: Good',
#     'Awful': 'Finances: Awful',
#     'Bad': 'Finances: Bad',
#     'Fabulous': 'Finances: Fabulous'
# })

# # Create a sunburst plot with Plotly Express
# fig = px.sunburst(
#     df_sunburst, 
#     path=['degree_happy', 'stress_status', 'financial_status'], 
#     title="Degree Satisfaction, Stress, and Financial Status by Department"
# )

# # Update layout for better readability
# fig.update_layout(
#     margin=dict(t=0, b=0, r=0, l=0),
# )

# # Show the plot
# fig.show()

#path=['department', 'degree_happy', 'stress_status', 'financial_status']

## 3) -  Hobbies and Part-time Jobs:
>  Visualize the percentage of candidates with a 'part-time job' based on their hobbies

In [21]:
# Part-time job status within each hobby, as percentages

# Number of students for every (hobby, job status) pair
job_hobby_count = (
    df.groupby(['hobbies', 'job_pt'])
      .size()
      .reset_index(name='count')
)

# Headcount of each hobby, as a two-column frame ready for merging
total_count_per_hobby = df['hobbies'].value_counts().reset_index()
total_count_per_hobby.columns = ['hobbies', 'total_count']

# Attach the hobby headcount to every pair
job_hobby_count = job_hobby_count.merge(total_count_per_hobby, on='hobbies')

# Share of the hobby group in each job status, rounded to whole percent
pair_share = job_hobby_count['count'] / job_hobby_count['total_count']
job_hobby_count['percentage'] = (pair_share * 100).round()

# Job status as rows, hobbies as columns
pivot_job_hobby = job_hobby_count.pivot_table(
    index='job_pt',
    columns='hobbies',
    values='percentage',
)
pivot_job_hobby

hobbies,Movies,Reading,Sports,Gaming
job_pt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Yes,14.0,25.0,19.0,14.0
No,86.0,75.0,81.0,86.0


In [40]:
# Part-time jobs by hobby: stacked horizontal bars, one per hobby

# Order in which the hobbies are listed on the y-axis
hobby_job_order = ['Movies', 'Gaming', 'Sports', 'Reading']

fig_job_hobby = px.bar(
    job_hobby_count,
    x='percentage',
    y='hobbies',
    color='job_pt',
    barmode='stack',
    title="Part-Time Jobs and Hobbies",
    labels={
        'percentage': 'Percent',
        'job_pt': 'Job',
        'hobbies': 'Hobby',
    },
)

# Titles, transparent backgrounds, centred heading, and the hobby order
fig_job_hobby.update_layout(
    xaxis_title='Percent',
    yaxis_title='Hobby',
    legend_title='Job',
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    title=dict(y=0.95, x=0.5, xanchor='center', yanchor='top'),
    yaxis={
        'categoryorder': 'array',
        'categoryarray': hobby_job_order,
    },
)
fig_job_hobby.show()

# Export the figure as HTML for the dashboard
fig_job_hobby.write_html('../dashboard/plotly_html/q3_job_hobby.html')




## 4) - Financial Status and Part-time Jobs:
>  Display the distribution of 'Financial Status' and the percentage of candidates with a 'part-time job'

In [23]:
# Part-time job status within each financial status, as counts and percentages

# Calculating the count of each part-time job status within each financial status
job_financial_count = df.groupby(['financial_status', 'job_pt']).size().reset_index(name='count')

# Calculating the total count per financial status
total_count_per_financial_status = df['financial_status'].value_counts().reset_index()
total_count_per_financial_status.columns = ['financial_status', 'total_count']

# Merging the dataframes to have count and total count in the same dataframe
job_financial_count = job_financial_count.merge(total_count_per_financial_status, on='financial_status')

# Calculating the percentage (rounded to whole percent)
job_financial_count['percentage'] = round((job_financial_count['count'] / job_financial_count['total_count']) * 100)

# Creating pivot table.
# `values` is given as a list: a set literal has no defined order, so the
# column selection would not be deterministic.
job_finances_pivot = job_financial_count.pivot_table(values=['count', 'percentage'],
                                      index=['financial_status', 'job_pt'],
                                      aggfunc='mean')
job_finances_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,count,percentage
financial_status,job_pt,Unnamed: 2_level_1,Unnamed: 3_level_1
Fabulous,Yes,2,50.0
Fabulous,No,2,50.0
Good,Yes,18,14.0
Good,No,111,86.0
Bad,Yes,17,19.0
Bad,No,71,81.0
Awful,Yes,4,29.0
Awful,No,10,71.0


In [41]:
# Part-time jobs by financial status: stacked horizontal bars, one per status

# Order in which the financial statuses are listed on the y-axis
finance_jobs_order = ['Awful', 'Bad','Good', 'Fabulous']

fig_job_financial = px.bar(
    job_financial_count,
    x='percentage',
    y='financial_status',
    color='job_pt',
    barmode='stack',
    title="Financial Status and Part-Time Jobs",
    labels={
        'percentage': 'Percent',
        'job_pt': 'Job',
        'financial_status': 'Finances',
    },
)

# Titles, transparent backgrounds, centred heading, and the status order
fig_job_financial.update_layout(
    xaxis_title='Percent',
    yaxis_title='Financial Status',
    legend_title='Part-Time',
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    title=dict(y=0.95, x=0.5, xanchor='center', yanchor='top'),
    yaxis={
        'categoryorder': 'array',
        'categoryarray': finance_jobs_order,
    },
)
fig_job_financial.show()

# Export the figure as HTML for the dashboard
fig_job_financial.write_html('../dashboard/plotly_html/q4_job_financial.html')

## 5) - Degree Satisfaction:
>  Show the percentage of candidates who like their degree and are willing to pursue a career based on their degree


In [25]:
# Calculating the count of each career interest level within each degree happiness level
degree_happy_count = df.groupby(['degree_happy', 'career_in_degree']).size().reset_index(name='count')

# Calculating the total count per degree happiness level
total_count_per_happiness = df['degree_happy'].value_counts().reset_index()
total_count_per_happiness.columns = ['degree_happy', 'total_count']

# Merging the dataframes to have count and total count in the same dataframe
degree_happy_count = degree_happy_count.merge(total_count_per_happiness, on='degree_happy')

# Calculating the percentage (rounded to whole percent)
degree_happy_count['percentage'] = round((degree_happy_count['count'] / degree_happy_count['total_count']) * 100)

# Making Pivot.
# `values` is given as a list: a set literal has no defined order, so the
# column selection would not be deterministic.
# NOTE(review): this rebinds pivot_degree_happy, which the department-level
# satisfaction table also uses - consider a distinct name.
pivot_degree_happy = degree_happy_count.pivot_table(values=['count', 'percentage'],
                                      index=['degree_happy', 'career_in_degree'],
                                      aggfunc='mean')
pivot_degree_happy

Unnamed: 0_level_0,Unnamed: 1_level_0,count,percentage
degree_happy,career_in_degree,Unnamed: 2_level_1,Unnamed: 3_level_1
Yes,Eager,42,20.0
Yes,Willing,110,51.0
Yes,Neutral,54,25.0
Yes,Reluctant,8,4.0
Yes,Unwilling,1,0.0
No,Eager,2,10.0
No,Willing,4,20.0
No,Neutral,7,35.0
No,Reluctant,6,30.0
No,Unwilling,1,5.0


In [26]:
# Degree Satisfaction Visual
# Order of the career-willingness levels (used for the colour/legend order)
persue_career_order = ['Unwilling','Reluctant', 'Neutral', 'Willing', 'Eager']
# Order of the Yes/No answers on the y-axis
degree_happy_order = ['No', 'Yes']

# Stacked horizontal bars: one bar per degree-happiness answer, split by
# career-willingness level. The willingness order is applied to the colour
# dimension through category_orders; the y-axis only holds Yes/No.
fig = px.bar(degree_happy_count, x='percentage', y='degree_happy',
             color='career_in_degree',
             category_orders={'career_in_degree': persue_career_order},
             title="Distribution of Candidates by Degree Happiness and Career Pursuit",
             labels={'percentage': 'Percentage of Candidates',
                     'career_in_degree': 'Likelihood of Career in Degree',
                     'degree_happy': 'Degree Happiness'},
             barmode='relative')

# Update layout and labels
fig.update_layout(xaxis_title='Percent',
                  yaxis_title='Degree Satisfaction',
                  legend_title='Career in Degree',
                  paper_bgcolor='rgba(0,0,0,0)',
                  plot_bgcolor='rgba(0,0,0,0)',
                  title={'y':0.95,
                         'x':0.5,
                         'xanchor': 'center',
                         'yanchor': 'top'},
                  yaxis=dict(categoryorder='array',
                             categoryarray=degree_happy_order))
fig.show()

In [27]:
# Happiness with degree and eagerness to pursue a career in that degree
# (willingness collapsed into Likely / Neutral / Unlikely)
happy_likely_neutral_map = {
    'Eager': 'Likely',
    'Willing': 'Likely',
    'Neutral': 'Neutral',
    'Reluctant': 'Unlikely',
    'Unwilling': 'Unlikely'
}

# Work on a copy: a plain assignment would only alias degree_happy_count, and
# the relabelling below would then overwrite that table's labels as well.
happy_likely_neutral = degree_happy_count.copy()

# Apply mapping. The column is cast to plain objects first so that several
# labels can collapse into one without going through categorical replace.
# Labels without an entry in the map are left unchanged.
happy_likely_neutral['career_in_degree'] = (
    happy_likely_neutral['career_in_degree']
    .astype(object)
    .replace(happy_likely_neutral_map)
)

# Finding counts of likely, neutral and unlikely categories
# NOTE(review): percentages are summed from already-rounded values, and
# total_count is summed too (not meaningful after this step).
happy_likely_neutral = happy_likely_neutral.groupby(['degree_happy', 'career_in_degree']).sum().reset_index()

# Creating pivot (`values` as a list: a set literal has no defined order)
pivot_likely_neutral = happy_likely_neutral.pivot_table(values=['count', 'percentage'],
                                      index=['degree_happy', 'career_in_degree'],
                                      aggfunc='mean')
pivot_likely_neutral

Unnamed: 0_level_0,Unnamed: 1_level_0,count,percentage
degree_happy,career_in_degree,Unnamed: 2_level_1,Unnamed: 3_level_1
Yes,Likely,152,71.0
Yes,Neutral,54,25.0
Yes,Unlikely,9,4.0
No,Likely,6,30.0
No,Neutral,7,35.0
No,Unlikely,7,35.0


In [70]:
# Shared styling: transparent backgrounds and one colour per career bucket
paper_bg = 'rgba(0,0,0,0)'
plot_bg = 'rgba(0,0,0,0)'
color_list = ['#2ee58b', '#dd25ac', '#a6ee67']

# Order of the Yes/No answers on the y-axis
happy_likely_order = ['No', 'Yes']

# Stacked horizontal bars: one per degree-happiness answer, split by career bucket
fig_happy_likely_1 = px.bar(
    happy_likely_neutral,
    x='percentage',
    y='degree_happy',
    color='career_in_degree',
    barmode='relative',
    color_discrete_sequence=color_list,
    title="Degree Happiness and Career Pursuit",
    labels={
        'percentage': 'Percent',
        'career_in_degree': 'Career in Degree',
        'degree_happy': 'Happy With Degree',
    },
)

# Titles, backgrounds, and the answer order
fig_happy_likely_1.update_layout(
    xaxis_title='Percent',
    yaxis_title='Degree Satisfaction',
    legend_title='Career in Degree',
    paper_bgcolor=paper_bg,
    plot_bgcolor=plot_bg,
    yaxis={
        'categoryorder': 'array',
        'categoryarray': happy_likely_order,
    },
)
fig_happy_likely_1.show()

# Export the figure as HTML for the dashboard
fig_happy_likely_1.write_html('../dashboard/plotly_html/q5_happy_likely_1.html')

In [29]:
# Happiness with degree and eagerness to pursue a career in that degree
# (willingness collapsed into Likely / Unlikely; Neutral counts as Likely)
happy_likely_binary_map = {
    'Eager': 'Likely',
    'Willing': 'Likely',
    'Neutral': 'Likely',
    'Reluctant': 'Unlikely',
    'Unwilling': 'Unlikely'
}

# Work on a copy: a plain assignment would only alias degree_happy_count, and
# the relabelling below would then overwrite that table's labels as well.
happy_likely_binary = degree_happy_count.copy()

# Apply mapping. The column is cast to plain objects first so that several
# labels can collapse into one without going through categorical replace.
# Labels without an entry in the map are left unchanged.
happy_likely_binary['career_in_degree'] = (
    happy_likely_binary['career_in_degree']
    .astype(object)
    .replace(happy_likely_binary_map)
)

# Finding counts of likely and unlikely categories
# NOTE(review): percentages are summed from already-rounded values, and
# total_count is summed too (not meaningful after this step).
happy_likely_binary = happy_likely_binary.groupby(['degree_happy', 'career_in_degree']).sum().reset_index()

# Creating pivot (`values` as a list: a set literal has no defined order)
pivot_likely_binary = happy_likely_binary.pivot_table(values=['count', 'percentage'],
                                      index=['degree_happy', 'career_in_degree'],
                                      aggfunc='mean')
pivot_likely_binary

Unnamed: 0_level_0,Unnamed: 1_level_0,count,percentage
degree_happy,career_in_degree,Unnamed: 2_level_1,Unnamed: 3_level_1
Yes,Likely,206,96.0
Yes,Unlikely,9,4.0
No,Likely,13,65.0
No,Unlikely,7,35.0


In [57]:
# Stacked horizontal bars: one per degree-happiness answer, split into
# Likely / Unlikely to pursue a career in the degree
fig_happy_likely_2 = px.bar(happy_likely_binary, x='percentage', y='degree_happy',
             color='career_in_degree',
             title="Distribution of Candidates by Degree Happiness and Career Pursuit",
             labels={'percentage': 'Percentage of Candidates',
                     'career_in_degree': 'Likelihood of Career in Degree',
                     'degree_happy': 'Degree Happiness'},
             barmode='relative')

# Order of the Yes/No answers on the y-axis
happy_likely_order = ['No', 'Yes']

# Update layout and labels
fig_happy_likely_2.update_layout(xaxis_title='Percent',
                  yaxis_title='Degree Satisfaction',
                  legend_title='Career in Degree',
                  paper_bgcolor='rgba(0,0,0,0)',
                  plot_bgcolor='rgba(0,0,0,0)',
                  title={'y':0.95,
                         'x':0.5,
                         'xanchor': 'center',
                         'yanchor': 'top'},
                  yaxis=dict(categoryorder='array',
                             categoryarray=happy_likely_order))

# Show the figure built in this cell (not a leftover `fig` from an earlier cell)
fig_happy_likely_2.show()

# Saving figure as html
fig_happy_likely_2.write_html('../dashboard/plotly_html/q5_happy_likely_2.html')

## 6) - Department-wise Analysis:
>  Compare the average '10th Mark', '12th Mark', and 'college mark' for different departments

In [31]:
# Mean 10th, 12th and college mark for each department
mark_columns = ['mark_10th', 'mark_12th', 'mark_college']
avg_marks = (
    df.groupby('department')[mark_columns]
      .mean()
      .reset_index()
)

# Reshape wide -> long (one row per department and exam) for plotting
avg_marks_long = pd.melt(
    avg_marks,
    id_vars='department',
    value_vars=mark_columns,
    var_name='exam',
    value_name='average_mark',
)

In [32]:

# Grouped horizontal bars: one bar per exam level within each department
fig = px.bar(avg_marks_long, y='department', x='average_mark', color='exam',
             title="Average 10th, 12th, and College Marks by Department",
             labels={'average_mark': 'Average Mark',
                     'exam': 'Exam',
                     'department': 'Department'},
             barmode='group')

# Axis titles follow the orientation: marks run along x, departments along y
fig.update_layout(xaxis_title='Average Mark',
                  yaxis_title='Department',
                  legend_title='Mark')

fig.show()

## 7) - **BONUS**
> **Supplementing** the provided prompts, include any **additional charts** or **visualizations** that shed light on their *inclination to pursue a career aligned with their degree* and their *level of satisfaction with the chosen degree*

In [33]:
# df.to_parquet('../data/behavior.parquet')
# df.to_pickle('../data/behavior.pkl')
# df.to_csv('../data/behavior.csv', index=False)

In [34]:
## Keep to the basics.  Funky stuff can be fun, but stick with basics.
############# use bubble plot if there's time for funk
## Get data in good shape.

-----------
# Auto EDA
_________

In [35]:
# D-Tale: open the cleaned frame in D-Tale for interactive exploration
dtale.show(df)







In [36]:
#sv_report = sweetviz.analyze(df)

In [37]:
#sv_report.show_html('sv_report.html')