# Student Behaviour Analysis for the Career Advancement and Engagement Department

In [1]:
# Libraries
import pandas as pd
import numpy as np
import plotly as plt
import os
pd.set_option('display.max_colwidth', None)

In [2]:
# Read in data using pandas
df = pd.read_excel(open('../data/assessment.xlsx', "rb"), #There was 'utf-8' codec start byte error.  Opening in binary mode 
                   sheet_name='Student_Behaviour') #note british spelling of sheet
                   

## Cleaning Data

### Making column names consistent and concise.

In [3]:
#get list of column names
#old_cols = df.columns.to_list

dict = {
    'Certification Course' : "cert_course",
    'Gender' : 'gender',
    'Department': 'department',
    '10th Mark': 'mark_10th',
    '12th Mark': 'mark_12th',
    'college mark': 'mark_college',
    'hobbies': 'hobbies',
    'daily studing time' : 'study_hr_day',
    'prefer to study in': 'study_window',
    'salary expectation': 'salary_expected',
    'Do you like your degree?' : 'degree_happy',
    'willingness to pursue a career based on their degree  ' : 'career_in_degree',
    'social medai & video' : 'social_video_hr_day',
    'Travelling Time ' : 'travel_hr_day',
    'Stress Level ' : 'stress_status',
    'Financial Status' : 'financial_status',
    'part-time job' : 'job_pt'}
df.rename(columns = dict,
          inplace=True)

In [4]:
#Looking at the head
df.head(n=5)

Unnamed: 0,cert_course,gender,department,mark_10th,mark_12th,mark_college,hobbies,study_hr_day,study_window,salary_expected,degree_happy,career_in_degree,social_video_hr_day,travel_hr_day,stress_status,financial_status,job_pt
0,No,Male,BCA,76.0,70.0,67.0,Sports,1 - 2 Hour,Morning,55100,No,0.5,1 - 1.30 hour,30 - 60 minutes,Good,good,False
1,Yes,Male,BCA,75.0,57.0,55.0,Sports,0 - 30 minute,Morning,55100,Yes,0.5,1.30 - 2 hour,0 - 30 minutes,Good,good,False
2,Yes,Male,B.com ISM,67.0,70.0,60.0,Sports,More Than 4 hour,Anytime,55100,Yes,1.0,1 - 30 Minute,0 - 30 minutes,Good,good,False
3,Yes,Male,BCA,89.0,69.0,80.0,Video game,3 - 4 hour,Morning,55100,Yes,0.75,1 - 1.30 hour,more than 3 hour,Good,good,True
4,No,Male,BCA,80.0,70.0,60.0,Video Games,0 - 30 minute,Anytime,55200,Yes,0.25,1 - 30 Minute,0 - 30 minutes,Good,good,False


In [5]:
# checking data types and missing values
df_info = pd.DataFrame({
    "data_type": df.dtypes,
    "total_values": df.count(),
    "unique_values": df.nunique(),
    "missing_values": df.isnull().sum()
    })
df_info

Unnamed: 0,data_type,total_values,unique_values,missing_values
cert_course,object,235,2,0
gender,object,235,2,0
department,object,235,5,0
mark_10th,float64,235,68,0
mark_12th,float64,235,67,0
mark_college,float64,235,39,0
hobbies,object,235,5,0
study_hr_day,object,235,6,0
study_window,object,235,3,0
salary_expected,int64,235,84,0


### Ensuring Correct Data Types

**categorical variables**

| Variable              | Unique Values                           |
|-----------------------|--------------------------------------------------|
| `gender`              | 2                                                |
| `dept`                | 5                                                |
| `hobbies`             | 5                                                |
| `study_dtime`         | 6                                                |
| `study_loc`           | 3                                                |
| `social_video_dtime`  | 6                                                |
| `travel_dtime`        | 7                                                |
| `stress_lvl`          | 4                                                |
| `financial_status`    | 4   

In [6]:
# Categorical conversions list
cat_convert = [
        'gender',
        'department',
        'hobbies',
        'study_hr_day',
        'study_window',
        'social_video_hr_day',
        'travel_hr_day',
        'stress_status',
        'financial_status'
        ]

# Boolean conversion list
bool_convert = [
        'cert_course',
        'degree_happy'
        ]
bool_map = {
        'Yes': True,
        'No': False
        }

# Applying conversion lists
df[cat_convert] = df[cat_convert].astype('category')
df[bool_convert] = df[bool_convert].astype('bool')

In [7]:
df.head(n=5)

Unnamed: 0,cert_course,gender,department,mark_10th,mark_12th,mark_college,hobbies,study_hr_day,study_window,salary_expected,degree_happy,career_in_degree,social_video_hr_day,travel_hr_day,stress_status,financial_status,job_pt
0,True,Male,BCA,76.0,70.0,67.0,Sports,1 - 2 Hour,Morning,55100,True,0.5,1 - 1.30 hour,30 - 60 minutes,Good,good,False
1,True,Male,BCA,75.0,57.0,55.0,Sports,0 - 30 minute,Morning,55100,True,0.5,1.30 - 2 hour,0 - 30 minutes,Good,good,False
2,True,Male,B.com ISM,67.0,70.0,60.0,Sports,More Than 4 hour,Anytime,55100,True,1.0,1 - 30 Minute,0 - 30 minutes,Good,good,False
3,True,Male,BCA,89.0,69.0,80.0,Video game,3 - 4 hour,Morning,55100,True,0.75,1 - 1.30 hour,more than 3 hour,Good,good,True
4,True,Male,BCA,80.0,70.0,60.0,Video Games,0 - 30 minute,Anytime,55200,True,0.25,1 - 30 Minute,0 - 30 minutes,Good,good,False


### Ensuring Correct Labeling of Categorical Data

Inconsistencies in categorical labels
- **dept** - `B.com Accounting & Finance` and `B.com Accounting and Finance`
- **hobbies** - `Video game` and `Video Games`
- **`*`_daily** time buckets need to be addressed to ensure uniformity
- **social_video_hr_day** - Incorporated `0 Minute` and `1 - 30 Minute` into `0 to 0.5` to maintain consistency across time buckets
    - 5 students who reported 0 minutes
    - 47 students reported 1-30 minutes

In [8]:
# Looking for unique values in categorical variables
unique_values_df = pd.DataFrame([(col, df[col].unique().tolist()) for col in df.select_dtypes('category').columns],
                                columns=['Category', 'Labels'])
# capatilize labels
unique_values_df

Unnamed: 0,Category,Labels
0,gender,"[Male, Female]"
1,department,"[BCA, B.com ISM, Commerce, B.com Accounting & Finance , B.com Accounting and Finance ]"
2,hobbies,"[Sports, Video game, Video Games, Cinema, Reading books]"
3,study_hr_day,"[1 - 2 Hour, 0 - 30 minute, More Than 4 hour, 3 - 4 hour, 30 - 60 minute, 2 - 3 hour]"
4,study_window,"[Morning, Anytime, Night]"
5,social_video_hr_day,"[1 - 1.30 hour, 1.30 - 2 hour, 1 - 30 Minute, 0 Minute, 30 - 60 Minute, More than 2 hour]"
6,travel_hr_day,"[30 - 60 minutes, 0 - 30 minutes, more than 3 hour, 1 - 1.30 hour, 2 - 2.30 hour, 1.30 - 2 hour, 2.30 - 3 hour]"
7,stress_status,"[Good, Awful, Bad, fabulous]"
8,financial_status,"[good, Bad, Fabulous, Awful]"


#### Deciding what to do with `social_video_daily` lower buckets

In [14]:
df[df.loc['social_video_hr_day'] == '0 Minute'].count()

KeyError: 'social_video_hr_day'

In [None]:
df[df['social_video_hr_day'] == '1 - 30 Minute'].count()

cert_course           47
gender                47
department            47
mark_10th             47
mark_12th             47
mark_college          47
hobbies               47
study_daily           47
study_time_pref       47
salary_expected       47
degree_happy          47
career_in_degree      47
social_video_daily    47
travel_daily          47
stress_status         47
financial_status      47
job_pt                47
dtype: int64

In [None]:
# Mapping dictionaries for renaming labels
study_daily_mapping = {
    '0 - 30 minute': '0 to 0.5',
    '30 - 60 minute': '0.5 to 1',
    '1 - 2 Hour': '1 to 2',
    '2 - 3 hour': '2 to 3',
    '3 - 4 hour': '3 to 4',
    'More Than 4 hour': '4+'
}

social_video_daily_mapping = {
    '0 Minute': '0',
    '1 - 30 Minute': '1 to 1.5',
    '30 - 60 Minute': '0.5 to 1',
    '1 - 1.30 hour': '1 to 1.5',
    '1.30 - 2 hour': '1.5 to 2',
    'More than 2 hour': '2+'
}

travel_daily_mapping = {
    '0 - 30 minutes': '0 to 0.5',
    '30 - 60 minutes': '0.5 to 1',
    '1 - 1.30 hour': '1 to 1.5',
    '1.30 - 2 hour': '1.5 to 2',
    '2 - 2.30 hour': '2 to 2.5',
    '2.30 - 3 hour': '2.5 to 3',
    'more than 3 hour': '3+'
}

In [None]:
# Applying Mapping Dictionaries
 

## Salary Expectations:
> Show Median and Average Salary expectations by department

## Degree Satisfaction by Department:
> Display the distribution of 'Do you like your degree?' across different departments


## Hobbies and Part-time Jobs:
>  Visualize the percentage of candidates with a 'part-time job' based on their hobbies


## Financial Status and Part-time Jobs:
>  Display the distribution of 'Financial Status' and the percentage of candidates with a 'part-time job'


## Degree Satisfaction:
>  Show the percentage of candidates who like their degree and are willing to pursue a career based on their degree


## Department-wise Analysis:
>  Compare the average '10th Mark', '12th Mark', and 'college mark' for different departments

## *BONUS
> **Supplementing** the provided prompts, include any **additional charts** or **visualizations** that shed light on their *inclination to pursue a career aligned with their degree* and their *level of satisfaction with the chosen degree*

In [None]:
# df.to_csv('../data/behavior.csv', index=False)