# Student Behaviour Analysis for the Career Advancement and Engagement Department

In [1]:
# Libraries
import pandas as pd
import numpy as np
import plotly as plt
import os
# Never truncate long cell contents when frames are displayed
pd.options.display.max_colwidth = None

In [2]:
# Read the 'Student_Behaviour' sheet (note the British spelling) using pandas.
# The workbook is opened in binary mode because a 'utf-8' codec start-byte
# error was hit otherwise. The context manager guarantees the file handle is
# closed once the sheet has been parsed instead of leaking it.
with open('../data/assessment.xlsx', "rb") as workbook:
    df = pd.read_excel(workbook, sheet_name='Student_Behaviour')
                   

## Cleaning Data

### Making column names consistent and concise.

In [3]:
# Map the raw spreadsheet headers to short snake_case names.
# Keys reproduce the source headers exactly (including their typos and
# trailing spaces); a key that does not match a header is ignored by rename.
# The mapping is deliberately NOT called `dict`: that would shadow the builtin
# for every later cell in the kernel session.
column_renames = {
    'Certification Course' : "cert_course",
    'Gender' : 'gender',
    'Department': 'dept',
    '10th Mark': 'mark_10th',
    '12th Mark': 'mark_12th',
    'college mark': 'mark_college',
    'hobbies': 'hobbies',
    'daily studing time' : 'study_dtime',
    'prefer to study in': 'study_loc',
    'salary expectation': 'salary_expected',
    'Do you like your degree?' : 'degree_satisfaction',
    'willingness to pursue a career based on their degree  ' : 'prob_degree_career',
    'social medai & video' : 'social_video_dtime',
    'Travelling Time ' : 'travel_dtime',
    'Stress Level ' : 'stress_lvl',
    'Financial Status' : 'financial_status',
    'part-time job' : 'part_time_job'}

# Reassign rather than mutate in place so the cell is safe to re-run.
df = df.rename(columns=column_renames)

In [4]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,cert_course,gender,dept,mark_10th,mark_12th,mark_college,hobbies,study_dtime,study_loc,salary_expected,degree_satisfaction,prob_degree_career,social_video_dtime,travel_dtime,stress_lvl,financial_status,part_time_job
0,No,Male,BCA,76.0,70.0,67.0,Sports,1 - 2 Hour,Morning,55100,No,0.5,1 - 1.30 hour,30 - 60 minutes,Good,good,False
1,Yes,Male,BCA,75.0,57.0,55.0,Sports,0 - 30 minute,Morning,55100,Yes,0.5,1.30 - 2 hour,0 - 30 minutes,Good,good,False
2,Yes,Male,B.com ISM,67.0,70.0,60.0,Sports,More Than 4 hour,Anytime,55100,Yes,1.0,1 - 30 Minute,0 - 30 minutes,Good,good,False
3,Yes,Male,BCA,89.0,69.0,80.0,Video game,3 - 4 hour,Morning,55100,Yes,0.75,1 - 1.30 hour,more than 3 hour,Good,good,True
4,No,Male,BCA,80.0,70.0,60.0,Video Games,0 - 30 minute,Anytime,55200,Yes,0.25,1 - 30 Minute,0 - 30 minutes,Good,good,False


In [5]:
# Per-column overview: dtype, non-null count, distinct values and missing data
missing_counts = df.isnull().sum()
row_total = df.shape[0]

column_summary = {
    "Data Type": df.dtypes,
    "Total Values": df.count(),
    "Unique Values": df.nunique(),
    "Missing Values": missing_counts,
    "Missing %": missing_counts / row_total * 100,
}
df_info = pd.DataFrame(column_summary)
df_info

Unnamed: 0,Data Type,Total Values,Unique Values,Missing Values,Missing %
cert_course,object,235,2,0,0.0
gender,object,235,2,0,0.0
dept,object,235,5,0,0.0
mark_10th,float64,235,68,0,0.0
mark_12th,float64,235,67,0,0.0
mark_college,float64,235,39,0,0.0
hobbies,object,235,5,0,0.0
study_dtime,object,235,6,0,0.0
study_loc,object,235,3,0,0.0
salary_expected,int64,235,84,0,0.0


### Ensuring Correct Data Types

**categorical variables**

| Variable              | Unique Values                           |
|-----------------------|--------------------------------------------------|
| `gender`              | 2                                                |
| `dept`                | 5                                                |
| `hobbies`             | 5                                                |
| `study_dtime`         | 6                                                |
| `study_loc`           | 3                                                |
| `social_video_dtime`  | 6                                                |
| `travel_dtime`        | 7                                                |
| `stress_lvl`          | 4                                                |
| `financial_status`    | 4                                                |

In [6]:
# Columns holding a small set of repeated labels -> pandas 'category' dtype
cat_convert = [
        'gender',
        'dept',
        'hobbies',
        'study_dtime',
        'study_loc',
        'social_video_dtime',
        'travel_dtime',
        'stress_lvl',
        'financial_status'
        ]

# Yes/No answer columns -> real booleans
bool_convert = [
        'cert_course',
        'degree_satisfaction'
        ]
bool_map = {
        'Yes': True,
        'No': False
        }

# Applying conversion lists
df[cat_convert] = df[cat_convert].astype('category')

# NOTE: astype('bool') on string labels uses Python truthiness, so every
# non-empty string (including 'No') would become True. Translate the labels
# through bool_map instead.
for col in bool_convert:
    if pd.api.types.is_bool_dtype(df[col]):
        continue  # already converted; keeps the cell safe to re-run
    labels = df[col].astype(str).str.strip()  # tolerate stray whitespace
    converted = labels.map(bool_map)
    unmapped = labels[converted.isna()].unique().tolist()
    if unmapped:
        # Fail loudly (this also catches missing values, which become 'nan')
        # rather than silently coercing unknown answers to True/False.
        raise ValueError(f"Unexpected values in '{col}': {unmapped}")
    df[col] = converted.astype(bool)

In [11]:

# List the distinct labels of every categorical column to spot inconsistent
# spellings
category_cols = df.select_dtypes('category').columns
label_rows = [
    {'Category': name, 'Labels': df[name].unique().tolist()}
    for name in category_cols
]
unique_values_df = pd.DataFrame(label_rows, columns=['Category', 'Labels'])
unique_values_df

Unnamed: 0,Category,Labels
0,gender,"[Male, Female]"
1,dept,"[BCA, B.com ISM, Commerce, B.com Accounting & Finance , B.com Accounting and Finance ]"
2,hobbies,"[Sports, Video game, Video Games, Cinema, Reading books]"
3,study_dtime,"[1 - 2 Hour, 0 - 30 minute, More Than 4 hour, 3 - 4 hour, 30 - 60 minute, 2 - 3 hour]"
4,study_loc,"[Morning, Anytime, Night]"
5,social_video_dtime,"[1 - 1.30 hour, 1.30 - 2 hour, 1 - 30 Minute, 0 Minute, 30 - 60 Minute, More than 2 hour]"
6,travel_dtime,"[30 - 60 minutes, 0 - 30 minutes, more than 3 hour, 1 - 1.30 hour, 2 - 2.30 hour, 1.30 - 2 hour, 2.30 - 3 hour]"
7,stress_lvl,"[Good, Awful, Bad, fabulous]"
8,financial_status,"[good, Bad, Fabulous, Awful]"


## Salary Expectations:
> Show Median and Average Salary expectations by department

## Degree Satisfaction by Department:
> Display the distribution of 'Do you like your degree?' across different departments


## Hobbies and Part-time Jobs:
>  Visualize the percentage of candidates with a 'part-time job' based on their hobbies


## Financial Status and Part-time Jobs:
>  Display the distribution of 'Financial Status' and the percentage of candidates with a 'part-time job'


## Degree Satisfaction:
>  Show the percentage of candidates who like their degree and are willing to pursue a career based on their degree


## Department-wise Analysis:
>  Compare the average '10th Mark', '12th Mark', and 'college mark' for different departments

In [9]:
# df.to_csv('../data/behavior.csv', index=False)