# Analysis of the "Student Habits vs Academic Performance" dataset from [Kaggle](https://www.kaggle.com/datasets/jayaantanaath/student-habits-vs-academic-performance)

In [None]:
!pip install "kagglehub[pandas-datasets]"

In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Fetch the Kaggle dataset; `path` is later joined with the CSV file name,
# so it is the directory that holds the downloaded files.
dataset_handle = "jayaantanaath/student-habits-vs-academic-performance"
path = kagglehub.dataset_download(dataset_handle)

In [16]:
import os

import pandas as pd

# Join the download directory and the CSV name with os.path.join instead of
# concatenating a hard-coded "/" so the separator is handled by the platform.
data_file = os.path.join(path, "student_habits_performance.csv")

# Load the whole dataset into a DataFrame and display it (pandas truncates
# the rendering of long frames automatically).
data = pd.read_csv(data_file)
data

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3
3,S1003,23,Female,1.0,3.9,1.0,No,71.0,9.2,Poor,4,Master,Good,1,Yes,26.8
4,S1004,19,Female,5.0,4.4,0.5,No,90.9,4.9,Fair,3,Master,Good,1,No,66.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,S1995,21,Female,2.6,0.5,1.6,No,77.0,7.5,Fair,2,High School,Good,6,Yes,76.1
996,S1996,17,Female,2.9,1.0,2.4,Yes,86.0,6.8,Poor,1,High School,Average,6,Yes,65.9
997,S1997,20,Male,3.0,2.6,1.3,No,61.9,6.5,Good,5,Bachelor,Good,9,Yes,64.4
998,S1998,24,Male,5.4,4.1,1.1,Yes,100.0,7.6,Fair,0,Bachelor,Average,1,No,69.7


In [18]:
# Report the frame dimensions as a (rows, columns) tuple.
print((len(data.index), len(data.columns)))

(1000, 16)


In [23]:
# List every column label of the dataset.
print(data.columns.tolist())

['student_id', 'age', 'gender', 'study_hours_per_day', 'social_media_hours', 'netflix_hours', 'part_time_job', 'attendance_percentage', 'sleep_hours', 'diet_quality', 'exercise_frequency', 'parental_education_level', 'internet_quality', 'mental_health_rating', 'extracurricular_participation', 'exam_score']


In [24]:
# Print a per-column summary (non-null counts and dtypes); useful for spotting
# columns with missing values before analysing them.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     1000 non-null   object 
 1   age                            1000 non-null   int64  
 2   gender                         1000 non-null   object 
 3   study_hours_per_day            1000 non-null   float64
 4   social_media_hours             1000 non-null   float64
 5   netflix_hours                  1000 non-null   float64
 6   part_time_job                  1000 non-null   object 
 7   attendance_percentage          1000 non-null   float64
 8   sleep_hours                    1000 non-null   float64
 9   diet_quality                   1000 non-null   object 
 10  exercise_frequency             1000 non-null   int64  
 11  parental_education_level       909 non-null    object 
 12  internet_quality               1000 non-null   ob

### Считаем распределение по полу

In [32]:
# Count how many students fall into each gender category.
data["gender"].value_counts()

gender
Female    481
Male      477
Other      42
Name: count, dtype: int64

### Выведем всех студентов с третьим полом

In [38]:
# Select the students whose gender is recorded as "Other".
others = data[data['gender'] == 'Other']

# Preview the first rows. The former bare `others.size` statement was removed:
# it was never displayed (only the last expression of a cell is) and it counts
# rows * columns, not students — use len(others) for the number of students.
others.head()

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
18,S1018,24,Other,2.0,4.9,2.9,Yes,88.3,7.1,Good,2,High School,Good,5,No,43.9
22,S1022,18,Other,4.9,2.3,0.6,No,84.5,6.0,Fair,3,High School,Average,7,No,98.7
31,S1031,17,Other,1.5,3.1,2.6,Yes,96.2,8.0,Fair,4,Bachelor,Average,3,Yes,51.3
32,S1032,20,Other,2.6,4.9,4.3,No,82.1,6.9,Good,3,Bachelor,Good,7,No,52.1
81,S1081,20,Other,3.2,3.2,5.0,No,100.0,6.7,Good,5,High School,Poor,2,No,57.6


### Кто больше спит?

In [50]:
# Compare sleep duration across genders: mean and standard deviation
# (pandas' default, i.e. the sample standard deviation).
all_genders = data.gender.unique()
for gender in all_genders:
    # Select this gender's sleep hours once with .loc and reuse the selection
    # for both statistics instead of filtering the frame twice per iteration.
    gender_sleep_hours = data.loc[data['gender'] == gender, 'sleep_hours']
    sleep_hour_average = gender_sleep_hours.mean()
    sleep_hour_std = gender_sleep_hours.std()
    print(f"{gender} sleeps: average = {sleep_hour_average:.2f} hours, with std = {sleep_hour_std:.2f} hours")

Female sleeps: average = 6.47 hours, with std = 1.22 hours
Male sleeps: average = 6.44 hours, with std = 1.25 hours
Other sleeps: average = 6.80 hours, with std = 0.93 hours


### Количество часов на развлечения (соцсети, Netflix) и оценка за экзамен

In [62]:
# Total daily entertainment time: social media plus Netflix hours.
data["wasted_time"] = data["social_media_hours"] + data["netflix_hours"]

# Show the new column next to the mental-health rating and the exam score.
columns_to_show = [
    'student_id',
    'wasted_time',
    'mental_health_rating',
    'exam_score',
]
data.loc[:, columns_to_show]

Unnamed: 0,student_id,wasted_time,mental_health_rating,exam_score
0,S1000,2.3,8,56.2
1,S1001,5.1,8,100.0
2,S1002,4.4,1,34.3
3,S1003,4.9,1,26.8
4,S1004,4.9,1,66.4
...,...,...,...,...
995,S1995,2.1,6,76.1
996,S1996,3.4,6,65.9
997,S1997,3.9,9,64.4
998,S1998,5.2,1,69.7


### Средний балл за экзамен и количество сдавших (> 75% от максимума)

In [65]:
# Fraction of the best observed score a student must exceed to count as passed.
pass_threshold = 0.75

average_score = data['exam_score'].mean()
max_score = data['exam_score'].max()

# A student passes when the score is strictly above the threshold share of the
# maximum score found in the dataset.
students_pass_exam = data[data['exam_score'] > pass_threshold * max_score]

# Report the two figures this section is about; previously the average was
# computed but never shown and the number of passing students was not reported.
print(f"Average exam score: {average_score:.2f}")
print(f"Students who passed (> {pass_threshold:.0%} of max score {max_score}): {len(students_pass_exam)}")
students_pass_exam

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score,wasted_time
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0,5.1
5,S1005,24,Male,7.2,1.3,0.0,No,82.9,7.4,Fair,1,Master,Average,4,No,100.0,1.3
6,S1006,21,Female,5.6,1.5,1.4,Yes,85.8,6.5,Good,2,Master,Poor,4,No,89.8,2.9
8,S1008,23,Female,4.4,2.2,1.7,No,100.0,7.1,Good,3,Bachelor,Good,1,No,78.9,3.9
9,S1009,18,Female,4.8,3.1,1.3,No,95.4,7.5,Good,5,Bachelor,Good,10,Yes,100.0,4.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
982,S1982,20,Female,3.8,2.4,2.2,Yes,91.5,8.6,Fair,2,High School,Good,10,No,88.0,4.6
984,S1984,24,Male,3.2,3.0,0.1,No,97.5,6.8,Fair,6,High School,Average,7,No,82.4,3.1
985,S1985,18,Male,5.7,3.1,0.0,Yes,86.6,5.7,Fair,2,Master,Good,3,Yes,80.9,3.1
991,S1991,20,Male,6.0,2.1,3.0,No,86.7,5.1,Good,2,High School,Good,3,No,85.3,5.1


In [67]:
# Keep only the students whose parental education level is recorded.
parental_education = data["parental_education_level"]
parental_education[parental_education.notna()]

0           Master
1      High School
2      High School
3           Master
4           Master
          ...     
995    High School
996    High School
997       Bachelor
998       Bachelor
999       Bachelor
Name: parental_education_level, Length: 909, dtype: object