# Project data preprocessing
---



##### In this notebook, we will explore data preprocessing techniques using the student_habits_performance  dataset from Kaggle.
##### We will go through the following steps:
##### 1.Explore the dataset by:
 * Viewing random samples of data.
 *      Identifying the total number of rows and columns.
##### 2.Handle missing values by:
 *      Calculating the percentage of missing data.
 *      Deciding and implementing a method for handling missing values (e.g., filling or dropping).
##### 3.Identify and remove duplicate rows.
---

In [1]:
# Import necessary libraries
import pandas as pd

### Load the student_habits_performance dataset

In [29]:
df = pd.read_csv("student_habits_performance1.csv")


### Save the updated dataset back to the CSV file


In [69]:
df.to_csv("student_habits_performance1.csv", index=False)

### Viewing random samples of data


In [68]:
df.sample(10)

Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
335,S1335,24,1,6.5,3.5,3.9,0,100.0,7.4,0,5,0,1,7,1,100.0
213,S1213,23,1,1.1,3.1,3.8,0,88.3,6.8,2,6,1,0,6,0,45.5
338,S1338,20,0,5.0,3.4,1.6,1,80.9,6.0,2,2,1,1,1,1,67.7
113,S1113,22,1,3.0,2.5,2.6,0,83.1,5.5,1,0,1,1,4,0,51.3
810,S1810,19,1,4.4,0.2,1.4,0,81.2,5.4,2,1,0,1,10,0,80.7
207,S1207,20,0,2.1,0.5,2.0,0,80.9,7.3,0,3,1,2,6,1,62.3
782,S1782,24,0,3.9,3.3,1.1,1,90.6,8.6,1,0,0,2,3,0,56.0
948,S1948,22,0,3.0,3.0,1.6,0,70.2,6.7,1,2,2,2,7,0,63.2
539,S1539,18,1,3.8,4.1,1.6,0,100.0,7.2,2,4,0,2,6,1,81.0
605,S1605,19,1,2.2,3.5,1.8,0,100.0,3.3,2,6,1,2,8,0,67.6


### Identifying the total number of rows and columns

In [31]:
df.shape

(1000, 16)

### Calculating the percentage of missing data and handling missing values

In [47]:
df['parental_education_level'].fillna(df['parental_education_level'].mode()[0], inplace=True)

In [48]:
df.isnull().mean() * 100

student_id                       0.0
age                              0.0
gender                           0.0
study_hours_per_day              0.0
social_media_hours               0.0
netflix_hours                    0.0
part_time_job                    0.0
attendance_percentage            0.0
sleep_hours                      0.0
diet_quality                     0.0
exercise_frequency               0.0
parental_education_level         0.0
internet_quality                 0.0
mental_health_rating             0.0
extracurricular_participation    0.0
exam_score                       0.0
dtype: float64

### Identify and remove duplicate rows

In [33]:
df.duplicated().sum()
df = df.drop_duplicates()

## Data Objects and Attribute Types





#### Identifying and storing the names of DataFrame columns that contain categorical values.

In [34]:
nominal = [col for col in df.columns if df[col].dtype == 'object']


#### Identifying and storing the names of columns that have exactly two unique values.

In [35]:
binary = [col for col in df.columns if df[col].nunique() == 2]

#### Identifying and storing the names of columns with categories that follow a meaningful order

In [61]:
ordinal = ["diet_quality", "internet_quality", "parental_education_level"]

#### Convert 'gender' column values to numeric

In [67]:
df["gender"] = df["gender"].replace({"Female": 0, "Male": 1, "Other": 2}).astype(int)

#### Converting 'internet_quality', 'parental_education_level', 'diet_quality' columns to numeric 

In [66]:
df["internet_quality"] = df["internet_quality"].replace({"Poor": 0, "Average": 1, "Good": 2}).astype(int)

In [64]:
df["parental_education_level"] = df["parental_education_level"].replace({"High School": 0, "Bachelor": 1, "Master": 2}).astype(int)

In [65]:
df["diet_quality"] = df["diet_quality"].replace({"Poor": 0, "Fair": 1, "Good": 2}).astype(int)

#### Convert all 'Yes' values to 1 and 'No' values to 0 in the dataset

In [63]:
df = df.replace({"Yes": 1, "No": 0})

## Basic Statistics with NumPy

In [70]:
import numpy as np

#### We will do statistical calculations, such as:
- Mean - average value
- Median - middle value
- Standard deviation - spread of data from the mean (square root of variance)
- Variance - (standard deviation)^2
- Minimum - the smallest value
- Maximum - the biggest value 
- Sum - addition of all values 


#### Statistical calculations with 'age'

In [73]:
mean_age = np.mean(df['age'])
print("Mean:", round(mean_age))

median_age = np.median(df['age'])
print("Median:", round(median_age))

std_dev_age = np.std(df['age'], ddof=1)
print("Std Dev:", round(std_dev_age))

variance_age = np.var(df['age'])
print("Variance:", round(variance_age))

min_age = np.min(df['age'])
print("Min:", min_age)

max_age = np.max(df['age'])
print("Max:", max_age)

sum_age = np.sum(df['age'])
print("Sum:", sum_age)

Mean: 20
Median: 20
Std Dev: 2
Variance: 5
Min: 17
Max: 24
Sum: 20498


#### Statistical calculations with 'study_hours_per_day'

In [80]:
mean_study = np.mean(df['study_hours_per_day'])
print("Mean:", round(mean_study, 1))

median_study = np.median(df['study_hours_per_day'])
print("Median:", median_study)

std_dev_study = np.std(df['study_hours_per_day'], ddof=1)
print("Std Dev:", round(std_dev_study, 1))

variance_study = np.var(df['study_hours_per_day'])
print("Variance:", round(variance_study, 1))

min_study = np.min(df['study_hours_per_day'])
print("Min:", min_study)

max_study = np.max(df['study_hours_per_day'])
print("Max:", max_study)

sum_study = np.sum(df['study_hours_per_day'])
print("Sum:", round(sum_study, 1))

Mean: 3.6
Median: 3.5
Std Dev: 1.5
Variance: 2.2
Min: 0.0
Max: 8.3
Sum: 3550.1


#### Statistical calculations with 'social_media_hours'

In [82]:
mean_media = np.mean(df['social_media_hours'])
print("Mean:", round(mean_media, 1))

median_media = np.median(df['social_media_hours'])
print("Median:", median_media)

std_dev_media = np.std(df['social_media_hours'], ddof=1)
print("Std Dev:", round(std_dev_media, 1))

variance_media = np.var(df['social_media_hours'])
print("Variance:", round(variance_media, 1))

min_media = np.min(df['social_media_hours'])
print("Min:", min_media)

max_media = np.max(df['social_media_hours'])
print("Max:", max_media)

sum_media = np.sum(df['social_media_hours'])
print("Sum:", sum_media)

Mean: 2.5
Median: 2.5
Std Dev: 1.2
Variance: 1.4
Min: 0.0
Max: 7.2
Sum: 2505.5


#### Statistical calculations with 'netflix_hours'

In [83]:
mean_netflix = np.mean(df['netflix_hours'])
print("Mean:", round(mean_netflix, 1))

median_netflix = np.median(df['netflix_hours'])
print("Median:", median_netflix)

std_dev_netflix = np.std(df['netflix_hours'], ddof=1)
print("Std Dev:", round(std_dev_netflix, 1))

variance_netflix = np.var(df['netflix_hours'])
print("Variance:", round(variance_netflix, 1))

min_netflix = np.min(df['netflix_hours'])
print("Min:", min_netflix)

max_netflix = np.max(df['netflix_hours'])
print("Max:", max_netflix)

sum_netflix = np.sum(df['netflix_hours'])
print("Sum:", sum_netflix)

Mean: 1.8
Median: 1.8
Std Dev: 1.1
Variance: 1.2
Min: 0.0
Max: 5.4
Sum: 1819.7


#### Statistical calculations with 'attendance_percentage'

In [None]:
mean_netflix = np.mean(df['netflix_hours'])
print("Mean:", round(mean_netflix, 1))

median_netflix = np.median(df['netflix_hours'])
print("Median:", median_netflix)

std_dev_netflix = np.std(df['netflix_hours'], ddof=1)
print("Std Dev:", round(std_dev_netflix, 1))

variance_netflix = np.var(df['netflix_hours'])
print("Variance:", round(variance_netflix, 1))

min_netflix = np.min(df['netflix_hours'])
print("Min:", min_netflix)

max_netflix = np.max(df['netflix_hours'])
print("Max:", max_netflix)

sum_netflix = np.sum(df['netflix_hours'])
print("Sum:", sum_netflix)

Mean: 1.8
Median: 1.8
Std Dev: 1.1
Variance: 1.2
Min: 0.0
Max: 5.4
Sum: 1819.7


## Correlations between the variables 

In [None]:
print(df[['age', 'gender', 'study_hours_per_day', 'social_media_hours', 'netflix_hours', 'part_time_job', 'attendance_percentage', 
          'sleep_hours', 'diet_quality', 'exercise_frequency', 'parental_education_level', 'internet_quality', 'mental_health_rating', 
          'extracurricular_participation', 'exam_score'
          ]].corr())

In [103]:
df[df.columns[:9]].describe()

Unnamed: 0,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.498,0.561,3.5501,2.5055,1.8197,0.215,84.1317,6.4701
std,2.3081,0.574987,1.46889,1.172422,1.075118,0.411028,9.399246,1.226377
min,17.0,0.0,0.0,0.0,0.0,0.0,56.0,3.2
25%,18.75,0.0,2.6,1.7,1.0,0.0,78.0,5.6
50%,20.0,1.0,3.5,2.5,1.8,0.0,84.4,6.5
75%,23.0,1.0,4.5,3.3,2.525,0.0,91.025,7.3
max,24.0,2.0,8.3,7.2,5.4,1.0,100.0,10.0


In [100]:
df[df.columns[8:]].describe()

Unnamed: 0,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,6.4701,1.193,3.042,0.684,1.285,5.438,0.318,69.6015
std,1.226377,0.72545,2.025423,0.742088,0.726845,2.847501,0.465932,16.888564
min,3.2,0.0,0.0,0.0,0.0,1.0,0.0,18.4
25%,5.6,1.0,1.0,0.0,1.0,3.0,0.0,58.475
50%,6.5,1.0,3.0,1.0,1.0,5.0,0.0,70.5
75%,7.3,2.0,5.0,1.0,2.0,8.0,1.0,81.325
max,10.0,2.0,6.0,2.0,2.0,10.0,1.0,100.0


In [105]:
df.describe()

Unnamed: 0,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.498,0.561,3.5501,2.5055,1.8197,0.215,84.1317,6.4701,1.193,3.042,0.684,1.285,5.438,0.318,69.6015
std,2.3081,0.574987,1.46889,1.172422,1.075118,0.411028,9.399246,1.226377,0.72545,2.025423,0.742088,0.726845,2.847501,0.465932,16.888564
min,17.0,0.0,0.0,0.0,0.0,0.0,56.0,3.2,0.0,0.0,0.0,0.0,1.0,0.0,18.4
25%,18.75,0.0,2.6,1.7,1.0,0.0,78.0,5.6,1.0,1.0,0.0,1.0,3.0,0.0,58.475
50%,20.0,1.0,3.5,2.5,1.8,0.0,84.4,6.5,1.0,3.0,1.0,1.0,5.0,0.0,70.5
75%,23.0,1.0,4.5,3.3,2.525,0.0,91.025,7.3,2.0,5.0,1.0,2.0,8.0,1.0,81.325
max,24.0,2.0,8.3,7.2,5.4,1.0,100.0,10.0,2.0,6.0,2.0,2.0,10.0,1.0,100.0
