<a href="https://www.kaggle.com/code/ahmedanwar89/students-academic-performance-dataset-eda?scriptVersionId=149700472" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Import Dataset

In [2]:
# Location of the xAPI-Edu-Data CSV on the Kaggle input mount; kept in one
# named constant so it is the only thing to change when the file lives elsewhere.
DATA_PATH = '/kaggle/input/xAPI-Edu-Data/xAPI-Edu-Data.csv'

performance = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the freshly loaded frame.
performance.head(5)

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M


# Data Cleaning

In [4]:
# Inspect the raw column names: check spelling/casing and look for stray whitespace.

performance.columns

Index(['gender', 'NationalITy', 'PlaceofBirth', 'StageID', 'GradeID',
       'SectionID', 'Topic', 'Semester', 'Relation', 'raisedhands',
       'VisITedResources', 'AnnouncementsView', 'Discussion',
       'ParentAnsweringSurvey', 'ParentschoolSatisfaction',
       'StudentAbsenceDays', 'Class'],
      dtype='object')

In [5]:
# Rename the raw columns to snake_case.
# An explicit old -> new mapping is used instead of assigning a positional list
# to `.columns`, so a change in the CSV's column order cannot silently attach a
# name to the wrong column. Labels missing from the frame are ignored by
# `rename`, so re-running this cell after the rename is a no-op.
# ('gender' is already in the target form and therefore not listed.)

COLUMN_RENAMES = {
    'NationalITy': 'nationality',
    'PlaceofBirth': 'place_of_birth',
    'StageID': 'stage_id',
    'GradeID': 'grade_id',
    'SectionID': 'section_id',
    'Topic': 'topic',
    'Semester': 'semester',
    'Relation': 'relation',
    'raisedhands': 'raised_hands',
    'VisITedResources': 'visited_resources',
    'AnnouncementsView': 'announcements_view',
    'Discussion': 'discussion',
    'ParentAnsweringSurvey': 'parent_answering_survey',
    'ParentschoolSatisfaction': 'parent_school_satisfaction',
    'StudentAbsenceDays': 'student_absence_days',
    'Class': 'class',
}

performance = performance.rename(columns=COLUMN_RENAMES)

In [6]:
# Confirm the rename took effect: every column name should now be snake_case.

performance.columns

Index(['gender', 'nationality', 'place_of_birth', 'stage_id', 'grade_id',
       'section_id', 'topic', 'semester', 'relation', 'raised_hands',
       'visited_resources', 'announcements_view', 'discussion',
       'parent_answering_survey', 'parent_school_satisfaction',
       'student_absence_days', 'class'],
      dtype='object')

In [7]:
# Preview two rows to see the data under the new column names.
performance.head(2)

Unnamed: 0,gender,nationality,place_of_birth,stage_id,grade_id,section_id,topic,semester,relation,raised_hands,visited_resources,announcements_view,discussion,parent_answering_survey,parent_school_satisfaction,student_absence_days,class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M


In [8]:
# Check the dtype pandas inferred for each column (text columns show up as object).

performance.dtypes

gender                        object
nationality                   object
place_of_birth                object
stage_id                      object
grade_id                      object
section_id                    object
topic                         object
semester                      object
relation                      object
raised_hands                   int64
visited_resources              int64
announcements_view             int64
discussion                     int64
parent_answering_survey       object
parent_school_satisfaction    object
student_absence_days          object
class                         object
dtype: object

In [9]:
# Look for fully duplicated rows: report whether any exist and how many there are.

duplicate_rows = performance.duplicated()
duplicate_rows.any(), duplicate_rows.sum()

(True, 2)

In [10]:
# Remove the duplicated rows, keeping the first occurrence of each.
# The surviving rows keep their original index labels.

performance = performance.drop_duplicates()

In [11]:
# Re-run the duplicate check: after the drop it should report no duplicates and a count of 0.

remaining_duplicates = performance.duplicated()
remaining_duplicates.any(), remaining_duplicates.sum()

(False, 0)

In [12]:
# Look for missing values: for each column, whether any value is null and how many are.

null_mask = performance.isnull()
null_mask.any(), null_mask.sum()

(gender                        False
 nationality                   False
 place_of_birth                False
 stage_id                      False
 grade_id                      False
 section_id                    False
 topic                         False
 semester                      False
 relation                      False
 raised_hands                  False
 visited_resources             False
 announcements_view            False
 discussion                    False
 parent_answering_survey       False
 parent_school_satisfaction    False
 student_absence_days          False
 class                         False
 dtype: bool,
 gender                        0
 nationality                   0
 place_of_birth                0
 stage_id                      0
 grade_id                      0
 section_id                    0
 topic                         0
 semester                      0
 relation                      0
 raised_hands                  0
 visited_resources         

In [13]:
# Validity check for the text (object-dtype) columns: count the distinct values
# in each one so unexpected extra categories stand out.

text_columns = performance.select_dtypes(include='object')
text_columns.nunique()

gender                         2
nationality                   14
place_of_birth                14
stage_id                       3
grade_id                      10
section_id                     3
topic                         12
semester                       2
relation                       2
parent_answering_survey        2
parent_school_satisfaction     2
student_absence_days           2
class                          3
dtype: int64

In [14]:
# Print each text (object-dtype) column's name followed by its distinct values.
# The object columns are selected once and iterated by label, rather than
# re-running select_dtypes several times per iteration to index by position.

for column in performance.select_dtypes(include='object').columns:
    print(column)
    print(performance[column].unique())

gender
['M' 'F']
nationality
['KW' 'lebanon' 'Egypt' 'SaudiArabia' 'USA' 'Jordan' 'venzuela' 'Iran'
 'Tunis' 'Morocco' 'Syria' 'Palestine' 'Iraq' 'Lybia']
place_of_birth
['KuwaIT' 'lebanon' 'Egypt' 'SaudiArabia' 'USA' 'Jordan' 'venzuela' 'Iran'
 'Tunis' 'Morocco' 'Syria' 'Iraq' 'Palestine' 'Lybia']
stage_id
['lowerlevel' 'MiddleSchool' 'HighSchool']
grade_id
['G-04' 'G-07' 'G-08' 'G-06' 'G-05' 'G-09' 'G-12' 'G-11' 'G-10' 'G-02']
section_id
['A' 'B' 'C']
topic
['IT' 'Math' 'Arabic' 'Science' 'English' 'Quran' 'Spanish' 'French'
 'History' 'Biology' 'Chemistry' 'Geology']
semester
['F' 'S']
relation
['Father' 'Mum']
parent_answering_survey
['Yes' 'No']
parent_school_satisfaction
['Good' 'Bad']
student_absence_days
['Under-7' 'Above-7']
class
['M' 'L' 'H']


In [15]:
# Replace abbreviated or inconsistently cased category labels with readable ones.
# Outer key = column name, inner dict = old label -> new label. Labels that are
# not listed are left untouched, so re-running the cell changes nothing.
# NOTE(review): 'venzuela' and 'Lybia' (nationality / place_of_birth) are still
# misspelled after this step - confirm whether they should be corrected as well.

LABEL_FIXES = {
    'nationality': {'KW': 'Kuwait', 'lebanon': 'Lebanon'},
    'place_of_birth': {'KuwaIT': 'Kuwait', 'lebanon': 'Lebanon'},
    'gender': {'M': 'Male', 'F': 'Female'},
    'semester': {'F': 'First', 'S': 'Second'},
    'class': {'M': 'Middle', 'L': 'Low', 'H': 'High'},
}

# One vectorized replace per column instead of a Python loop that reads and
# writes a single cell through .loc for every row.
for column, label_map in LABEL_FIXES.items():
    performance[column] = performance[column].replace(label_map)

In [16]:
# Check again: print each text (object-dtype) column's name followed by its
# distinct values, to confirm the relabelling above took effect.
# The object columns are selected once and iterated by label, rather than
# re-running select_dtypes several times per iteration to index by position.

for column in performance.select_dtypes(include='object').columns:
    print(column)
    print(performance[column].unique())

gender
['Male' 'Female']
nationality
['Kuwait' 'Lebanon' 'Egypt' 'SaudiArabia' 'USA' 'Jordan' 'venzuela' 'Iran'
 'Tunis' 'Morocco' 'Syria' 'Palestine' 'Iraq' 'Lybia']
place_of_birth
['Kuwait' 'Lebanon' 'Egypt' 'SaudiArabia' 'USA' 'Jordan' 'venzuela' 'Iran'
 'Tunis' 'Morocco' 'Syria' 'Iraq' 'Palestine' 'Lybia']
stage_id
['lowerlevel' 'MiddleSchool' 'HighSchool']
grade_id
['G-04' 'G-07' 'G-08' 'G-06' 'G-05' 'G-09' 'G-12' 'G-11' 'G-10' 'G-02']
section_id
['A' 'B' 'C']
topic
['IT' 'Math' 'Arabic' 'Science' 'English' 'Quran' 'Spanish' 'French'
 'History' 'Biology' 'Chemistry' 'Geology']
semester
['First' 'Second']
relation
['Father' 'Mum']
parent_answering_survey
['Yes' 'No']
parent_school_satisfaction
['Good' 'Bad']
student_absence_days
['Under-7' 'Above-7']
class
['Middle' 'Low' 'High']


In [17]:
# Check for outliers: one box plot per non-object column, laid out side by side
# in a single row, each subplot with its own y-axis (shared_yaxes=False).

# Select the non-object columns once instead of on every loop iteration.
numeric_columns = performance.select_dtypes(exclude='object')

fig = make_subplots(
    rows=1,
    cols=len(numeric_columns.columns),
    shared_yaxes=False,
)

# Subplot grid positions are 1-based, hence start=1.
for position, column in enumerate(numeric_columns.columns, start=1):
    fig.add_trace(
        go.Box(
            y=numeric_columns[column],
            name=column,
            boxpoints='suspectedoutliers',
        ),
        row=1,
        col=position,
    )

fig.show()

# Data Analysis

In [18]:
# Heatmap of the pairwise correlation coefficients between the numeric columns.
# numeric_only=True already restricts corr() to numeric columns, so no separate
# dtype filtering is needed.
correlations = performance.corr(numeric_only=True)

# aspect takes 'equal', 'auto' or None; 'auto' lets the cells stretch to fill
# the 800x400 figure instead of being forced square.
px.imshow(
    correlations,
    text_auto=True,
    color_continuous_scale='Blues',
    aspect='auto',
    title='Correlation Coefficient Between Numerical Data',
    height=400,
    width=800,
)

# Relations  
**visited_resources and raised_hands are positively correlated (r = 0.69)**  
**announcements_view and raised_hands are positively correlated (r = 0.64)**  
**visited_resources and announcements_view are positively correlated (r = 0.59)**


In [19]:
# Visited resources (x) against raised hands (y), one point per student, with
# an OLS trendline and a histogram of each variable along the margins.

px.scatter(
    performance,
    x='visited_resources',
    y='raised_hands',
    trendline='ols',
    marginal_x='histogram',
    marginal_y='histogram',
    title='Relation Between Num of Times Student Visted Resources & Num of Times Student Raised His Hand',
)

# Insights  
**students most often visited resources between 80 and 89 times**  
**students most often raised their hands between 10 and 19 times**  
**the number of raised hands tends to increase as the number of visited resources increases**

In [20]:
# Announcement views (x) against raised hands (y), one point per student, with
# an OLS trendline and a histogram of each variable along the margins.

px.scatter(
    performance,
    x='announcements_view',
    y='raised_hands',
    trendline='ols',
    marginal_x='histogram',
    marginal_y='histogram',
    title='Relation Between Num of Times Student Viewed Announcements & Num of Times Student Raised His Hand',
)

# Insights  
**students most often viewed announcements between 10 and 14 times**  
**students most often raised their hands between 10 and 19 times**  
**the number of raised hands tends to increase as the number of announcement views increases**

In [21]:
# Visited resources (x) against announcement views (y), coloured by gender and
# with marker size scaled by raised hands; marginal histograms on both axes.

px.scatter(
    performance,
    x='visited_resources',
    y='announcements_view',
    size='raised_hands',
    color='gender',
    marginal_x='histogram',
    marginal_y='histogram',
    title='relation between visited resourses times & announcements view times regarding to the gender of student',
)

# Insights  
**male students most often visited resources between 80 and 89 times**  
**female students most often visited resources between 80 and 89 times**  
**male students most often viewed announcements between 10 and 14 times**  
**female students most often viewed announcements between 50 and 54 times**

In [22]:
# Visited resources (x) against announcement views (y), coloured by nationality
# and with marker size scaled by raised hands.

px.scatter(
    performance,
    x='visited_resources',
    y='announcements_view',
    size='raised_hands',
    color='nationality',
    title='relation between visited resourses times & announcements view times regarding to the nationality of student',
)

# Insights  
**the highest announcement-view counts were recorded by Saudi Arabian students, with 100 raised hands**  
**the highest visited-resources counts were recorded by Kuwaiti and Jordanian students, with 70 and 85 raised hands respectively**

In [23]:
# Visited resources (x) against announcement views (y), coloured by nationality,
# marker size scaled by raised hands, and split into one panel per gender.

px.scatter(
    performance,
    x='visited_resources',
    y='announcements_view',
    size='raised_hands',
    color='nationality',
    facet_col='gender',
    title='relation between visited resourses times & announcements view times regarding to the gender of student & his nationality',
)

# Insights  
**among male students, the highest announcement-view count was recorded by a Kuwaiti student, with 72 raised hands and 80 visited resources**  
**among male students, the highest visited-resources count was recorded by a Palestinian student, with 90 raised hands and 41 announcement views**  
**among female students, the highest announcement-view count was recorded by a Saudi Arabian student, with 100 raised hands and 91 visited resources**  
**among female students, the highest visited-resources count was recorded by a Jordanian student, with 85 raised hands and 42 announcement views**

In [24]:
# Number of students per gender, drawn as one bar per gender and coloured by
# the gender label taken from the grouped frame's index.

gender_counts = performance.groupby('gender').agg({'gender': 'count'})

px.bar(
    gender_counts,
    color=gender_counts.index,
    text_auto=True,
    title='count of students for each gender',
    height=400,
    width=800,
)

# Insights  
**there are more male students than female students**

In [25]:
# Number of students per nationality, split by gender and drawn as grouped bars.
# place_of_birth serves only as the column whose entries are counted.

students_by_nationality = performance.pivot_table(
    index='nationality',
    columns='gender',
    values='place_of_birth',
    aggfunc='count',
)

px.bar(
    students_by_nationality,
    barmode='group',
    title='count of students for each gender regarding to nationality',
    text_auto=True,
    height=400,
)

# Insights  
**the most common student nationalities are Kuwaiti and Jordanian**

In [26]:
# Visited resources (x) against announcement views (y), coloured by nationality
# and with marker size scaled by raised hands, laid out as a grid with one
# column per gender and one row per topic (hence the tall 6000px figure).

px.scatter(
    performance,
    x='visited_resources',
    y='announcements_view',
    size='raised_hands',
    color='nationality',
    facet_col='gender',
    facet_row='topic',
    title='relation between visited resourses times & announcements view times regarding to the gender of student & his nationality & topic',
    height=6000,
)