# Student Data Case Study - Summary


In [1]:
## Libraries

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

In [5]:
## Import data

In [7]:
# Location of the student CSV that the rest of the notebook analyses.
url = 'https://raw.githubusercontent.com/DUanalytics/datasets/master/csv/studentdata.csv'
# Read the CSV straight from the URL into the `student` DataFrame used by every later cell.
student = pd.read_csv(url)
# Column names, non-null counts and dtypes (see the printed summary below).
student.info()
# Plain-text preview of the first rows.
print(student.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   rollno  100 non-null    int64 
 1   name    100 non-null    object
 2   dob     100 non-null    object
 3   gender  100 non-null    object
 4   course  100 non-null    object
 5   batch   100 non-null    int64 
 6   EXCEL   100 non-null    int64 
 7   MATHS   100 non-null    int64 
 8   PYTHON  100 non-null    int64 
 9   RPGM    100 non-null    int64 
 10  STATS   100 non-null    int64 
 11  age     100 non-null    int64 
dtypes: int64(8), object(4)
memory usage: 9.5+ KB
   rollno name         dob gender course  batch  EXCEL  MATHS  PYTHON  RPGM  \
0       1  XYZ  1994-01-01      M  BSCDS   2016     87     92      79    42   
1       2  XYZ  1994-01-08      M  BSCDS   2017     96    102      89    77   
2       3  XYZ  1994-01-15      M  BSCDS   2016     87    133      89    61   
3       4  XYZ  1994-01-22

In [18]:
# 1 column, 1 stats
print(student['STATS'].min())
print(student['STATS'].max())
# mean(), std(), var() are called the same way
# 2 columns, 1 stats
print(student[['STATS','PYTHON']].std())

# 2 columns, 3 stats
# A {column: [stat names]} spec is an aggregation, so use .agg() -- the
# documented entry point for it -- rather than .apply(), which is meant for
# callables. The resulting table (one row per statistic) is the same.
student[["PYTHON", 'STATS']].agg({"PYTHON": ["mean", "median", "max"],
                                  "STATS": ["mean", "median", "max"]})

91
135
STATS      9.504592
PYTHON    10.897131
dtype: float64


Unnamed: 0,PYTHON,STATS
mean,75.4,110.81
median,75.5,110.0
max,107.0,135.0


## Group by


In [20]:
# Split the rows by gender; on its own this only displays the GroupBy object, not any data.
student.groupby(by='gender')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001B1153F74C0>

In [23]:
# Mapping of each gender value to the row labels that fall into that group.
student.groupby(by='gender').groups

{'F': [4, 13, 15, 25, 27, 28, 38, 39, 59, 60, 71, 80, 85, 89, 91], 'M': [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 29, 30, 31, 32, 33, 34, 35, 36, 37, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, 76, 77, 78, 79, 81, 82, 83, 84, 86, 87, 88, 90, 92, 93, 94, 95, 96, 97, 98, 99]}

In [24]:
# Pull out only the rows belonging to the 'M' group.
student.groupby(by='gender').get_group('M')

Unnamed: 0,rollno,name,dob,gender,course,batch,EXCEL,MATHS,PYTHON,RPGM,STATS,age
0,1,XYZ,1994-01-01,M,BSCDS,2016,87,92,79,42,92,23
1,2,XYZ,1994-01-08,M,BSCDS,2017,96,102,89,77,104,23
2,3,XYZ,1994-01-15,M,BSCDS,2016,87,133,89,61,99,23
3,4,XYZ,1994-01-22,M,BSCDS,2017,90,82,71,67,100,23
5,6,XYZ,1994-02-05,M,BSCDS,2017,74,127,68,77,116,23
...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,XYZ,1995-10-28,M,BTCSDA,2017,97,105,64,87,114,21
96,97,XYZ,1995-11-04,M,BTCSDA,2016,69,127,57,73,127,21
97,98,XYZ,1995-11-11,M,BTCSDA,2017,45,108,77,65,113,21
98,99,XYZ,1995-11-18,M,BTCSDA,2016,73,105,81,74,115,21


In [25]:
# Gender grouping, built once and reused by the aggregation cells below
genderGp = student.groupby(by='gender')

In [26]:
# Per-gender totals of the numeric columns only.
# numeric_only=True makes that intent explicit: without it, recent pandas
# versions no longer drop the text columns (name, dob, course) and instead
# "sum" them as well, which for strings means concatenation.
genderGp.sum(numeric_only=True)

Unnamed: 0_level_0,rollno,batch,EXCEL,MATHS,PYTHON,RPGM,STATS,age
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,739,30250,1144,1560,1174,1018,1642,329
M,4311,171400,6930,8709,6366,5901,9439,1871


In [32]:
# Mean and total of EXCEL and age for each gender (one column pair per statistic).
genderGp[['EXCEL', 'age']].agg(['mean', 'sum'])


Unnamed: 0_level_0,EXCEL,EXCEL,age,age
Unnamed: 0_level_1,mean,sum,mean,sum
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
F,76.266667,1144,21.933333,329
M,81.529412,6930,22.011765,1871


In [33]:
# A different statistic per column: total EXCEL marks and average age, by gender.
genderGp.agg({'EXCEL': 'sum', 'age': 'mean'})

Unnamed: 0_level_0,EXCEL,age
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1144,21.933333
M,6930,22.011765


Unnamed: 0_level_0,rollno,name,dob,course,batch,EXCEL,MATHS,PYTHON,RPGM,STATS,age
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
F,5,XYZ,1994-01-29,BSCDS,2016,63,102,83,51,108,23
M,1,XYZ,1994-01-01,BSCDS,2016,87,92,79,42,92,23


In [12]:
# Number of STATS entries recorded for each gender.
print(student.groupby(by='gender')['STATS'].count())

gender
F    15
M    85
Name: STATS, dtype: int64


In [11]:
# Mean, median and max of the PYTHON and STATS marks in one table.
# The {column: [stat names]} spec is an aggregation, so it goes through
# .agg() -- the documented entry point -- instead of .apply(), which is
# meant for callables. The resulting table is the same.
student[["PYTHON", 'STATS']].agg(
    {
        "PYTHON": ["mean", "median", "max"],
        "STATS": ["mean", "median", "max"],
    }
)

Unnamed: 0,PYTHON,STATS
mean,75.4,110.81
median,75.5,110.0
max,107.0,135.0


## First N items 

In [35]:
# First entry of every column within each gender group.
# Rows are taken in their order of occurrence in `student`; nothing is sorted first.
genderGp.first()   # first entry per column in each group, in order of occurrence

Unnamed: 0_level_0,rollno,name,dob,course,batch,EXCEL,MATHS,PYTHON,RPGM,STATS,age
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
F,5,XYZ,1994-01-29,BSCDS,2016,63,102,83,51,108,23
M,1,XYZ,1994-01-01,BSCDS,2016,87,92,79,42,92,23


In [42]:
# groupby
#df.sort_values('B').groupby('A').first()

In [None]:
# To get the full first row of each group, including any missing values, use nth(0)
#df.sort_values('B').groupby('A').nth(0)

In [None]:
#df.sort_values('B', ascending=True).drop_duplicates('A', keep='first')
#Alternative: sort first, then keep the first row per key using .drop_duplicates() with its keep argument set to 'first' (the default).
#This approach has the benefit that it keeps your index.

In [40]:
#if you want order to be preserved
#student.groupby(['gender', 'batch'], sort=True).agg({'age': 'mean'})
# Rank of every student's age within their own gender group. The result keeps
# the original row index; tied ages share one rank, which is why fractional
# values such as 10.5 appear in the output.
student.groupby('gender')['age'].rank()
#student[ student.groupby('gender')['age'].rank() == 2 ]

0     75.0
1     75.0
2     75.0
3     75.0
4     14.0
      ... 
95    10.5
96    10.5
97    10.5
98    10.5
99    10.5
Name: age, Length: 100, dtype: float64

## Rank

In [None]:
# Template: keep the rows whose 'B' value has rank 1 within their 'A' group.
# The three lines differ only in the tie-handling `method` passed to rank().
# NOTE(review): `df` is a placeholder that is not defined in the visible cells --
# substitute a real frame (and real column names) before running.
df[ df.groupby('A')['B'].rank(method='average') == 1 ]   # the default
df[ df.groupby('A')['B'].rank(method='min')     == 1 ]
df[ df.groupby('A')['B'].rank(method='first')   == 1 ]  

## Duplicates

In [None]:
# Keep a single row (the first occurrence) for every distinct value in column 'A'.
df.drop_duplicates(subset=['A'], keep='first')

## Sort

In [None]:
# Sort the frame by column 'A', modifying it in place.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# NOTE(review): `df` is a placeholder that is not defined in the visible cells.
df.sort_values('A', inplace=True)

## Column Order
https://sparkbyexamples.com/pandas/pandas-change-the-order-of-columns/

In [None]:
#change_column = ['P','Q','R','S']
#df = df.reindex(columns=change_column)

## Group by: split-apply-combine
https://pandas.pydata.org/docs/user_guide/groupby.html