In [1]:
import pandas as pd

In [2]:
#reading the csv file
df_exams = pd.read_csv('StudentsPerformance.csv')
df_exams.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# 1 Selecting One Column

In [3]:
#select a colum with []
df_exams['gender']

0      female
1      female
2      female
3        male
4        male
        ...  
995    female
996      male
997    female
998    female
999    female
Name: gender, Length: 1000, dtype: object

In [None]:
#check the data type of a column
type(df_exams['gender'])

In [5]:
df_exams['gender'].index
df_exams['gender'].head()

0    female
1    female
2    female
3      male
4      male
Name: gender, dtype: object

In [None]:
#select a colum with . (pitfalls)
df_exams.math_score

In [6]:
#select the same column using []
df_exams['math score']

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

# Select two or more columns 

In [7]:
#select 2 column using [[]]
df_exams[['gender', 'math score']]

Unnamed: 0,gender,math score
0,female,72
1,female,69
2,female,90
3,male,47
4,male,76
...,...,...
995,female,88
996,male,62
997,female,59
998,female,68


In [11]:
#select 2 or more columns using [[]]
df_exams[['gender', 'writing score', 'math score', 'reading score']]

Unnamed: 0,gender,writing score,math score,reading score
0,female,74,72,72
1,female,88,69,90
2,female,93,90,95
3,male,44,47,57
4,male,75,76,78
...,...,...,...,...
995,female,95,88,99
996,male,55,62,55
997,female,65,59,71
998,female,77,68,78


# Adding a new column to a scalar value

In [12]:
# adding a new colum with a scalar value
df_exams['language score'] = 70
df_exams 

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,70
1,female,group C,some college,standard,completed,69,90,88,70
2,female,group B,master's degree,standard,none,90,95,93,70
3,male,group A,associate's degree,free/reduced,none,47,57,44,70
4,male,group C,some college,standard,none,76,78,75,70
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,70
996,male,group C,high school,free/reduced,none,62,55,55,70
997,female,group C,high school,free/reduced,completed,59,71,65,70
998,female,group D,some college,standard,completed,68,78,77,70


# Adding a new column with an array

In [13]:
import numpy as np

In [16]:
#create an array of 1000 elements
language_score = np.arange(0, 1000)

In [None]:
len(language_score)

In [17]:
df_exams['language score'] = language_score
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,0
1,female,group C,some college,standard,completed,69,90,88,1
2,female,group B,master's degree,standard,none,90,95,93,2
3,male,group A,associate's degree,free/reduced,none,47,57,44,3
4,male,group C,some college,standard,none,76,78,75,4
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,995
996,male,group C,high school,free/reduced,none,62,55,55,996
997,female,group C,high school,free/reduced,completed,59,71,65,997
998,female,group D,some college,standard,completed,68,78,77,998


In [19]:
#create random integer numbers between 1 and 100
int_language_score = np.random.randint(1, 100, size = 1000)

In [20]:
df_exams['language score'] = int_language_score
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,30
1,female,group C,some college,standard,completed,69,90,88,66
2,female,group B,master's degree,standard,none,90,95,93,63
3,male,group A,associate's degree,free/reduced,none,47,57,44,90
4,male,group C,some college,standard,none,76,78,75,77
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,51
996,male,group C,high school,free/reduced,none,62,55,55,89
997,female,group C,high school,free/reduced,completed,59,71,65,82
998,female,group D,some college,standard,completed,68,78,77,67


# 1 MATH Operations

## Operations in Columns

In [21]:
#select a column and calculate total
df_exams['math score'].sum()

66089

In [27]:
# count, mean, std, max, and min
df_exams['math score'].count()
df_exams['math score'].mean()
df_exams['math score'].std()
df_exams['math score'].max()
#df_exams['math score'].min()

100

In [28]:
#easier using .describe()
df_exams.describe()

Unnamed: 0,math score,reading score,writing score,language score
count,1000.0,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054,49.785
std,15.16308,14.600192,15.195657,28.492957
min,0.0,17.0,10.0,1.0
25%,57.0,59.0,57.75,25.0
50%,66.0,70.0,69.0,50.0
75%,77.0,79.0,79.0,74.0
max,100.0,100.0,100.0,99.0


## Operations in rows

In [29]:
# calculating the sum in a row
df_exams['math score'] + df_exams['reading score'] + df_exams['writing score']

0      218
1      247
2      278
3      148
4      229
      ... 
995    282
996    172
997    195
998    223
999    249
Length: 1000, dtype: int64

In [33]:
# calulating the average score
df_exams['average'] = (df_exams['math score'] + df_exams['reading score'] + df_exams['writing score'])/3
df_exams.round(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,average
0,female,group B,bachelor's degree,standard,none,72,72,74,30,72.67
1,female,group C,some college,standard,completed,69,90,88,66,82.33
2,female,group B,master's degree,standard,none,90,95,93,63,92.67
3,male,group A,associate's degree,free/reduced,none,47,57,44,90,49.33
4,male,group C,some college,standard,none,76,78,75,77,76.33
...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,51,94.00
996,male,group C,high school,free/reduced,none,62,55,55,89,57.33
997,female,group C,high school,free/reduced,completed,59,71,65,82,65.00
998,female,group D,some college,standard,completed,68,78,77,67,74.33


# Value Counts

In [34]:
# counting gender elements
#len function
len(df_exams['gender'])
# .count() method
df_exams['gender'].count()


1000

In [35]:
df_exams['gender'].value_counts()

gender
female    518
male      482
Name: count, dtype: int64

In [36]:
df_exams['gender'].value_counts(normalize=True)

gender
female    0.518
male      0.482
Name: proportion, dtype: float64

In [37]:
#counting "Parental level of education elements by category"
df_exams['parental level of education'].value_counts()

parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

In [38]:
df_exams['parental level of education'].value_counts(normalize=True).round(2)

parental level of education
some college          0.23
associate's degree    0.22
high school           0.20
some high school      0.18
bachelor's degree     0.12
master's degree       0.06
Name: proportion, dtype: float64