In [1]:
import pandas as pd
import numpy as np

In [2]:
# load the students' exam results from the csv file into a Data Frame
df_exams = pd.read_csv(filepath_or_buffer='StudentsPerformance.csv')
# preview the first five rows
df_exams.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


## Select One Column from a Data Frame

### Option One

In [5]:
# preferred way to select a column: [] with the column name as a string
col1 = df_exams['gender']

In [6]:
# check type, index and head of the selected column
# a single column selected with [] is returned as a pandas Series
type(col1)

pandas.core.series.Series

In [7]:
# inspect the row labels (index) of the selected column
col1.index

RangeIndex(start=0, stop=1000, step=1)

In [8]:
# preview the first five values of the selected column
col1.head()

0    female
1    female
2    female
3      male
4      male
Name: gender, dtype: object

### Option Two

In [10]:
# not recommended way to select a column: attribute access with a . (dot notation)
col2 = df_exams.gender
col2.head()

0    female
1    female
2    female
3      male
4      male
Name: gender, dtype: object

In [11]:
# dot notation fails if there is a space in the column's name:
# the line below is not valid Python, so it is left commented out
# col2 = df_exams.math score

In [61]:
# select a column with a space in its name the proper way, with []
col2 = df_exams['math score']
col2.head()

0    72
1    69
2    90
3    47
4    76
Name: math score, dtype: int64

## Select Two or More Columns from a Data Frame

In [14]:
# select more than one column by passing a list of column names inside []
cols = df_exams[['gender', 'math score', 'lunch']]
cols.head()

Unnamed: 0,gender,math score,lunch
0,female,72,standard
1,female,69,standard
2,female,90,standard
3,male,47,free/reduced
4,male,76,standard


In [15]:
# checking type of the selection: several columns are returned as a DataFrame
type(cols)

pandas.core.frame.DataFrame

In [16]:
# selecting more than one column is not possible with . (dot notation)

## Add a new Column to the Data Frame

### Add a new Column with a Scalar Value

In [19]:
# assigning a scalar to a new column name creates the column
# and fills every row with that value
df_exams['language score'] = 70
df_exams.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,70
1,female,group C,some college,standard,completed,69,90,88,70
2,female,group B,master's degree,standard,none,90,95,93,70
3,male,group A,associate's degree,free/reduced,none,47,57,44,70
4,male,group C,some college,standard,none,76,78,75,70


### Add a new Column with an Array

In [21]:
# build an array holding the 1000 integers 0, 1, ..., 999
language_score = np.arange(1000)

In [22]:
# check its length: one element per row of the Data Frame
len(language_score)

1000

In [23]:
# assign the array to the 'language score' column of the Data Frame
# the column already exists at this point, so its values are overwritten
df_exams['language score'] = language_score
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,0
1,female,group C,some college,standard,completed,69,90,88,1
2,female,group B,master's degree,standard,none,90,95,93,2
3,male,group A,associate's degree,free/reduced,none,47,57,44,3
4,male,group C,some college,standard,none,76,78,75,4
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,995
996,male,group C,high school,free/reduced,none,62,55,55,996
997,female,group C,high school,free/reduced,completed,59,71,65,997
998,female,group D,some college,standard,completed,68,78,77,998


In [24]:
# create an array of 1000 random integers from 1 to 99
# (the upper bound passed to randint is exclusive, so 100 itself is never drawn)
int_language_score = np.random.randint(low=1, high=100, size=1000)

In [25]:
# check min/max elements
# use the ndarray reductions rather than Python's built-in min()/max(),
# which would iterate over the array one element at a time
print(int_language_score.min())
print(int_language_score.max())

1
99


In [26]:
# overwrite the 'language score' column with the random integer array
df_exams['language score'] = int_language_score
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,35
1,female,group C,some college,standard,completed,69,90,88,90
2,female,group B,master's degree,standard,none,90,95,93,20
3,male,group A,associate's degree,free/reduced,none,47,57,44,71
4,male,group C,some college,standard,none,76,78,75,68
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,95
996,male,group C,high school,free/reduced,none,62,55,55,30
997,female,group C,high school,free/reduced,completed,59,71,65,55
998,female,group D,some college,standard,completed,68,78,77,59


In [27]:
# create 10 random float numbers between 1 and 10
np.random.uniform(1, 10, size=10)

array([3.74997732, 5.78856075, 7.3490708 , 6.60196432, 1.9198916 ,
       7.26575942, 4.95784058, 7.04535079, 6.96545996, 5.14647815])

### Add a new Column with assign()

When to use assign()?

    - Add multiple columns in a single line of code.
    - Overwrite the values of an existing column (best practice).

It returns a new object (a copy) with all the original columns in addition to the new ones.

In [30]:
# draw two separate arrays of 1000 random integers each (lower bound 1,
# exclusive upper bound 100) to be inserted as columns in the Data Frame
score1 = np.random.randint(low=1, high=100, size=1000)
score2 = np.random.randint(low=1, high=100, size=1000)

In [31]:
# wrap each array in a Series labelled 0..999 so it lines up with the Data Frame rows
series1 = pd.Series(data=score1, index=np.arange(1000))
series2 = pd.Series(data=score2, index=np.arange(1000))

In [32]:
# add both Series as new columns in a single call; assign() returns a new
# Data Frame, so the result is bound back to df_exams
df_exams = df_exams.assign(reliability=series1, procrastination=series2)
df_exams

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,reliability,procrastination
0,female,group B,bachelor's degree,standard,none,72,72,74,35,95,11
1,female,group C,some college,standard,completed,69,90,88,90,59,43
2,female,group B,master's degree,standard,none,90,95,93,20,17,52
3,male,group A,associate's degree,free/reduced,none,47,57,44,71,85,58
4,male,group C,some college,standard,none,76,78,75,68,32,41
...,...,...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,95,67,82
996,male,group C,high school,free/reduced,none,62,55,55,30,90,83
997,female,group C,high school,free/reduced,completed,59,71,65,55,56,14
998,female,group D,some college,standard,completed,68,78,77,59,11,18


### Add a new Column with insert()

Inserts a new column at a specific position/index.

In [35]:
# add a new column at a specific position with insert to the Data Frame
# insert() modifies the Data Frame in place and raises a ValueError if the column
# already exists, so guard the call to keep this cell safe to re-run
if 'insert at index 1' not in df_exams.columns:
    df_exams.insert(1, 'insert at index 1', series1)
df_exams

Unnamed: 0,gender,insert at index 1,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,reliability,procrastination
0,female,95,group B,bachelor's degree,standard,none,72,72,74,35,95,11
1,female,59,group C,some college,standard,completed,69,90,88,90,59,43
2,female,17,group B,master's degree,standard,none,90,95,93,20,17,52
3,male,85,group A,associate's degree,free/reduced,none,47,57,44,71,85,58
4,male,32,group C,some college,standard,none,76,78,75,68,32,41
...,...,...,...,...,...,...,...,...,...,...,...,...
995,female,67,group E,master's degree,standard,completed,88,99,95,95,67,82
996,male,90,group C,high school,free/reduced,none,62,55,55,30,90,83
997,female,56,group C,high school,free/reduced,completed,59,71,65,55,56,14
998,female,11,group D,some college,standard,completed,68,78,77,59,11,18
