In [1]:
import pandas as pd
import numpy as np

In [2]:
# read the csv file and display the data frame
df_exams = pd.read_csv('StudentsPerformance.csv')
df_exams.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93


## Select One or More Columns from a Data Frame

In [3]:
# not recommended way to select a column with a .
# fails if there is a space in the column's name 
# or if you want to select multiple columns

# col = df_exams.math score

In [4]:
# preferred way to select a column with a []
col = df_exams['math score']

# check type, index and head
print(f'type - {type(col)}\n')
print(f'index - {col.index}\n')
print(f'head - \n{col.head(3)}')

type - <class 'pandas.core.series.Series'>

index - RangeIndex(start=0, stop=1000, step=1)

head - 
0    72
1    69
2    90
Name: math score, dtype: int64


In [5]:
# select more than one column
cols = df_exams[['gender', 'math score', 'lunch']]

# check type, index and head
print(f'type - {type(cols)}\n')
print(f'index - {cols.index}\n')
print(f'head - \n{cols.head(3)}')

type - <class 'pandas.core.frame.DataFrame'>

index - RangeIndex(start=0, stop=1000, step=1)

head - 
   gender  math score     lunch
0  female          72  standard
1  female          69  standard
2  female          90  standard


## Add a new Column to the Data Frame

### Scalar

In [6]:
# add a new column with a scalar value
df_exams['language score'] = 70
df_exams.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,70
1,female,group C,some college,standard,completed,69,90,88,70
2,female,group B,master's degree,standard,none,90,95,93,70


### Array

In [7]:
# add a new column with an array

# create an array with the numbers from range 0 to 1000
arr1 = np.arange(0,1000)
print(f'First array - length: {len(arr1)}, min: {min(arr1)}, max: {max(arr1)}\n')

# create a random number array from 1 to 100
arr2 = np.random.randint(1, 100, size=1000)
print(f'Second array - length: {len(arr2)}, min: {min(arr2)}, max: {max(arr2)}\n')

# create a random float numbers between 1 and 10, rounded to 3rd decimal
arr3 = np.random.uniform(1, 10, size=10).round(3)
print(f'Third array - length: {len(arr3)}, min: {min(arr3)}, max: {max(arr3)}\n')

# create the column from the random numbers array
df_exams['language score'] = arr2
df_exams.head(3)

First array - length: 1000, min: 0, max: 999

Second array - length: 1000, min: 1, max: 99

Third array - length: 10, min: 1.355, max: 7.407



Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score
0,female,group B,bachelor's degree,standard,none,72,72,74,47
1,female,group C,some college,standard,completed,69,90,88,55
2,female,group B,master's degree,standard,none,90,95,93,56


### .assign()

In [8]:
# add a new column with assign()

# create two arrays with random numbers 
score1 = np.random.randint(1,100, size=1000)
score2 = np.random.randint(1,100, size=1000)

# create series from the arrays
series1 = pd.Series(score1, index=np.arange(0,1000))
series2 = pd.Series(score2, index=np.arange(0,1000))

# add the column to the data frame
df_exams = df_exams.assign(reliability=series1, procrastination=series2)
df_exams.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,reliability,procrastination
0,female,group B,bachelor's degree,standard,none,72,72,74,47,11,83
1,female,group C,some college,standard,completed,69,90,88,55,31,36
2,female,group B,master's degree,standard,none,90,95,93,56,10,93


When to use assign?

    - Add multiple columns in a single line of code.
    - Overwrite the values of an existing columns (best practice).

It returns a new object(a copy) with all the original columns in addition to the new ones.

### .insert()

In [9]:
# add a new column with insert()
df_exams.insert(1, 'insert at index 1', 'inserted')
df_exams.head(3)

Unnamed: 0,gender,insert at index 1,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,language score,reliability,procrastination
0,female,inserted,group B,bachelor's degree,standard,none,72,72,74,47,11,83
1,female,inserted,group C,some college,standard,completed,69,90,88,55,31,36
2,female,inserted,group B,master's degree,standard,none,90,95,93,56,10,93


## Rename a Column from the Data Frame

In [10]:
df_exams.rename(columns={'gender':'Gender'}, inplace=True)

In [11]:
df_exams.rename(columns={'math score':'MS', 'reading score':'RS', 'writing score':'RS'}).head(3)

Unnamed: 0,Gender,insert at index 1,race/ethnicity,parental level of education,lunch,test preparation course,MS,RS,RS.1,language score,reliability,procrastination
0,female,inserted,group B,bachelor's degree,standard,none,72,72,74,47,11,83
1,female,inserted,group C,some college,standard,completed,69,90,88,55,31,36
2,female,inserted,group B,master's degree,standard,none,90,95,93,56,10,93
