In [68]:
import pandas as pd
import numpy as np

## Create dataframe from array
### 1.1 Option 1

In [69]:
data = np.array([[1, 4], [2, 5], [3, 6]])
df = pd.DataFrame(data, index=['row1', 'row2', 'row3'], columns=['col1', 'col2'])
df

Unnamed: 0,col1,col2
row1,1,4
row2,2,5
row3,3,6


### 1.2 Option 2 (from List)

In [70]:
data = [[1, 4], [2, 5], [3, 6]]
df = pd.DataFrame(data, index=['row1', 'row2', 'row3'], columns=['col1', 'col2'])
df

Unnamed: 0,col1,col2
row1,1,4
row2,2,5
row3,3,6


## Creating a DataFrame from dict

In [71]:
states = ['California', 'Texus', 'Florida', 'NY']
population = [123445, 12311, 54564, 453453]

# store list within a dict
dict_states = {'states': states, 'population': population}

# Create the dataframe
df_population = pd.DataFrame(dict_states)

# print dataframe
df_population

Unnamed: 0,states,population
0,California,123445
1,Texus,12311
2,Florida,54564
3,NY,453453


## Creating Dataframe from CSV

In [72]:
# reading the csv file
df_exams = pd.read_csv('test.csv')

# show first 5 rows in the DF
df_exams

Unnamed: 0,gender,race,parental level,lunch,test preparation,math score,reading score,writing score
0,female,group b,some college,standard,completed,45,34,56
1,female,group b,some college,standard,completed,45,34,56
2,male,group c,some college,standard,completed,85,46,32
3,male,group d,some uni,standard,completed,85,46,32
4,male,group e,high school,standard,completed,85,46,32
5,male,group f,uni,standard,completed,85,46,32


In [73]:
# show first 5 rows in DF
df_exams.head()


Unnamed: 0,gender,race,parental level,lunch,test preparation,math score,reading score,writing score
0,female,group b,some college,standard,completed,45,34,56
1,female,group b,some college,standard,completed,45,34,56
2,male,group c,some college,standard,completed,85,46,32
3,male,group d,some uni,standard,completed,85,46,32
4,male,group e,high school,standard,completed,85,46,32


In [74]:
# show last 5 rows in DF
df_exams.tail(2)

Unnamed: 0,gender,race,parental level,lunch,test preparation,math score,reading score,writing score
4,male,group e,high school,standard,completed,85,46,32
5,male,group f,uni,standard,completed,85,46,32


# Attribute

In [75]:
# display N rows
# pd.set_option('display.max_rows', 7)
# df_exams

In [76]:
# getting access to the shape attribute
df_exams.shape

(6, 8)

In [77]:
# getting access to the index attribute
df_exams.index

RangeIndex(start=0, stop=6, step=1)

In [78]:
# getting access to column attribute
df_exams.columns

Index(['gender', 'race', 'parental level', 'lunch', 'test preparation',
       'math score', 'reading score', 'writing score'],
      dtype='object')

In [79]:
# data types of column
df_exams.dtypes

gender            object
race              object
parental level    object
                   ...  
math score         int64
reading score      int64
writing score      int64
Length: 8, dtype: object

# Methods

In [80]:
# showing the info of the dataframe
df_exams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   gender            6 non-null      object
 1   race              6 non-null      object
 2   parental level    6 non-null      object
 3   lunch             6 non-null      object
 4   test preparation  6 non-null      object
 5   math score        6 non-null      int64 
 6   reading score     6 non-null      int64 
 7   writing score     6 non-null      int64 
dtypes: int64(3), object(5)
memory usage: 516.0+ bytes


In [81]:
# describe basic statistics of the dataframe
df_exams.describe()

Unnamed: 0,math score,reading score,writing score
count,6.000000,6.000000,6.000000
mean,71.666667,42.000000,40.000000
std,20.655911,6.196773,12.393547
...,...,...,...
50%,85.000000,46.000000,32.000000
75%,85.000000,46.000000,50.000000
max,85.000000,46.000000,56.000000


# Functions

In [82]:
# the highest index of the dataframe
max(df_exams.index)

5

In [83]:
min(df_exams.index)

0

In [84]:
# the length of the dataframe(no of rows)
len(df_exams)

6

In [85]:
# data type
type(df_exams)

pandas.core.frame.DataFrame

In [86]:
# rounding the values of the dataset
round(df_exams, 2)

Unnamed: 0,gender,race,parental level,lunch,test preparation,math score,reading score,writing score
0,female,group b,some college,standard,completed,45,34,56
1,female,group b,some college,standard,completed,45,34,56
2,male,group c,some college,standard,completed,85,46,32
3,male,group d,some uni,standard,completed,85,46,32
4,male,group e,high school,standard,completed,85,46,32
5,male,group f,uni,standard,completed,85,46,32


# Added new column with an array

In [87]:
language_score = np.arange(0, 6)

In [88]:
# length of the array
len(language_score)

6

In [89]:
# adding a new column to dataframe with an array
df_exams['language score'] = language_score
df_exams

Unnamed: 0,gender,race,parental level,lunch,test preparation,math score,reading score,writing score,language score
0,female,group b,some college,standard,completed,45,34,56,0
1,female,group b,some college,standard,completed,45,34,56,1
2,male,group c,some college,standard,completed,85,46,32,2
3,male,group d,some uni,standard,completed,85,46,32,3
4,male,group e,high school,standard,completed,85,46,32,4
5,male,group f,uni,standard,completed,85,46,32,5


In [90]:
# create random integer number between 1 to 100
int_language_score = np.random.randint(1, 100, size=6)

In [91]:
# min value inclusive and max value exclusive
min(int_language_score)
max(int_language_score)

95

In [92]:
# adding a new column to dataframe with an array
df_exams['language score'] = int_language_score
df_exams

Unnamed: 0,gender,race,parental level,lunch,test preparation,math score,reading score,writing score,language score
0,female,group b,some college,standard,completed,45,34,56,52
1,female,group b,some college,standard,completed,45,34,56,63
2,male,group c,some college,standard,completed,85,46,32,78
3,male,group d,some uni,standard,completed,85,46,32,95
4,male,group e,high school,standard,completed,85,46,32,25
5,male,group f,uni,standard,completed,85,46,32,50


In [93]:
# create random float numbers between 1 to 100
np.random.uniform(1, 100, size=6)

array([31.3940807 , 12.19020183, 90.17342984, 33.9578896 , 83.94282104,
       69.76685686])

# Math operations
## 1.1 Operations in column

In [95]:
# 1.1 Operations in columns
# sum
df_exams['math score'].sum()

430

In [96]:
# count, mean, std, max, and min
df_exams['math score'].count()
df_exams['math score'].mean()
df_exams['math score'].max()
df_exams['math score'].min()

45

In [97]:
# easier calculation with .describe()
df_exams.describe()

Unnamed: 0,math score,reading score,writing score,language score
count,6.000000,6.000000,6.000000,6.000000
mean,71.666667,42.000000,40.000000,60.500000
std,20.655911,6.196773,12.393547,24.271382
...,...,...,...,...
50%,85.000000,46.000000,32.000000,57.500000
75%,85.000000,46.000000,50.000000,74.250000
max,85.000000,46.000000,56.000000,95.000000


## Operations in rows

In [98]:
# calculating the sum in rows
df_exams['math score'] + df_exams['reading score'] + df_exams['writing score']

0    135
1    135
2    163
3    163
4    163
5    163
dtype: int64

In [99]:
# calculating the average score and assigning the result to a new column
df_exams['average'] = (df_exams['math score'] + df_exams['reading score'] + df_exams['writing score']) / 3
df_exams

Unnamed: 0,gender,race,parental level,lunch,test preparation,math score,reading score,writing score,language score,average
0,female,group b,some college,standard,completed,45,34,56,52,45.0
1,female,group b,some college,standard,completed,45,34,56,63,45.0
2,male,group c,some college,standard,completed,85,46,32,78,54.333333
3,male,group d,some uni,standard,completed,85,46,32,95,54.333333
4,male,group e,high school,standard,completed,85,46,32,25,54.333333
5,male,group f,uni,standard,completed,85,46,32,50,54.333333


In [100]:
# showing the dataframe
df_exams.round(2)

Unnamed: 0,gender,race,parental level,lunch,test preparation,math score,reading score,writing score,language score,average
0,female,group b,some college,standard,completed,45,34,56,52,45.0
1,female,group b,some college,standard,completed,45,34,56,63,45.0
2,male,group c,some college,standard,completed,85,46,32,78,54.33
3,male,group d,some uni,standard,completed,85,46,32,95,54.33
4,male,group e,high school,standard,completed,85,46,32,25,54.33
5,male,group f,uni,standard,completed,85,46,32,50,54.33


## Value counts

In [101]:
# counting gender element
df_exams['gender'].count()

6

In [102]:
# counting gender element by category
df_exams['gender'].value_counts()

gender
male      4
female    2
Name: count, dtype: int64

In [103]:
# return the relative frequency(divide all values by the sum of values)
df_exams['gender'].value_counts(normalize=True)

gender
male      0.666667
female    0.333333
Name: proportion, dtype: float64

In [104]:
# counting parental level elements by category
df_exams['parental level'].value_counts()

parental level
 some college    3
 some uni        1
 high school     1
 uni             1
Name: count, dtype: int64

In [105]:
# return the relative frequency and round to decimals
df_exams['parental level'].value_counts(normalize=True).round(2)

parental level
 some college    0.50
 some uni        0.17
 high school     0.17
 uni             0.17
Name: proportion, dtype: float64

# Sort a DataFrame

In [108]:
# sort by one column
df_exams.sort_values(by='math score', ascending=False)

Unnamed: 0,gender,race,parental level,lunch,test preparation,math score,reading score,writing score,language score,average
2,male,group c,some college,standard,completed,85,46,32,78,54.333333
3,male,group d,some uni,standard,completed,85,46,32,95,54.333333
4,male,group e,high school,standard,completed,85,46,32,25,54.333333
5,male,group f,uni,standard,completed,85,46,32,50,54.333333
0,female,group b,some college,standard,completed,45,34,56,52,45.0
1,female,group b,some college,standard,completed,45,34,56,63,45.0


In [109]:
# sort descending by multiple columns
df_exams.sort_values(by=['math score', 'reading score'], ascending=False)

Unnamed: 0,gender,race,parental level,lunch,test preparation,math score,reading score,writing score,language score,average
2,male,group c,some college,standard,completed,85,46,32,78,54.333333
3,male,group d,some uni,standard,completed,85,46,32,95,54.333333
4,male,group e,high school,standard,completed,85,46,32,25,54.333333
5,male,group f,uni,standard,completed,85,46,32,50,54.333333
0,female,group b,some college,standard,completed,45,34,56,52,45.0
1,female,group b,some college,standard,completed,45,34,56,63,45.0
