## Import libraries

In [2]:
import pandas as pd
import numpy as np

## Data Frame Creation

In [3]:
# Build a 5x4 grid holding the integers 0..19, filled row by row.
numbers = np.reshape(np.arange(0, 20), (5, 4))
print(numbers)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]]


Create a new DataFrame from the ***numbers*** array. Without explicit labels, pandas assigns default integer row and column labels.

In [4]:
# Wrap the 2-D array in a DataFrame; rows and columns get default integer labels.
df = pd.DataFrame(data=numbers)
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


### columns
Add column names

In [5]:
# Same data as before, but with explicit column labels in place of 0..3.
df = pd.DataFrame(
    data=numbers,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


### index
Replacing the default index

In [7]:
days = ['Sun','Mon','Tue','Wed','Thu']
sale_ids = range(100, 150, 10)

# Seeded generator so the table is identical on every Restart & Run All.
rng = np.random.default_rng(42)

# 5x5 random integers in [1000, 9999) -- the upper bound is exclusive.
# Stored under its own name rather than rebinding `numbers`, so the earlier
# 5x4 `numbers` array (and the cells built on it) can still be re-run.
sale_amounts = rng.integers(1000, 9999, size=(5, 5))

# Row labels come from sale_ids (100, 110, ..., 140); column labels from days.
sales = pd.DataFrame(data=sale_amounts, index=sale_ids, columns=days)
sales

Unnamed: 0,Sun,Mon,Tue,Wed,Thu
100,3213,8015,8480,8345,2847
110,5392,1313,9701,1094,3011
120,6617,1603,9450,1231,5986
130,5786,7986,6299,2058,1255
140,3012,6214,9526,8732,6135


### head / tail
We can display the first / last rows using `head()` / `tail()`. Both take the number of rows to show (default is 5).

In [8]:
# Show only the first three rows instead of the default five.
sales.head(n=3)

Unnamed: 0,Sun,Mon,Tue,Wed,Thu
100,3213,8015,8480,8345,2847
110,5392,1313,9701,1094,3011
120,6617,1603,9450,1231,5986


In [9]:
# Show only the last two rows instead of the default five.
sales.tail(n=2)

Unnamed: 0,Sun,Mon,Tue,Wed,Thu
130,5786,7986,6299,2058,1255
140,3012,6214,9526,8732,6135


## Creating Data Frame from a CSV file
Import a CSV file into a Dataframe

In [2]:
# Load the whole file with default settings: all columns are kept and the
# rows get a default integer index.
df = pd.read_csv(filepath_or_buffer='athlete_events.csv')
df.head(n=5)

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


### usecols
Load only specific columns from the file. `usecols` expects a list-like of column names, even when selecting a single column (e.g. `usecols=['Name']`, not `usecols='Name'`).

In [4]:
# Restrict the load to the six listed columns; every other column in the file is skipped.
df = pd.read_csv(
    'athlete_events.csv',
    usecols=['ID', 'Name', 'Sex', 'Age', 'Team', 'Sport'],
)
df.head()

Unnamed: 0,ID,Name,Sex,Age,Team,Sport
0,1,A Dijiang,M,24.0,China,Basketball
1,2,A Lamusi,M,23.0,China,Judo
2,3,Gunnar Nielsen Aaby,M,24.0,Denmark,Football
3,4,Edgar Lindenau Aabye,M,34.0,Denmark/Sweden,Tug-Of-War
4,5,Christine Jacoba Aaftink,F,21.0,Netherlands,Speed Skating


### index_col
Use a specific column as the DataFrame's index instead of the default integer index.

In [7]:
# Load the same six columns, but promote 'ID' from a regular column to the row index.
# Note: ID values repeat in this file, so the resulting index is not unique.
df = pd.read_csv(
    'athlete_events.csv',
    usecols=['ID', 'Name', 'Sex', 'Age', 'Team', 'Sport'],
    index_col='ID',
)
df.head()

Unnamed: 0_level_0,Name,Sex,Age,Team,Sport
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,A Dijiang,M,24.0,China,Basketball
2,A Lamusi,M,23.0,China,Judo
3,Gunnar Nielsen Aaby,M,24.0,Denmark,Football
4,Edgar Lindenau Aabye,M,34.0,Denmark/Sweden,Tug-Of-War
5,Christine Jacoba Aaftink,F,21.0,Netherlands,Speed Skating


### Export data to a new CSV file

In [None]:
# Export only the first five rows to a new CSV file.
# The preview is kept in its own variable instead of being assigned back to `df`,
# so `df` still holds the full dataset for the cells that follow.
df_preview = df.head()

# The index (ID) is written as the first column; ',' is the field separator.
df_preview.to_csv('New_athlete_data.csv', sep=',')

### columns
Get only the column names

In [8]:
# Column labels of the DataFrame, returned as a pandas Index (the index column is not included).
df.columns

Index(['Name', 'Sex', 'Age', 'Team', 'Sport'], dtype='object')

### index
Get the index (row labels) of the DataFrame

In [9]:
# Row labels of the DataFrame; here they are the values of the 'ID' column set as the index.
df.index

Int64Index([     1,      2,      3,      4,      5,      5,      5,      5,
                 5,      5,
            ...
            135565, 135566, 135567, 135567, 135568, 135569, 135570, 135570,
            135571, 135571],
           dtype='int64', name='ID', length=271116)

### axes
Get both index labels and column labels

In [10]:
# A two-item list: the row index first, then the column labels.
df.axes

[Int64Index([     1,      2,      3,      4,      5,      5,      5,      5,
                  5,      5,
             ...
             135565, 135566, 135567, 135567, 135568, 135569, 135570, 135570,
             135571, 135571],
            dtype='int64', name='ID', length=271116),
 Index(['Name', 'Sex', 'Age', 'Team', 'Sport'], dtype='object')]

### shape
Get the dimensions of the DataFrame as a `(rows, columns)` tuple

In [6]:
# Dimensions as a (rows, columns) tuple.
# NOTE(review): the recorded result reports 15 columns, but the most recent load in this
# notebook keeps only 5 columns besides the ID index -- presumably this output comes from
# an earlier full-file load; re-run the notebook top to bottom to confirm.
df.shape

(271116, 15)

### size
Get the total number of cells in the DataFrame (rows × columns, missing values included)

In [8]:
# Total number of cells: rows multiplied by columns, counting missing values too.
df.size

4066740