In [1]:
from datetime import datetime

import pandas as pd

## Loading CSV files

In [None]:
# Basic load of a semicolon-separated file
data_frame = pd.read_csv('file.csv', sep=';')
# Same, additionally skipping the spaces that directly follow each delimiter
pd.read_csv('file.csv', sep=';', skipinitialspace=True)
# Regex separator that also swallows whitespace on both sides of the delimiter.
# Written as a raw string so that `\s` is not treated as an (invalid) string
# escape; regex separators require the Python parsing engine.
pd.read_csv('file.csv', sep=r'\s*;\s*', skipinitialspace=True, engine='python')

## Hardcoded DataFrames

In [None]:
import numpy as np

# Small hand-written frame used by the examples below.
# Built through the `pd` alias: the notebook imports `pandas as pd`, so the bare
# name `pandas` is undefined. 'col3' deliberately contains one missing value.
df = pd.DataFrame({
    'col1': ['Item0', 'Item0', 'Item1', 'Item1'],
    'col2': ['Gold', 'Bronze', 'Gold', 'Silver'],
    'col3': [1, 2, np.nan, 4],
})

## Previewing Data

In [2]:
# Displays the top 5 rows. Accepts an optional int parameter - num. of rows to show
df.head()

# Similar to head, but displays the last rows
df.tail()

# The dimensions of the dataframe as a (rows, cols) tuple
df.shape

# The number of rows (not columns). Equal to df.shape[0]
len(df) 

# An Index holding the column names
df.columns 

# Columns and their types
df.dtypes

# The frame's data as a two-dimensional NumPy array
df.values 

# Displays descriptive stats (by default only for the numeric columns)
df.describe()

NameError: name 'df' is not defined

## Sorting

In [None]:
# Sort rows descendingly by the index
df.sort_index(axis=0, ascending=False)
# Sort rows descendingly by 'col2', using 'col1' to break ties
df.sort_values(by=['col2', 'col1'], ascending=False)

## Selecting / Querying

In [None]:
# Selects only the column named 'col1' (attribute-style access)
df.col1 

# Same as previous, using bracket access with the column name
df['col1'] 

# Select two columns by passing a list of column names
df[['col1', 'col2']]

In [None]:
# Selects the second row (positions are zero-based)
df.iloc[1]
# Selects the rows at positions 1 and 2 (the end of the slice is exclusive)
df.iloc[1:3]
# First row, first column
df.iloc[0,0]
# First 4 rows and first 2 columns
df.iloc[0:4, 0:2]

In [None]:
# Produces a boolean Series (one value per row), not a single value!
df.col3 > 0
# Query by a single column value: keeps the rows where the mask is True
df[df.col3 > 0] 

# Query by a single column, if it is in a list of predefined values
df[df['col2'].isin(['Gold', 'Silver'])] 

# A conjunction query using two columns (each condition needs its own parentheses)
df[(df['col3'] > 0) & (df['col2'] == 'Silver')] 

# A disjunction query using two columns
df[(df['col3'] > 0) | (df['col2'] == 'Silver')]

# A query checking the textual content of the cells (substring match on 'col2')
df[df.col2.str.contains('ilver')]

## Modifying Data Frames

In [None]:
# Modifies the cell identified by its row index label and column name
df.at[1, 'col2'] = 'Bronze and Gold' 

# Modifies the cell identified by its absolute (zero-based) row and column positions
df.iat[1,1] = 'Bronze again' 

# Replaces the column with the array. It could be a numpy array or a simple list.
# Could also be used to create new columns.
# NOTE(review): 'col3' held numbers before this line; recent pandas versions may
# warn when .loc writes strings into a numeric column - confirm on your version.
df.loc[:,'col3'] = ['Unknown'] * len(df) 

# Same result via attribute access. This only works because 'col3' already
# exists: attribute assignment cannot be used to create a new column.
df.col3 = ['Unknown'] * len(df) 

# Returns a copy without the rows that contain any missing value
# (the result is not assigned, so df itself is left unchanged).
df.dropna(how='any') 

# Returns a copy without the rows in which every value is missing.
df.dropna(how='all')

In [None]:
def f(x):
    """Return ``x`` with the suffix ' New Column' appended using ``+``.

    Works for anything that supports ``+`` with a string on the right-hand
    side, e.g. a plain ``str`` or a pandas Series of strings (element-wise).
    """
    suffix = ' New Column'
    return x + suffix

# Uses the unary function f to create a new column based on an existing one.
# Bracket assignment is required here: `df.col4 = ...` would only set a Python
# attribute on the object and would NOT add a 'col4' column to the frame.
df['col4'] = f(df['col3'])

def g(x, y):
    """Return ``x`` and ``y`` concatenated with ``+`` and a '_' between them.

    Works for plain strings as well as for pandas Series of strings, in which
    case the concatenation is applied element-wise.
    """
    separator = '_'
    return x + separator + y

# Uses the 2-arg function g to create a new column based on 2 existing columns.
# Bracket assignment is required here: `df.col4 = ...` would only set a Python
# attribute on the object and would NOT add a 'col4' column to the frame.
df['col4'] = g(df['col3'], df['col2'])

## Dates and time

In [None]:
# Let pandas parse 'col1' and 'col2' as dates while loading.
# Uses the `pd` alias: the notebook imports `pandas as pd`, so `pandas` is undefined.
dates_df = pd.read_csv('test.csv', sep=';', parse_dates=['col1', 'col2'])

In [None]:
def custom_parser(s):
    """Parse a date string written in the non-standard '%d%b%Y' format.

    Example input: '05Mar2021' (day, abbreviated month name, 4-digit year).
    Uses the standard library's ``datetime`` rather than the ``pandas.datetime``
    alias, which is no longer available in current pandas versions.

    Returns a ``datetime.datetime``; raises ``ValueError`` if ``s`` does not
    match the format.
    """
    # Specify the non-standard format you need
    return datetime.strptime(s, '%d%b%Y')

# Parse 'col1' with the custom parser while loading.
# Uses the `pd` alias: the notebook imports `pandas as pd`, so `pandas` is undefined.
# NOTE(review): `date_parser` is deprecated since pandas 2.0; on newer versions
# prefer `date_format='%d%b%Y'` or convert with `pd.to_datetime` after loading.
dates_df = pd.read_csv('test.csv', sep=';', parse_dates=['col1'], date_parser=custom_parser)

In [None]:
# Convert an already-loaded text column to datetimes using an explicit format.
# Uses the `pd` alias: the notebook imports `pandas as pd`, so `pandas` is undefined.
dates_df['col2'] = pd.to_datetime(dates_df['col2'], format='%d.%m.%Y')

In [None]:
# Unit specifies if the time is in seconds ('s'), millis ('ms'), nanos ('ns') etc.
# Uses the `pd` alias: the notebook imports `pandas as pd`, so `pandas` is undefined.
dates_df['col'] = pd.to_datetime(dates_df['col'], unit='ms')

In [None]:
def timestamp_parser(n):
    """Convert an epoch timestamp given in milliseconds to a pandas Timestamp.

    ``n`` may be a number or a numeric string; it is passed through ``float``
    before conversion, so a non-numeric value raises ``ValueError``.
    """
    # Specify the unit you need
    return pd.to_datetime(float(n), unit='ms')

# Parse 'col1' as epoch-millisecond timestamps while loading.
# Uses the `pd` alias: the notebook imports `pandas as pd`, so `pandas` is undefined.
# NOTE(review): `date_parser` is deprecated since pandas 2.0; on newer versions
# load the column as-is and convert it with `pd.to_datetime(..., unit='ms')`.
dates_df = pd.read_csv('test.csv', sep=';', parse_dates=['col1'], date_parser=timestamp_parser)

In [None]:
# Creates a new numeric column with the timestamp epoch in nanos.
# Bracket assignment is required: `dates_df.col4 = ...` would only set a Python
# attribute and would NOT add a 'col4' column. Also uses the `pd` alias, since
# the bare name `pandas` is never imported.
# NOTE(review): assumes 'col3' already holds datetimes - confirm it was parsed.
dates_df['col4'] = pd.to_numeric(dates_df['col3'])

## Plotting

In [None]:
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Will allow us to embed images in the notebook
%matplotlib inline

In [None]:
# Small all-numeric frame used by the plotting examples below.
# Built through the `pd` alias: the notebook imports `pandas as pd`, so the bare
# name `pandas` is undefined.
plot_df = pd.DataFrame({
    'col1': [1, 3, 2, 4],
    'col2': [3, 6, 5, 1],
    'col3': [4, 7, 6, 2],
})

# Default plot: one line per column, drawn against the frame's index
plot_df.plot()

In [None]:
# Line plot of the remaining columns, using 'col1' for the x axis
plot_df.plot(x='col1')

In [None]:
# Use kind='barh' for horizontal bars, and stacked=True to stack the groups
plot_df.plot(kind='bar')

In [None]:
# Box plot summarising the distribution of the values in each column
plot_df.plot(kind='box')