# Imports


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series

## DataFrame: 
1. A two-dimensional table of data with a row index; each column is a Series object.
2. The column index (df.columns) is usually a list of strings.
3. The row index (df.index) may consist of:
   
   a. integers
   b. strings
   c. dates or periods (a DatetimeIndex or PeriodIndex)



## Series:

An ordered, one-dimensional array of data with an index.
All data in a Series has the same type (just like an np.array).
Series arithmetic is vectorised: the operands' indexes are aligned first, then the operation is applied element-wise.

## Loading data in dataframe.

In [None]:

# Read a CSV file from disk into a DataFrame.
df = pd.read_csv('filename.csv')

# Build a DataFrame from in-memory data, e.g. a dictionary that maps each
# column name to that column's values.
df = pd.DataFrame(data)


## Saving a DataFrame


In [None]:

# Write the DataFrame to a UTF-8 encoded CSV file.
df.to_csv('filename.csv', encoding='utf-8')

# Convert to in-memory Python / NumPy objects. The result names deliberately
# avoid `dict` and `str`, so the builtins stay usable later in the notebook
# (e.g. for `astype(str)`).
as_dict = df.to_dict()    # -> dictionary of {column: {index: value}}
as_text = df.to_string()  # -> printable string
as_array = df.to_numpy()  # -> NumPy array (as_matrix() was removed in pandas 1.0)


## Working with dataframe

In [None]:
# --- General information ---
df.info()      # index and column data types
df.head(i)     # first i rows
df.tail(i)     # last i rows
df.describe()  # summary statistics of the columns

# --- Non-indexing attributes ---
df.T       # transpose: swap rows and columns
df.axes    # [row_index, col_index]; unpack with: row_index, col_index = df.axes
df.dtypes  # data type of each column
df.empty   # True for an empty DataFrame
df.ndim    # number of axes (2 for a DataFrame)
df.shape   # (row_count, column_count)
df.size    # row_count * column_count
df.values  # the data as a NumPy array

# --- Utility methods ---
df.copy()                        # copy of the DataFrame
df.rank()                        # rank the values within each column
df.sort_values(by=column_name)   # sort rows by the values of one column
df.sort_values(by=[col1, col2])  # sort by several columns; later ones break ties
df.sort_index()                  # sort rows by the index
df.astype(dtype)                 # type conversion

# Dataframe iteration
# Both calls return iterators; loop over them to obtain the pairs.
df.items()     # (column_label, Series) pairs; replaces iteritems(), removed in pandas 2.0
df.iterrows()  # (row_index, Series) pairs

# Statistical functions
# div(), dot() and mul() require an `other` operand; the arguments used below
# are only examples.
df.abs()      # absolute values
df.add(0)     # add a DataFrame, Series or scalar
df.count()    # number of non-NA/null values
df.cummax()   # cumulative max
df.cummin()   # cumulative min
df.cumsum()   # cumulative sum
df.diff()     # difference between each element and the one in the previous row
df.div(2)     # element-wise division by a scalar, Series or DataFrame
df.dot(df.T)  # matrix product with another DataFrame, Series or array
df.max()      # max along an axis
df.mean()     # mean
df.median()   # median
df.min()      # min along an axis
df.mul(2)     # element-wise multiplication by a scalar, Series or DataFrame
df.sum()      # sum along an axis

# Select rows / columns based on index label values
df.filter(items=['a', 'b'])   # keep the columns labelled 'a' and 'b'
df.filter(items=[4], axis=0)  # keep the row labelled 4
df.filter(like='x')           # keep columns whose label contains 'x'
df.filter(regex='x')          # keep columns whose label matches the regex
# DataFrame.select() was removed in pandas 1.0; a boolean mask built from the
# index gives the same result.
df.loc[df.index % 5 == 0]     # rows whose (numeric) index label is divisible by 5



## Working with columns

In [None]:
# Get column index and labels
idx = df.columns         # -> the column index
label = df.columns[0]    # -> first column label
l = df.columns.tolist()  # -> list of column labels
a = df.columns.values    # -> array of column labels

# Change column names
# rename() maps the listed old labels to new ones and leaves the rest alone.
df = df.rename(columns={'oldname': 'newname', 'oldname1': 'newname2'})
# Assigning to df.columns replaces every label at once; the list needs one
# entry per column, and the labels should be distinct.
df.columns = ['newname1', 'newname2', 'newname3']

# Selecting columns
df['colname']              # -> one column as a Series
df[['colname']]            # -> one column as a DataFrame
df[['a', 'b']]             # -> two or more columns
df = df[['c', 'b', 'a']]   # -> change the order of the columns
df[df.columns[0]]          # -> select by column position
df[df.columns[[1, 2, 3]]]  # -> several columns by position
df = df[df.columns[:-1]]   # -> all but the last column
df.pop('column')           # -> remove the column from df and return it

# Adding a new column
# The Series is aligned with df on the row index before it is assigned.
df['new_col'] = pd.Series([1, 2, 3, 4])
# Combining two DataFrames along the vertical axis (this adds rows).
# DataFrame.append() was removed in pandas 2.0; pd.concat() replaces it.
df3 = pd.concat([df1, df2])

# Removing columns
# drop() returns a new DataFrame; rebind the result to keep the change.
df = df.drop(columns=['col1', 'col2'])
del df['colname']                  # removes the column from df itself
df.drop(columns=df.columns[0])     # -> result without the first column
df.drop(columns=df.columns[-1:])   # -> result without the last column

# Vectorised arithmetic: the operation is applied to every element at once.
df['newColumn'] = df['col1'].div(df['col2'])
df['newColumn'] = df['col3'].mul(100)

# Column value manipulation: where() keeps a value where the condition holds
# and substitutes `other` everywhere else.
df['a'] = df['b'].where(df['a'] > 0, 0)
df['b'] = df['a'].where(df['b'] != 0, df['c'])

# --- Column datatype conversion ---
df['col'].astype(str)  # values converted to strings
df['col'].values       # values as a NumPy array
df['col'].tolist()     # values as a Python list

# --- Column-wide methods and attributes ---
df['col'].dtype           # data type of the column
df['col'].size            # number of elements in the column
df['col'].count()         # number of non-NA values
df['col'].sum()           # sum of all values
df['col'].prod()          # product of all values
df['col'].min()           # smallest value
df['col'].max()           # largest value
df['col'].mean()          # arithmetic mean
df['col'].median()        # median
df['col'].value_counts()  # how often each distinct value occurs

# Finding the index label of a column's smallest / largest value.
# A Series has no `.idx` attribute; idxmin() and idxmax() return the row label.
label = df['col1'].idxmin()  # row label of the minimum
label = df['col1'].idxmax()  # row label of the maximum

# Column element-wise methods
df['col'].isnull()               # True where the cell is empty, else False
df['col'].notnull()              # the inverse of isnull()
df['col'].astype(float)          # conversion to float
df['col'].abs()                  # absolute value
df['col'].round(decimals=0)      # round to the given number of decimals
df['col'].diff(periods=1)        # difference from the previous element
df['col'].shift(periods=1)       # shift the values by one position
pd.to_datetime(df['col'])        # to_datetime is a pandas function, not a Series method
df['col'].fillna(0)              # replace N/A values with 0
df['col'].cumsum()               # cumulative sum
df['col'].cumprod()              # cumulative product
df['col'].pct_change(periods=4)  # fractional change relative to 4 elements earlier
df['col'].rolling(window=4, min_periods=4, center=False).sum()  # rolling 4-element sum


# Selecting multiple columns
df.loc[:, 'col1':'col2']  # by label: both end points are included
df.iloc[:, 3:6]           # by position: the end position is excluded

# Test whether the column index values are unique / monotonic
df.columns.is_unique
df.columns.is_monotonic_increasing
df.columns.is_monotonic_decreasing


## Working with rows

In [None]:
# --- Row index and labels ---
idx = df.index         # the row index
label = df.index[0]    # label of the first row
label = df.index[-1]   # label of the last row
df.index.tolist()      # row labels as a list
df.index.values        # row labels as an array

# --- Changing the row index ---
df.index = idx                       # assign an index directly
df = df.set_index('A')               # column A becomes the index
df.reset_index()                     # reset the index
df.index = range(df.shape[0])        # number the rows 0..n-1
df.set_index(['r1', 'r2', 'r3'])     # index built from several columns
df.rename(index={'old': 'new'})      # relabel individual rows

# Boolean row selection
df[df['col'] >= 0.0]  # -> all rows with a value >= 0.0 in column col
df[(df['col1'] >= 1.0) | (df['col2'] < 3.0)]  # -> two criteria combined
""" Each comparison should be enclosed in parentheses """
""" The element-wise operators (& | ^ ~) have to be used instead of the Python keywords (and, or, not) """
df[df['col3'].isin([1, 2, 3, 7, 11])]   # -> rows holding one of the listed values
df[~df['col3'].isin([1, 2, 3, 7, 11])]  # -> rows holding any other value
# na=False counts missing values as "no match"; without it the mask contains
# NaN for missing cells and cannot be used for indexing.
df[df['col4'].str.contains("hello", na=False)]  # -> rows whose text contains "hello"

# Selecting a slice of rows by integer position (start inclusive, end exclusive).
# Each line rebinds df to the slice, so treat these as alternatives to try one
# at a time, not as steps to run in sequence.
df = df[:] # => all rows
df = df[0:2] # => rows at positions 0 and 1
df = df[2:3] # => the row at position 2
df = df[-1:] # => the last row
df = df[:-1] # => all but the last row
df = df[::2] # => every second row, starting with the first

# Iterating over dataframe rows
# iterrows() yields (row_index, Series) pairs, one per row.
for index, row in df.iterrows():
    pass  # replace with the per-row processing

# Sorting
# DataFrame.sort() no longer exists; sort_values() sorts by column values.
df.sort_values(by=df.columns[0], ascending=False)  # by the first column, descending
df.sort_values(by=['col1', 'col2'])                # by col1, ties broken by col2
df.sort_index(ascending=False)                     # by the index values, descending


""" Dropping duplicate rows """
# Rows count as duplicates when they agree on col1 and col2; keep='last'
# retains the last row of each group. `subset` and `keep` replace the removed
# `cols` and `take_last` arguments.
df.drop_duplicates(subset=['col1', 'col2'], keep='last')



## Working with cells

In [None]:
# Selecting a cell by row and column label
values = df.at['row', 'col']   # at[] is the fastest label-based scalar lookup
values = df.loc['row', 'col']
values = df['col'].at['row']   # chained lookup; acceptable for reading only

# Setting the value of a cell using row and column label
df.at['row', 'col'] = value
df.loc['row', 'col'] = value
# Do not assign through a chained lookup (df['col'].at['row'] = value):
# df['col'] can be a temporary copy, in which case the assignment leaves df
# unchanged. Use one of the two single-step forms above instead.

# Slice by label: rows 3 to 10 and columns col1 to col5.
df = df.loc[3:10, 'col1':'col5']

# Slice by integer position (rows first, then columns).
df = df.iloc[23:56, 1:5]
df = df.iloc[:, 1:5]      # every row, columns at positions 1-4
df = df.iloc[10:20]       # rows at positions 10-19, every column

""" The setting of new values can be done using assignment operators"""


## Selecting using dataframe index

In [None]:
# Selection of columns
df['column_name']    # -> returns a Series
df[['column_name']]  # -> returns a DataFrame
df[[col1, col2]]     # -> several columns as a DataFrame
df[index]            # -> selects the columns listed in index

# Using the dataframe index to select rows
# (`from` is a reserved word in Python, so the slice bounds are named start/stop)
df[start:stop]       # -> label slice
df[3:7]              # -> integer slice
df[df['col'] > 5.0]  # -> boolean Series
df.loc[label]        # -> single label


## Joining / Combining DataFrames

There are three ways to join / combine DataFrames:
1. merge (a database/SQL-style join)
2. concat (stack side by side or on top of each other)
3. combine_first (splice two together, preferring values from one over the other)

In [None]:
# merge
# Outer join that matches rows on the row index of both frames.
new_df = df1.merge(df2, how='outer', left_index=True, right_index=True)

# Left join that matches df1's col1 against df2's col2.
new_df = df1.merge(df2, how='left', left_on='col1', right_on='col2')


In [None]:
# concat
new_df = pd.concat([df1, df2], axis='index')    # stack on top of each other
new_df = pd.concat([df1, df2], axis='columns')  # join side by side

In [None]:
# combine_first
# Values from df take priority; df2 supplies values where df has none.
df = df.combine_first(df2)