In [1]:
import pandas as pd
import numpy as np

 
 Pandas comes with a wide variaty of methods to load data from different data sources.
 
 We will try to go over all avaialbe Input methods.
 
 
 We will start with the most used one.
 
 ## Loading data from Flat files
 
 #### CSV files

In [4]:
# read_csv - Read a comma-separated values (csv) file into DataFrame.

users_df = pd.read_csv('DataSources\my_csv.csv')

users_df


Unnamed: 0,Name,Age
0,Adrian,33
1,Julia,21


In [5]:
# read_csv and assign the index column

users_df = pd.read_csv('DataSources\my_csv_idx.csv',index_col=0)

users_df



Unnamed: 0_level_0,Name,Age
Idx,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adrian,33
2,Julia,21


#### Reading data from JSON

In [6]:
# Read from a json file 
users_df = pd.read_json('DataSources\my_json.json')

users_df


Unnamed: 0,Name,Age
0,Adrian,33
1,Julia,21


#### Reading data from a Tab delimited file

In [7]:
# Read from a Tab delimited file 
users_df = pd.read_csv('DataSources\my_tab_delim.txt', sep='\t')

users_df


Unnamed: 0,Name Age
0,Adrian 33
1,Julia 21


In [8]:
# We can do the same using read_table() function, this function is similar to read_csv() but it has the default sep parameter 
# set to '\t'
# Read from a Tab delimited file 
users_df = pd.read_table('DataSources\my_tab_delim.txt')

users_df


Unnamed: 0,Name Age
0,Adrian 33
1,Julia 21


#### Reading data from a file with a custom delimter

In [9]:
# Read from a Custom delimited file
## our delimiter in this case is @
users_df = pd.read_csv('DataSources\my_custom_delim.txt', sep='@')

users_df


Unnamed: 0,Name,Age
0,Adrian,33
1,Julia,21


#### Reading data from a file with quoted values

In [10]:
# Reading data from a file with quoted values
## the option quoting supports the following values f QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3)
users_df = pd.read_csv('DataSources\my_quoted_data.txt', quoting=0)

users_df


Unnamed: 0,Name,Age
0,Adrian,33
1,Julia,21


#### Reading data from an Excel file

In [23]:
# this function we will have to have xlrd module installed 
# pip3 install xlrd
users_df = pd.read_excel('DataSources\my_excel.xlsx')

users_df

# By default this will return only the sheet with id 0 (sheet_name)

# to read sheet2 we can use
pd.read_excel('DataSources\my_excel.xlsx',sheet_name=1)

# Or if we wanna use the name of the sheet 
pd.read_excel('DataSources\my_excel.xlsx','Sheet2')


Unnamed: 0,Age
0,21
1,14
2,45
3,34
4,22


#### Reading data from an HTML file

In [38]:
url = 'https://www.contextures.com/xlSampleData01.html'
users_df = pd.read_html(url,header=0)
pd.set_option('display.max_rows', None)
users_df[0]

Unnamed: 0,OrderDate,Region,Rep,Item,Units,UnitCost,Total
0,1/6/2019,East,Jones,Pencil,95,1.99,189.05
1,1/23/2019,Central,Kivell,Binder,50,19.99,999.5
2,2/9/2019,Central,Jardine,Pencil,36,4.99,179.64
3,2/26/2019,Central,Gill,Pen,27,19.99,539.73
4,3/15/2019,West,Sorvino,Pencil,56,2.99,167.44
5,4/1/2019,East,Jones,Binder,60,4.99,299.4
6,4/18/2019,Central,Andrews,Pencil,75,1.99,149.25
7,5/5/2019,Central,Jardine,Pencil,90,4.99,449.1
8,5/22/2019,West,Thompson,Pencil,32,1.99,63.68
9,6/8/2019,East,Jones,Binder,60,8.99,539.4


In [7]:
# Or another sinple example 
pd.read_html('https://www.sigmaaldrich.com/technical-documents/articles/biology/periodic-table-of-elements-names.html', header=0)[1]

Unnamed: 0,Element Name,Symbol,Atomic Number,Electronegativity (χ)
0,Actinium,Ac,89,1.10
1,Aluminum,Al,13,1.61
2,Americium,Am,95,1.30
3,Antimony,Sb,51,2.05
4,Argon,Ar,18,
...,...,...,...,...
113,Xenon,Xe,54,2.60
114,Ytterbium,Yb,70,
115,Yttrium,Y,39,1.22
116,Zinc,Zn,30,1.65


#### Reading data from an Parquet file

In [11]:
# We can read the contents of parquet file using the read_parquet() function
# we need to have pyarrow or fastparquet
# since i am running on windowzz we will use fastparquet.

####  pip install fastparquet 

users_df = pd.read_parquet('DataSources\my_parquet.parquet', engine='pyarrow')
users_df = pd.read_parquet('DataSources\my_parquet.parquet', engine='fastparquet')


users_df

ImportError: Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.

#### Reading data from an ORC file

In [3]:
# This is not like Orc from Load of the Rings 
# ORC is an Optimized Row Columnar file

user_df = pd.read_orc('DataSources\my_orc')

user_df

ModuleNotFoundError: No module named 'pyarrow'

#### Reading Data from a Database

probably one of the most used function to read data into a dataframe

In [10]:
# before we go ahead we need to have pymysql installed 

### !pip install pymysql

# import module 
import pymysql

# establish connection 
connection = pymysql.connect(user='root', password='root', database='information_schema', host='localhost')

query = "SELECT table_schema,table_name FROM information_schema.tables"
my_df = pd.read_sql(query, connection)
connection.close()

my_df


Unnamed: 0,table_schema,table_name
0,information_schema,ALL_PLUGINS
1,information_schema,APPLICABLE_ROLES
2,information_schema,CHARACTER_SETS
3,information_schema,CHECK_CONSTRAINTS
4,information_schema,COLLATIONS
...,...,...
176,performance_schema,socket_instances
177,performance_schema,socket_summary_by_instance
178,performance_schema,socket_summary_by_event_name
179,performance_schema,session_connect_attrs


In [25]:
# We can also have parameters passed to the query while running the read_sql() function 
import pymysql

# establish connection 
connection = pymysql.connect(user='root', password='root', database='information_schema', host='localhost')

# Build Query using the string substitute placeholder %s
query = "SELECT table_schema,table_name FROM information_schema.tables where table_schema = %s"

# create DataFrame 
my_df = pd.read_sql(query, connection, params={'mysql'})
                    
## What about multiple parameters 
query = "SELECT table_schema,table_name FROM information_schema.tables where table_schema in (%s,%s)"

# create DataFrame 
my_df = pd.read_sql(query, connection, params={'mysql','information_schema'})
   
my_df


Unnamed: 0,table_schema,table_name
0,information_schema,ALL_PLUGINS
1,information_schema,APPLICABLE_ROLES
2,information_schema,CHARACTER_SETS
3,information_schema,CHECK_CONSTRAINTS
4,information_schema,COLLATIONS
...,...,...
103,mysql,time_zone_name
104,mysql,time_zone_transition
105,mysql,time_zone_transition_type
106,mysql,transaction_registry
