# Loading CSV

In [2]:
# import modules
import pandas as pd
import numpy as np

In [55]:
# Build the sample frame that the following cells write to disk and read back.
# "." marks a missing entry and strings such as "25,000" carry a thousands
# separator, so the loading options shown later have something to act on.
raw_data = dict(
    first_name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', 'Jacobson', ".", 'Milner', 'Cooze'],
    age=[42, 52, 36, 24, 73],
    preTestScore=[4, 24, 31, ".", "."],
    postTestScore=["25,000", "94,000", 57, 62, 70],
)
# The dict keeps insertion order, so its keys double as the column order.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42,4,25000
1,Molly,Jacobson,52,24,94000
2,Tina,.,36,31,57
3,Jake,Milner,24,.,62
4,Amy,Cooze,73,.,70


In [56]:
# Write the frame to ./data/example.csv (relative to the working directory).
# The row index is written too, as a first column without a header name;
# that is why a plain reload shows an extra "Unnamed: 0" column.
df.to_csv(path_or_buf='./data/example.csv', index=True)

In [57]:
# Read the file back with default options. The index that was saved with the
# data comes back as an ordinary column labelled "Unnamed: 0".
df = pd.read_csv(filepath_or_buffer='./data/example.csv')
df

Unnamed: 0.1,Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,0,Jason,Miller,42,4,25000
1,1,Molly,Jacobson,52,24,94000
2,2,Tina,.,36,31,57
3,3,Jake,Milner,24,.,62
4,4,Amy,Cooze,73,.,70


In [58]:
# Read the file and use its first column (position 0) as the row index
# instead of keeping it as a data column.
df = pd.read_csv(
    filepath_or_buffer='./data/example.csv',
    index_col=0,
)
df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42,4,25000
1,Molly,Jacobson,52,24,94000
2,Tina,.,36,31,57
3,Jake,Milner,24,.,62
4,Amy,Cooze,73,.,70


In [59]:
# Read the file as if it had no header row: columns are numbered 0..5 and the
# original header line shows up as the first row of data.
df = pd.read_csv(
    filepath_or_buffer='./data/example.csv',
    header=None,
)
df

Unnamed: 0,0,1,2,3,4,5
0,,first_name,last_name,age,preTestScore,postTestScore
1,0.0,Jason,Miller,42,4,25000
2,1.0,Molly,Jacobson,52,24,94000
3,2.0,Tina,.,36,31,57
4,3.0,Jake,Milner,24,.,62
5,4.0,Amy,Cooze,73,.,70


In [60]:
# Read the file with our own column names.
# skiprows=1 drops the file's original header line so it is not read as data.
df = pd.read_csv(
    filepath_or_buffer='./data/example.csv',
    names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'],
    skiprows=1,
)
df

Unnamed: 0,UID,First Name,Last Name,Age,Pre-Test Score,Post-Test Score
0,0,Jason,Miller,42,4,25000
1,1,Molly,Jacobson,52,24,94000
2,2,Tina,.,36,31,57
3,3,Jake,Milner,24,.,62
4,4,Amy,Cooze,73,.,70


In [61]:
# Read the file with our own column names and use the 'UID' column as the index.
# skiprows=1 drops the file's original header line so it is not read as data.
df = pd.read_csv(
    filepath_or_buffer='./data/example.csv',
    names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'],
    skiprows=1,
    index_col='UID',
)
df

Unnamed: 0_level_0,First Name,Last Name,Age,Pre-Test Score,Post-Test Score
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Jason,Miller,42,4,25000
1,Molly,Jacobson,52,24,94000
2,Tina,.,36,31,57
3,Jake,Milner,24,.,62
4,Amy,Cooze,73,.,70


In [62]:
# Read the file with our own column names and build a two-level row index
# from the 'First Name' and 'Last Name' columns.
# skiprows=1 drops the file's original header line so it is not read as data.
df = pd.read_csv(
    filepath_or_buffer='./data/example.csv',
    names=['UID', 'First Name', 'Last Name', 'Age', 'Pre-Test Score', 'Post-Test Score'],
    skiprows=1,
    index_col=['First Name', 'Last Name'],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,UID,Age,Pre-Test Score,Post-Test Score
First Name,Last Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Jason,Miller,0,42,4,25000
Molly,Jacobson,1,52,24,94000
Tina,.,2,36,31,57
Jake,Milner,3,24,.,62
Amy,Cooze,4,73,.,70


In [63]:
# Treat "." in any column as a missing value while loading, then show a
# True/False mask of which cells ended up missing.
df = pd.read_csv(
    filepath_or_buffer='./data/example.csv',
    na_values=['.'],
    index_col=0,
)
df.isnull()

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,False,False,False,False,False
1,False,False,False,False,False
2,False,True,False,False,False
3,False,False,False,True,False
4,False,False,False,True,False


In [64]:
# Per-column missing-value markers: "." and "NA" count as missing in the
# last_name column, and "." counts as missing in the preTestScore column.
# Other columns keep their "." values untouched.
sentinels = dict(last_name=['.', 'NA'], preTestScore=['.'])
# Note: the result is stored in `f` here (not `df`); `sentinels` is reused by the next cell.
f = pd.read_csv(
    filepath_or_buffer='./data/example.csv',
    na_values=sentinels,
)
f

Unnamed: 0.1,Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,0,Jason,Miller,42,4.0,25000
1,1,Molly,Jacobson,52,24.0,94000
2,2,Tina,,36,31.0,57
3,3,Jake,Milner,24,,62
4,4,Amy,Cooze,73,,70


In [65]:
# Skip the first 3 lines of the file while loading.
# The header line is one of the skipped lines, so the first remaining line
# (the "Tina" record) is taken as the header, as the output below shows.
df = pd.read_csv(
    filepath_or_buffer='./data/example.csv',
    skiprows=3,
    na_values=sentinels,
)
df

Unnamed: 0,2,Tina,.,36,31,57
0,3,Jake,Milner,24,.,62
1,4,Amy,Cooze,73,.,70


In [66]:
# Load a csv while interpreting "," inside numeric strings as a thousands separator
df = pd.read_csv(
    filepath_or_buffer='./data/example.csv',
    thousands=',',
    index_col=0,
)
df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42,4,25000
1,Molly,Jacobson,52,24,94000
2,Tina,.,36,31,57
3,Jake,Milner,24,.,62
4,Amy,Cooze,73,.,70


# Unicode Errors

Oftentimes, you will run into errors about text encoding and formatting. This may be due to non-ASCII characters in the document or problems with how the file was exported. In some cases, the files were created by different applications or in different countries. When reading a file, pandas assumes the encoding is 'utf-8' by default. Here is a list of [other standard encodings](https://docs.python.org/3/library/codecs.html#standard-encodings).

In [3]:
# Demonstrate the failure: with no encoding given, the file is decoded as
# UTF-8, which this file is not. The error is caught and printed (instead of
# left to propagate) so that "Restart & Run All" can continue past this cell.
try:
    df = pd.read_csv('./data/nba.csv')
except UnicodeDecodeError as err:
    # A trailing None produces no cell output, so only the message is shown.
    nba_head = None
    print(f'{type(err).__name__}: {err}')
else:
    nba_head = df.head()
nba_head

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 14: unexpected end of data

This file was generated by someone in Europe, so I will define the encoding as `'latin-1'` (also known as `'iso-8859-1'`; the two names refer to the same codec).

In [8]:
# Read the same file again, this time decoding its bytes as Latin-1,
# and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='./data/nba.csv',
    encoding='Latin-1',
)
df.head()

Unnamed: 0,Name,Age,Team,POS,#,2013 $,Ht (In.),WT,EXP,1st Year,DOB,School,City,"State (Province, Territory, Etc..)",Country,Race,HS Only
0,"Gee, Alonzo",26,Cavaliers,F,33,"$3,250,000",78,219,4,2009,5/29/1987,Alabama,"Riviera Beach, FL",Florida,US,Black,No
1,"Wallace, Gerald",31,Celtics,F,45,"$10,105,855",79,220,12,2001,7/23/1982,Alabama,"Sylacauga, AL",Alabama,US,Black,No
2,"Williams, Mo",30,Trail Blazers,G,25,"$2,652,000",73,195,10,2003,12/19/1982,Alabama,"Jackson, MS",Mississippi,US,Black,No
3,"Gladness, Mickell",27,Magic,C,40,"$762,195",83,220,2,2011,7/26/1986,Alabama A&M,"Birmingham, AL",Alabama,US,Black,No
4,"Jefferson, Richard",33,Jazz,F,44,"$11,046,000",79,230,12,2001,6/21/1980,Arizona,"Los Angeles, CA",California,US,Black,No


Normally you can type `file -I <filename>` on the command line to have the system report a file's type and character encoding. In this case, the encoding is still reported as unknown. In those cases, you should check with the original author or source.

In [11]:
# In Jupyter, a leading exclamation point runs the rest of the line as a shell command.
# Here `file -I` is asked to report the type and charset of nba.csv.
# NOTE(review): the read_csv cells load './data/nba.csv', while this command
# targets 'nba.csv' in the working directory; confirm the intended path.
!file -I nba.csv

nba.csv: text/plain; charset=unknown-8bit


# Loading JSON

In [67]:
# Read a JSON file into plain Python objects and pretty-print the result
import json
from pprint import pprint

# Pass the encoding explicitly: without it, open() falls back to the platform's
# locale default, which is not UTF-8 everywhere and can mis-decode or reject
# non-ASCII characters in the file.
with open('./data/example_1.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

pprint(data)

{'maps': [{'id': 'blabla', 'iscategorical': '0'},
          {'id': 'blabla', 'iscategorical': '0'}],
 'masks': {'id': 'valore'},
 'om_points': 'value',
 'parameters': {'id': 'valore'}}


In [68]:
# Pull individual values out of the parsed JSON: an item inside a list of
# dicts, a key of a nested dict, and a top-level key. One value per line.
print(
    data["maps"][0]["id"],
    data["masks"]["id"],
    data["om_points"],
    sep='\n',
)

blabla
valore
value


In [69]:
# Read a second, more deeply nested JSON file and pretty-print the result
import json
from pprint import pprint

# Pass the encoding explicitly: without it, open() falls back to the platform's
# locale default, which is not UTF-8 everywhere and can mis-decode or reject
# non-ASCII characters in the file.
with open('./data/example_2.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

pprint(data)

{'quiz': {'maths': {'q1': {'answer': '12',
                           'options': ['10', '11', '12', '13'],
                           'question': '5 + 7 = ?'},
                    'q2': {'answer': '4',
                           'options': ['1', '2', '3', '4'],
                           'question': '12 - 8 = ?'}},
          'sport': {'q1': {'answer': 'Los Angeles Lakers',
                           'options': ['New York Knicks',
                                       'Los Angeles Lakers',
                                       'Golden State Warriors',
                                       'Houston Rockets'],
                           'question': 'Which team did Kobe Bryant play '
                                       'for?'}}}}


In [70]:
# Show every question stored under the "maths" section of the quiz.
data['quiz']['maths']

{'q1': {'answer': '12',
  'options': ['10', '11', '12', '13'],
  'question': '5 + 7 = ?'},
 'q2': {'answer': '4',
  'options': ['1', '2', '3', '4'],
  'question': '12 - 8 = ?'}}

In [71]:
# Drill down to the list of answer options for the first sport question.
data['quiz']['sport']['q1']['options']

['New York Knicks',
 'Los Angeles Lakers',
 'Golden State Warriors',
 'Houston Rockets']