## Working with JSON Files
You can hanlde JSON files directly with Pandas, using the  `read_json()` method

In [4]:
import pandas as pd
# Relative path to the sample JSON file used in this section.
path_to_json = "../data/example-1.json"
# Load the JSON file into a DataFrame.
df_json_data = pd.read_json(path_to_json)
# A bare trailing expression makes the notebook render the DataFrame as the cell output.
df_json_data

Unnamed: 0,Column 1,Column 2
0,1,2
1,3,4
2,5,6


In [5]:
# Convert the DataFrame back to JSON. Called without a path, to_json() returns
# the serialized document as a string, keyed by column and then by row label.
json_format = df_json_data.to_json()
# Show the resulting JSON string.
json_format

'{"Column 1":{"0":1,"1":3,"2":5},"Column 2":{"0":2,"1":4,"2":6}}'

## Working with CSV Files

With pandas you can work with CSV files using the  `read_csv()` method


In [6]:
# Relative path to the sample CSV file (this one has a header row).
path_to_csv = "../data/example-1.csv"
# Load the CSV file into a DataFrame; the header row supplies the column names.
csv_df = pd.read_csv(path_to_csv)
# Display it as the cell output.
csv_df

Unnamed: 0,Branch,Date,Amount
0,Branch A,January 1,500.0
1,Branch B,January 2,250.0
2,Branch A,January 3,300.0


# What if the csv file doesn't have a header?

It converts the first record as the header of the DataFrame

In [7]:
# Point at a CSV file that has no header row (path_to_csv is reused by the next cell).
path_to_csv = "../data/example-2.csv"
# With default settings the first record is consumed as the column names.
no_header_csv_df = pd.read_csv(path_to_csv)
# Display the result to show the first data row misused as the header.
no_header_csv_df

Unnamed: 0,Branch A,January 1,500.00
0,Branch B,January 2,250.0
1,Branch A,January 3,300.0


To get around this pass the `header=None` property of the `read_csv()` method

In [8]:
# header=None tells read_csv() that every row is data; the columns are then
# labelled with integers (0, 1, 2, ...) instead of values from the first record.
no_header_csv_df = pd.read_csv(path_to_csv, header=None)
# Display the DataFrame, which now keeps all three records.
no_header_csv_df

Unnamed: 0,0,1,2
0,Branch A,January 1,500.0
1,Branch B,January 2,250.0
2,Branch A,January 3,300.0


In [10]:
# Save the data back to a CSV file.
# If you do not need the row index written out, pass index=False
# (to_csv documents `index` as a bool; None only worked because it is falsy).
no_header_csv_df.to_csv('test.csv', index=False)

In [15]:
# Relative path to the yeast data set, which has no header row.
path_to_csv = "../data/yeast.csv"
# header=None keeps every row as data and labels the columns with integers.
df_yeast = pd.read_csv(path_to_csv, header=None)
# Overwrite column 0 with "Waldo Weber" on every row whose column 9 is "MIT".
# The mask is a boolean Series, which is matched by label, so select with .loc;
# .iloc is meant for integer positions and plain boolean arrays.
df_yeast.loc[df_yeast[9] == "MIT", 0] = "Waldo Weber"
# Display the modified DataFrame.
df_yeast


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Waldo Weber,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,Waldo Weber,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,Waldo Weber,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,AAR2_YEAST,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,Waldo Weber,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT
...,...,...,...,...,...,...,...,...,...,...
1479,YUR1_YEAST,0.81,0.62,0.43,0.17,0.5,0.0,0.53,0.22,ME2
1480,ZIP1_YEAST,0.47,0.43,0.61,0.40,0.5,0.0,0.48,0.47,NUC
1481,ZNRP_YEAST,0.67,0.57,0.36,0.19,0.5,0.0,0.56,0.22,ME2
1482,ZUO1_YEAST,0.43,0.40,0.60,0.16,0.5,0.0,0.53,0.39,NUC


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_yeast.describe()

Unnamed: 0,1,2,3,4,5,6,7,8
count,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0,1484.0
mean,0.500121,0.499933,0.500034,0.261186,0.504717,0.0075,0.499885,0.276199
std,0.137299,0.123924,0.08667,0.137098,0.048351,0.075683,0.057797,0.106491
min,0.11,0.13,0.21,0.0,0.5,0.0,0.0,0.0
25%,0.41,0.42,0.46,0.17,0.5,0.0,0.48,0.22
50%,0.49,0.49,0.51,0.22,0.5,0.0,0.51,0.22
75%,0.58,0.57,0.55,0.32,0.5,0.0,0.53,0.3
max,1.0,1.0,1.0,1.0,1.0,0.83,0.73,1.0


In [21]:
# Pull out column 4 and report its range and spread on a single line.
column = df_yeast[4]
print(f'Min: {column.min()} Max: {column.max()} Std: {column.std()}')

Min: 0.0 Max: 1.0 Std: 0.13709763089421498


## Data Correlation
- Columns that are redundant
- Change of value in one column yield an almost constant change in another
- Pair have either high (close to 1) or low (close to -1) correlation scores


In [27]:

# Pairwise correlation between columns; numeric_only=True restricts the
# computation to the numeric columns, leaving out the text columns.
df_yeast.corr(numeric_only=True)

Unnamed: 0,1,2,3,4,5,6,7,8
1,1.0,0.581631,-0.163951,0.158175,0.064922,0.005597,0.075043,-0.12454
2,0.581631,1.0,-0.2718,0.140314,0.060823,0.000392,0.088759,-0.102984
3,-0.163951,-0.2718,1.0,0.059668,-0.008083,0.009378,-0.185805,-0.022043
4,0.158175,0.140314,0.059668,1.0,-0.005931,-0.00904,-0.103591,-0.054797
5,0.064922,0.060823,-0.008083,-0.005931,1.0,-0.009674,0.043627,0.002829
6,0.005597,0.000392,0.009378,-0.00904,-0.009674,1.0,0.0209,-0.035659
7,0.075043,0.088759,-0.185805,-0.103591,0.043627,0.0209,1.0,0.08969
8,-0.12454,-0.102984,-0.022043,-0.054797,0.002829,-0.035659,0.08969,1.0
