## Importing Packages

In [59]:
import pandas as pd
import seaborn as sns
import urllib.request, urllib.parse
from urllib.error import HTTPError, URLError
import json

  import pandas.util.testing as tm


## Loading the Tab-Separated Data File

In [2]:
# NOTE(review): absolute, machine-specific path; this cell only runs where the file exists at this location.
okazaki_tab_path = r"C:\Users\datre\OneDrive\Documents\Graduate School\Summer '20\Okazaki_2013.tab"
# Tab-delimited file; the first 35 lines are skipped (presumably a metadata header; confirm against the file).
coral = pd.read_csv(okazaki_tab_path, sep="\t", skiprows=35)

In [3]:
# Summarise the raw frame: column names, non-null counts and dtypes.
coral.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 21 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   Treat                                                             142 non-null    object 
 1   Date                                                              142 non-null    object 
 2   Species                                                           142 non-null    object 
 3   ID (coral)                                                        142 non-null    object 
 4   Omega Arg (Calculated using CO2SYS)                               142 non-null    float64
 5   Calc rate CaCO3 [mmol/m**2/h] (Alkalinity anomaly technique ...)  140 non-null    float64
 6   SA [cm**2]                                                        139 non-null    float64
 7   Temp [°C]                          

In [4]:
# Display the raw frame; the rendering is truncated to the leading and trailing rows and columns.
coral

Unnamed: 0,Treat,Date,Species,ID (coral),Omega Arg (Calculated using CO2SYS),Calc rate CaCO3 [mmol/m**2/h] (Alkalinity anomaly technique ...),SA [cm**2],Temp [°C],Sal,PN [mmol/m**2/h],...,"pH (total scale, Potentiometric)",CSC flag (Calculated using seacarb afte...),CO2 [µmol/kg] (Calculated using seacarb afte...),pCO2water_SST_wet [µatm] (Calculated using seacarb afte...),fCO2water_SST_wet [µatm] (Calculated using seacarb afte...),[HCO3]- [µmol/kg] (Calculated using seacarb afte...),[CO3]2- [µmol/kg] (Calculated using seacarb afte...),DIC [µmol/kg] (Calculated using seacarb afte...),Omega Arg (Calculated using seacarb afte...),Omega Cal (Calculated using seacarb afte...)
0,ambient,2007-09-12,Siderastrea radians (coral),A1,3.94,5.51,73.2,29.80,33.520,14.575,...,8.072,8,8.97,353,352,1625,239,1873,3.92,5.87
1,ambient,2007-09-12,Siderastrea radians (coral),A10,4.12,5.55,45.9,30.32,33.390,15.710,...,8.089,8,8.48,337,336,1609,250,1867,4.11,6.16
2,ambient,2007-09-12,Siderastrea radians (coral),A13,4.12,6.50,30.3,30.32,33.390,25.255,...,8.089,8,8.48,337,336,1609,250,1867,4.11,6.16
3,ambient,2007-09-12,Siderastrea radians (coral),A5,3.94,4.51,61.4,29.85,33.520,15.620,...,8.072,8,8.96,353,352,1625,239,1872,3.93,5.88
4,ambient,2007-09-12,Solenastrea hyades (coral),S2,3.94,,,29.80,33.520,,...,8.072,8,8.97,353,352,1625,239,1873,3.92,5.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,high CO2,2009-03-31,Siderastrea radians (coral),A23,3.32,3.89,42.6,26.41,37.343,10.770,...,7.951,8,14.10,522,520,1872,201,2086,3.15,4.74
138,high CO2,2009-03-31,Siderastrea radians (coral),A24,3.01,5.31,33.3,27.35,37.334,18.920,...,7.908,8,15.68,594,592,1919,193,2128,3.04,4.56
139,high CO2,2009-03-31,Siderastrea radians (coral),A25,2.76,7.84,20.5,27.35,37.334,32.760,...,7.863,8,17.68,670,668,1950,177,2145,2.78,4.18
140,high CO2,2009-03-31,Solenastrea hyades (coral),S21,2.88,5.71,176.5,26.41,37.343,12.920,...,7.886,8,16.49,611,609,1886,174,2076,2.73,4.11


For ease of understanding, a Fahrenheit temperature column will be added. Additionally, column labels will be shortened for coding efficiency. At present, the Treat column is categorical and stored as an object; for analysis purposes, it will be converted to a binary category column.

## Changing Variables

In [35]:
# Convert the Celsius readings to Fahrenheit: F = C * 9/5 + 32.
celsius = coral["Temp [°C]"]
temp_f = celsius * 9 / 5 + 32
temp_f

0      85.640
1      86.576
2      86.576
3      85.730
4      85.640
        ...  
137    79.538
138    81.230
139    81.230
140    79.538
141    79.538
Name: Temp [°C], Length: 142, dtype: float64

In [26]:
# Show the first 77 entries of the treatment column to see where the label changes.
coral["Treat"].iloc[:77]

0      ambient
1      ambient
2      ambient
3      ambient
4      ambient
        ...   
72     ambient
73     ambient
74     ambient
75    high CO2
76    high CO2
Name: Treat, Length: 77, dtype: object

The treatment switches from ambient to high CO2 at observation 75 (zero-based row index).

In [30]:
# Replace the treatment labels with integer category codes, overwriting the column in `coral`.
# NOTE(review): the code assigned to each label follows the category order pandas infers
# (sorted labels); confirm which label maps to 0 and which to 1 before interpreting results.
treat_codes = coral["Treat"].astype("category").cat.codes
coral["Treat"] = treat_codes
coral.dtypes

Treat                                                                  int8
Date                                                                 object
Species                                                              object
ID (coral)                                                           object
Omega Arg (Calculated using CO2SYS)                                 float64
Calc rate CaCO3 [mmol/m**2/h] (Alkalinity anomaly technique ...)    float64
SA [cm**2]                                                          float64
Temp [°C]                                                           float64
Sal                                                                 float64
PN [mmol/m**2/h]                                                    float64
AT [µmol/kg] (Potentiometric titration)                             float64
pH (total scale, Potentiometric)                                    float64
CSC flag (Calculated using seacarb afte...)                           int64
CO2 [µmol/kg

## Cleaning DataFrame

In [34]:
# Short aliases for individual columns of the raw frame.

# Treatment, sampling date and species.
treat, date, species = coral["Treat"], coral["Date"], coral["Species"]

# Temperature, salinity and pH.
temp_c, sal = coral["Temp [°C]"], coral["Sal"]
pH = coral["pH (total scale, Potentiometric)"]

# Calcification rate.
calc_rate = coral["Calc rate CaCO3 [mmol/m**2/h] (Alkalinity anomaly technique ...)"]

# CO2, pCO2 and bicarbonate columns.
CO2 = coral["CO2 [µmol/kg] (Calculated using seacarb afte...)"]
pCO2 = coral["pCO2water_SST_wet [µatm] (Calculated using seacarb afte...)"]
HCO3 = coral["[HCO3]- [µmol/kg] (Calculated using seacarb afte...)"]

In [50]:
# Columns that are not carried over into the cleaned frame.
unused_columns = [
    "ID (coral)",
    "Omega Arg (Calculated using CO2SYS)",
    "SA [cm**2]",
    "PN [mmol/m**2/h]",
    "AT [µmol/kg] (Potentiometric titration)",
    "CSC flag (Calculated using seacarb afte...)",
    "fCO2water_SST_wet [µatm] (Calculated using seacarb afte...)",
    "[CO3]2- [µmol/kg] (Calculated using seacarb afte...)",
    "DIC [µmol/kg] (Calculated using seacarb afte...)",
    "Omega Arg (Calculated using seacarb afte...)",
    "Omega Cal (Calculated using seacarb afte...)",
]

# Short labels for the columns that are kept.
short_names = {
    "Treat": "treat",
    "Date": "date",
    "Species": "species",
    "Temp [°C]": "temp_c",
    "Sal": "sal",
    "Calc rate CaCO3 [mmol/m**2/h] (Alkalinity anomaly technique ...)": "calc_rate",
    "pH (total scale, Potentiometric)": "pH",
    "CO2 [µmol/kg] (Calculated using seacarb afte...)": "CO2",
    "pCO2water_SST_wet [µatm] (Calculated using seacarb afte...)": "pCO2",
    "[HCO3]- [µmol/kg] (Calculated using seacarb afte...)": "HCO3",
}

# Build the cleaned frame as a new object (`coral` itself is left untouched):
# drop the unused columns, shorten the remaining labels, then append the
# Fahrenheit series as the last column.
coral_clean = (
    coral
    .drop(columns=unused_columns)
    .rename(columns=short_names)
    .assign(temp_f=temp_f)
)
coral_clean.head()

Unnamed: 0,treat,date,species,calc_rate,temp_c,sal,pH,CO2,pCO2,HCO3,temp_f
0,0,2007-09-12,Siderastrea radians (coral),5.51,29.8,33.52,8.072,8.97,353,1625,85.64
1,0,2007-09-12,Siderastrea radians (coral),5.55,30.32,33.39,8.089,8.48,337,1609,86.576
2,0,2007-09-12,Siderastrea radians (coral),6.5,30.32,33.39,8.089,8.48,337,1609,86.576
3,0,2007-09-12,Siderastrea radians (coral),4.51,29.85,33.52,8.072,8.96,353,1625,85.73
4,0,2007-09-12,Solenastrea hyades (coral),,29.8,33.52,8.072,8.97,353,1625,85.64


Variables that are not used in the analysis were removed from this data set, and the Fahrenheit column was added.

## Finding Null Values

In [51]:
# Count the missing values in each column of the cleaned frame.
coral_clean.isna().sum()

treat        0
date         0
species      0
calc_rate    2
temp_c       0
sal          0
pH           0
CO2          0
pCO2         0
HCO3         0
temp_f       0
dtype: int64

In [56]:
# Remove every row that has a missing value in any column, then re-inspect the frame.
# The name `coral_clean` is rebound to the filtered result.
coral_clean = coral_clean.dropna(axis="index", how="any")
coral_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 0 to 141
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   treat      140 non-null    int8   
 1   date       140 non-null    object 
 2   species    140 non-null    object 
 3   calc_rate  140 non-null    float64
 4   temp_c     140 non-null    float64
 5   sal        140 non-null    float64
 6   pH         140 non-null    float64
 7   CO2        140 non-null    float64
 8   pCO2       140 non-null    int64  
 9   HCO3       140 non-null    int64  
 10  temp_f     140 non-null    float64
dtypes: float64(6), int64(2), int8(1), object(2)
memory usage: 12.2+ KB


In [57]:
# Re-count missing values per column to confirm none remain after the drop.
coral_clean.isna().sum()

treat        0
date         0
species      0
calc_rate    0
temp_c       0
sal          0
pH           0
CO2          0
pCO2         0
HCO3         0
temp_f       0
dtype: int64

There were two null values in the calcification rate column. Since there is no easy way to estimate these values at this time, those two observations were removed.

## Finding Duplicates

In [71]:
# Select the rows that exactly repeat an earlier row (all columns equal);
# an empty result means the frame has no duplicate observations.
is_repeat = coral_clean.duplicated(keep="first")
coral_clean.loc[is_repeat]

Unnamed: 0,treat,date,species,calc_rate,temp_c,sal,pH,CO2,pCO2,HCO3,temp_f


There are no duplicate observations in this data.