Published on January 22, 2023. By Marília Prata, mpwolke

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import plotly.graph_objs as go
import plotly.offline as py
import plotly.express as px

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _, file_names in os.walk('/kaggle/input'):
    # Join each bare file name with its containing folder to print the full path.
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![](https://www.frontiersin.org/files/Articles/675695/frym-10-675695-HTML-r1/image_m/figure-1.jpg)https://kids.frontiersin.org/articles/10.3389/frym.2022.675695

#Rhodolith Beds

"Rhodoliths (from Greek for red rocks) are colorful, unattached calcareous nodules, composed of crustose, benthic marine red algae that resemble coral. Rhodolith beds create biogenic habitat for diverse benthic communities. The rhodolithic growth habit has been attained by a number of unrelated coralline red algae, organisms that deposit calcium carbonate within their cell walls to form hard structures or nodules that resemble beds of coral."

"Rhodoliths do not attach themselves to the rocky seabed. Rather, they roll like tumbleweeds along the seafloor until they become too large in size to be mobilised by the prevailing wave and current regime. They may then become incorporated into a semi-continuous algal mat or form an algal build-up. While corals are animals that are both autotrophic (photosynthesize via their symbionts) or heterotrophic (feeding on plankton), rhodoliths produce energy solely through photosynthesis (i.e. they can only grow and survive in the photic zone of the ocean)."

"Scientists believe rhodoliths have been present in the world's oceans since at least the Eocene epoch, some 55 million years ago."

https://en.wikipedia.org/wiki/Rhodolith

In [None]:
# Read the occurrence records and let pandas show every column when a frame is displayed.
df = pd.read_csv(
    '/kaggle/input/rhodolith-beds-in-northern-new-zealand/Rhodolith_Beds_in_Northern_New_Zealand.csv',
    delimiter=',',
    encoding='utf-8',
)
pd.set_option('display.max_columns', None)

#By Georgii Vishnya
# Preview the first five rows with a caption; cells get a blue border and red text.
preview_css = {'border': '1.3px solid blue', 'color': 'red'}
(
    df.head(5)
    .style
    .set_caption("Sample of the Rhodolith Beds data")
    .set_properties(**preview_css)
)

#Missing Values

In [None]:

#By FZAbbasi https://www.kaggle.com/code/fzabbasi/stroke-disease-classification/notebook

# Count the missing entries per column and draw the counts as one line chart.
df_null = df.isnull().sum()
null_fig, null_ax = plt.subplots(figsize=(12, 8))
df_null.plot(ax=null_ax, color="lightsalmon")
null_ax.set_title("Missing values", fontsize=20)
plt.xticks(fontsize=10)
plt.show()

![](https://www.mdpi.com/jmse/jmse-08-00813/article_deploy/html/images/jmse-08-00813-g003.png)https://www.mdpi.com/2077-1312/8/10/813

#Rhodolith Beds Heterogeneity along the Apulian Continental Shelf (Mediterranean Sea)


Citation: Chimienti, G.; Rizzo, L.; Kaleb, S.; Falace, A.; Fraschetti, S.; Giosa, F.D.; Tursi, A.; Barbone, E.; Ungaro, N.; Mastrototaro, F. Rhodolith Beds Heterogeneity along the Apulian Continental Shelf (Mediterranean Sea). J. Mar. Sci. Eng. 2020, 8, 813. https://doi.org/10.3390/jmse8100813

"Rhodolith beds represent a key habitat worldwide, from tropical to polar ecosystems. Despite this habitat is considered a hotspot of biodiversity, providing a suite of ecosystem goods and services, still scarce quantitative information is available thus far about rhodolith beds occurrence and ecological role, especially in the Mediterranean Sea. This study reports the composition and patterns of distribution of rhodolith assemblages found in four study areas along ca. 860 km of coast in the Central Mediterranean Sea. These rhodolith beds were studied for the first time and significant differences at all spatial scales have been highlighted, documenting the high variability of this habitat. Rhodolith species composition, morphology and distribution have been discussed considering the potential role of environmental factors in driving these patterns. The need for improving their protection is discussed to complement present conservation and management initiatives, particularly in the frame of the EU Marine Strategy Framework Directive."

https://www.mdpi.com/2077-1312/8/10/813

#Taxonomy: Genus

In [None]:
# Horizontal bar chart of the 20 most frequent values in the 'genus' column.
genus_counts = df['genus'].value_counts().head(20)
ax = genus_counts.plot(kind='barh', figsize=(16, 8), color='orange')
ax.set_title('Rhodolith Beds Genus', size=18, color='green')
ax.set_xlabel('Count', size=10)
ax.set_ylabel('Genus', size=10);

#Taxonomy: Family

In [None]:
# Horizontal bar chart of the 20 most frequent values in the 'family' column.
family_counts = df['family'].value_counts().head(20)
ax = family_counts.plot(kind='barh', figsize=(16, 8), color='green')
ax.set_title('Rhodolith Beds Family', size=18, color='orange')
ax.set_xlabel('Count', size=10)
ax.set_ylabel('Family', size=10);

In [None]:
text_cols = ['phylum', 'class', 'order_', 'subgenus']

from wordcloud import WordCloud, STOPWORDS

# One shared generator: white background, every word drawn in black.
# NOTE: the stopword list only affects WordCloud.generate(); it is kept so that
# `wc` behaves as before if it is reused on free text.
wc = WordCloud(stopwords=set(STOPWORDS) | {'|'}, random_state=42, background_color='white',
               color_func=lambda *args, **kwargs: "black")
fig, axes = plt.subplots(2, 2, figsize=(20, 12))
axes = list(axes.ravel())  # flatten the 2x2 grid so each panel pairs with one column

for panel, column in zip(axes, text_cols):
    # Size each word by how often the value occurs in the column.
    # str(df[column]) would instead pass the truncated Series repr (row indices,
    # "...", "Name:", "dtype:") to the cloud, not the data.
    frequencies = df[column].dropna().astype(str).value_counts().to_dict()
    if frequencies:
        panel.imshow(wc.generate_from_frequencies(frequencies))
    else:
        # WordCloud raises ValueError when it has no words, so label the panel instead.
        panel.text(0.5, 0.5, 'no data', ha='center', va='center', fontsize=18)
    panel.set_title(column.upper(), fontsize=24)
    panel.axis('off')

#EDA with Classical Approach

In [None]:
#By Furkan Akdag https://www.kaggle.com/code/furkannakdagg/smoking-drinking-prediction-complete-eda-pycaret/notebook

# Visualization Libraries 📊
# ------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

# Machine Learning Models 🤖
# --------------------------------------------------------------------------------------------------
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score

# Customize to Remove Warnings and Better Observation 🔧
# --------------------------------------------------------
from termcolor import colored
import warnings
warnings.filterwarnings("ignore")
# Display tweaks: no column/row truncation, a 300-character wide console and
# floats rendered with three decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', 300),
    ('display.max_rows', None),
    ('display.float_format', lambda x: '%.3f' % x),
):
    pd.set_option(option_name, option_value)

In [None]:
#By Furkan Akdag https://www.kaggle.com/code/furkannakdagg/smoking-drinking-prediction-complete-eda-pycaret/notebook

def check_df(dataframe, head=5):
    """
    Print a quick structural overview of a dataframe.

    Sections, in order: shape, column dtypes, first rows, last rows,
    missing-value count per column, and quantiles of the numeric columns.

    Parameters
    ------
        dataframe: dataframe
                dataframe to summarise
        head: int, optional
                number of rows shown in the Head and Tail sections

    """
    # Jupyter/IPython make `display` available without an import; fall back to
    # `print` so the helper also works when run as a plain script.
    try:
        show = display
    except NameError:
        show = print

    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    show(dataframe.head(head))
    print("##################### Tail #####################")
    show(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Quantiles are undefined for text columns, so restrict to numeric ones
    # instead of skipping the section altogether.
    numeric = dataframe.select_dtypes(include="number")
    if numeric.shape[1]:
        show(numeric.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

# Run the overview on the Rhodolith Beds dataframe read from the CSV earlier.
check_df(df)

#Columns

In [None]:
#By Furkan Akdag https://www.kaggle.com/code/furkannakdagg/smoking-drinking-prediction-complete-eda-pycaret/notebook

def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
    It gives the names of categorical, numerical, and categorical but cardinal variables in the data set.
    Note: Categorical variables with numerical appearance are also included in categorical variables.
    Note: A column counts as text when its dtype is object or a pandas string dtype;
          every other dtype is handled on the numerical side.

    Parameters
    ------
        dataframe: dataframe
                dataframe
        cat_th: int, optional
                threshold value for variables that appear numeric but are categorical
        car_th: int, optional
                threshold value for categorical but cardinal variables

    Returns
    ------
        cat_cols: list
                Categorical variable list
        num_cols: list
                Numerical variable list
        cat_but_car: list
                Cardinal variable list

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))


    Notes
    ------
        cat_cols + num_cols + cat_but_car = total number of variables
        num_but_cat is inside cat_cols

    """
    def _is_text_dtype(dtype):
        # `dtype == "O"` alone misses pandas string dtypes (dtype="string" and the
        # default `str` dtype of pandas 3), which would be treated as numeric.
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    columns = list(dataframe.columns)
    # Compute each column's kind and distinct-value count once.
    is_text = {col: _is_text_dtype(dataframe[col].dtype) for col in columns}
    n_unique = {col: dataframe[col].nunique() for col in columns}

    # cat_cols, cat_but_car
    num_but_cat = [col for col in columns if not is_text[col] and n_unique[col] < cat_th]
    cat_but_car = [col for col in columns if is_text[col] and n_unique[col] > car_th]
    # Text columns first (in frame order), then the numeric-looking categoricals.
    cat_cols = [col for col in columns if is_text[col] and n_unique[col] <= car_th] + num_but_cat

    # num_cols
    num_cols = [col for col in columns if not is_text[col] and n_unique[col] >= cat_th]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car


cat_cols, num_cols, cat_but_car = grab_col_names(df)

# Build the three highlighted labels first, then print each group under its label.
numerical_label = colored('Numerical Columns:', 'blue', attrs=['reverse'])
categorical_label = colored('Categorical Columns:', 'magenta', attrs=['reverse'])
cardinal_label = colored('Cardinal Columns:', 'cyan', attrs=['reverse'])
print(
    f"\n{numerical_label} {num_cols}\n\n\n"
    f"{categorical_label} {cat_cols}\n\n\n"
    f"{cardinal_label}{cat_but_car}\n"
)

#Numerical Columns

In [None]:
#By Furkan Akdag https://www.kaggle.com/code/furkannakdagg/smoking-drinking-prediction-complete-eda-pycaret/notebook

def num_summary(dataframe, numerical_col, plot=False):
    """
    Print descriptive statistics for one column and optionally plot its histogram.

    Parameters
    ------
        dataframe: dataframe
                dataframe holding the column
        numerical_col: column label
                column to summarise
        plot: bool, optional
                when True, also draw a 50-bin histogram titled with the column label

    """
    column = dataframe[numerical_col]
    percentile_grid = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    summary = column.describe(percentile_grid).T
    print(summary)

    if plot:
        column.hist(bins=50)
        plt.title(numerical_col)
        plt.xlabel(numerical_col)
        plt.show(block=True)

    # Separator between consecutive column reports.
    print("#####################################")


# Print the statistics and draw the histogram for every numerical column.
for col in num_cols:
    num_summary(df, col, plot=True)

#Categorical Columns

In [None]:
#By Furkan Akdag https://www.kaggle.com/code/furkannakdagg/smoking-drinking-prediction-complete-eda-pycaret/notebook

def cat_summary(dataframe, col_name, plot=False):
    """
    Show how often each value of one column occurs, as a count and a percentage.

    The percentage is relative to the total number of rows in the dataframe.

    Parameters
    ------
        dataframe: dataframe
                dataframe holding the column
        col_name: column label
                column to summarise
        plot: bool, optional
                when True, also draw a seaborn count plot of the column

    """
    # Jupyter/IPython make `display` available without an import; fall back to
    # `print` so the helper also works when run as a plain script.
    try:
        show = display
    except NameError:
        show = print

    # Count once and reuse the result for both output columns.
    counts = dataframe[col_name].value_counts()
    show(pd.DataFrame({col_name: counts,
                       "Ratio": 100 * counts / len(dataframe)}))

    if plot:
        sns.countplot(x=dataframe[col_name], data=dataframe)
        plt.show()


# Frequency table for every categorical column, each followed by an empty line.
for col in cat_cols:
    cat_summary(df, col_name=col)
    print("")
    print("")

#Correlation Analysis

In [None]:
#By Furkan Akdag https://www.kaggle.com/code/furkannakdagg/smoking-drinking-prediction-complete-eda-pycaret/notebook

#Pick numerical variables to make the correlation matrix

interp = df[['id', 'occurrenceID', 'year', 'month', 'minimumDepthInMeters', 'maximumDepthInMeters', 'decimalLatitude', 'decimalLongitude', 'coordinateUncertaintyInMeters', 'FID']]


def corr_map(interp, width=14, height=6, annot_kws=15,
             title='\nCorrelation of the Interpolated data\n'):
    """
    Draw an annotated heatmap of the lower triangle of a correlation matrix.

    Parameters
    ------
        interp: dataframe
                columns to correlate; non-numeric columns are ignored
        width: int, optional
                figure width passed to matplotlib
        height: int, optional
                figure height passed to matplotlib
        annot_kws: int, optional
                font size of the numbers written inside the cells
        title: str, optional
                figure title; defaults to the title previously hard-coded here

    """
    # Keep numeric/boolean columns only: DataFrame.corr() raises on pandas >= 2.0
    # when a selected column holds text (for example an identifier).
    corr = interp.select_dtypes(include=["number", "bool"]).corr()
    # Hide the diagonal and the upper triangle, which mirror the lower one.
    hidden_cells = np.triu(corr)
    f, ax = plt.subplots(figsize=(width, height))
    sns.heatmap(corr,
                annot=True,
                fmt=".2f",
                ax=ax,
                vmin=-1,
                vmax=1,
                cmap="summer",
                mask=hidden_cells,
                linewidth=0.4,
                linecolor="black",
                cbar=False,
                annot_kws={"size": annot_kws})
    plt.yticks(rotation=0, size=15)
    plt.xticks(rotation=75, size=15)
    plt.title(title, size=20)
    plt.show()


# The default title belongs to the notebook this helper was borrowed from;
# label the figure after the dataset actually plotted here.
corr_map(interp, width=20, height=10, annot_kws=8,
         title='\nCorrelation of the Rhodolith Beds numeric columns\n')

In [None]:
#By Mike Delong https://www.kaggle.com/code/mikedelong/cats-on-maps

from plotly.express import scatter_geo
from plotly.colors import qualitative
# One stacked map panel per 'fieldNumber' value; points are coloured and
# hover-labelled by 'locality', and the view is fitted to the plotted locations.
geo_options = dict(
    data_frame=df,
    lat='decimalLatitude',
    lon='decimalLongitude',
    fitbounds='locations',
    color='locality',
    hover_name='locality',  # hover_data=['timestamp'],
    color_discrete_sequence=qualitative.Alphabet,
    facet_col='fieldNumber',
    height=1200,
    facet_col_wrap=1,
    title= 'Rhodolith Beds Location',
)
scatter_geo(**geo_options)

#Acknowledgements:

Georgii Vishnya

Furkan Akdag https://www.kaggle.com/code/furkannakdagg/smoking-drinking-prediction-complete-eda-pycaret/notebook

Mike Delong https://www.kaggle.com/code/mikedelong/cats-on-maps

FZAbbasi https://www.kaggle.com/code/fzabbasi/stroke-disease-classification/notebook

mpwolke https://www.kaggle.com/code/mpwolke/interpolated-imputated-data