# Tobacco Consumption Data Exploration

By Edson Castañeda

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [None]:
import warnings
# Ignore every warning of category FutureWarning from this point on.
warnings.simplefilter(action="ignore", category=FutureWarning)

## Read Data

In [None]:
# Relative path to the tobacco consumption CSV file.
DATA_PATH = "../../data/Tobacco_Consumption.csv"

# Load the whole file into a DataFrame and preview ten randomly chosen rows.
tobacco_data = pd.read_csv(filepath_or_buffer=DATA_PATH)
tobacco_data.sample(n=10)

In [None]:
# Dimensions of the loaded dataset as a (rows, columns) tuple.
tobacco_data.shape

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
tobacco_data.info()

In [None]:
# Keep only the columns that are neither integer nor float typed; these are
# the columns treated as categorical in the rest of the notebook.
tobacco_categorical_data = tobacco_data.select_dtypes(exclude=["int", "float"])

n_categorical_columns = tobacco_categorical_data.shape[1]
print(f"Total Categorical Columns: {n_categorical_columns}")

# Preview the first ten rows of the categorical subset.
tobacco_categorical_data.head(n=10)

In [None]:
# Draw one count plot per categorical column.
#
# The grid is sized from the number of columns (three panels per row) instead
# of assuming a fixed 2x3 layout, so an extra categorical column cannot index
# past the axes array. For up to six columns the layout is the same 2x3 grid,
# filled column by column, as before.
plots_per_row = 3
categorical_columns = list(tobacco_categorical_data.columns)
n_rows = max(1, math.ceil(len(categorical_columns) / plots_per_row))

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(
    n_rows, plots_per_row, figsize=(20, 7.5 * n_rows), squeeze=False
)
fig.subplots_adjust(hspace=.5)

for i, col in enumerate(categorical_columns):
    # Column-major placement: go down a grid column, then move right.
    ax = axes[i % n_rows, i // n_rows]
    # Pass the column through x=/data= keywords: newer seaborn releases treat
    # the first positional argument as `data`, not as the `x` variable.
    sns.countplot(x=col, data=tobacco_categorical_data, ax=ax)
    # Rotate the category labels so long names do not overlap.
    ax.tick_params(axis="x", labelrotation=45)

# Hide the panels left over when the columns do not fill the whole grid.
for i in range(len(categorical_columns), n_rows * plots_per_row):
    axes[i % n_rows, i // n_rows].set_visible(False)


The *LocationDesc* and *LocationAbbrev* columns each contain only one unique value, so both columns are constant.

### Categorical data combinations

In [None]:
# Show the distinct rows of the categorical subset, i.e. every unique
# combination of category values (the original frame is not modified).
tobacco_categorical_data.drop_duplicates()

In [None]:
# Compare the total number of rows with the number of distinct category
# combinations; the ratio is the average number of rows per combination.
unique_category_rows = tobacco_categorical_data.drop_duplicates()

total_data = tobacco_categorical_data.shape[0]
different_combinations = unique_category_rows.shape[0]

print(f"Total combinations of categories (rows): {total_data}")
print(f"Find {different_combinations} unique category combinations")
print(f"Relation: {total_data / different_combinations}")

In [None]:
# Number of distinct values in the Year column.
len(tobacco_data['Year'].unique())

The categories in the dataframe are *repeated by year*. Each year has the same 13 combinations of categories.