# Tobacco Consumption Data Exploration

By Edson Castañeda

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [None]:
import warnings
# Ignore every FutureWarning raised from here on (process-wide filter).
# NOTE(review): presumably meant to keep library deprecation notices out of
# the cell outputs; it also hides notices about upcoming API changes.
warnings.simplefilter(action="ignore", category=FutureWarning)

## Read Data

In [None]:
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = "../../data/Tobacco_Consumption.csv"

# Load the whole file into a DataFrame with pandas' default parsing options.
tobacco_data = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Show ten randomly chosen rows as a first look at the data.
tobacco_data.sample(n=10)

## Exploration

In [None]:
# (number of rows, number of columns) of the loaded dataset.
tobacco_data.shape

In [None]:
# Print the per-column dtype and non-null count summary.
tobacco_data.info()

### Categorical Data

In [None]:
# Treat every column that is neither integer nor float as categorical.
tobacco_categorical_data = tobacco_data.select_dtypes(
    exclude=['int', 'float'],
)

# Report how many columns were kept, then preview the first ten rows.
print(f"Total Categorical Columns: {len(tobacco_categorical_data.columns)}")
tobacco_categorical_data.head(n=10)

In [None]:
# Draw one count plot per categorical column on a two-row grid.
# The grid keeps the original 2x3 layout but grows wider when there are more
# than six columns, so indexing never runs past the last subplot.
categorical_columns = list(tobacco_categorical_data.columns)
n_rows = 2
n_cols = max(3, math.ceil(len(categorical_columns) / n_rows))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 15))
fig.subplots_adjust(hspace=.5)

for i, col in enumerate(categorical_columns):
    # Fill the grid column by column: top cell first, then the bottom cell.
    grid_col, grid_row = divmod(i, n_rows)
    # Pass the values through the `x` keyword: recent seaborn releases bind
    # the first positional argument to `data` instead of the x variable.
    sns.countplot(x=tobacco_categorical_data[col], ax=axes[grid_row, grid_col])

# Rotate the x tick labels by 45 degrees on every subplot of the figure
# (unused subplots included). A separate loop variable keeps the axes grid
# from being overwritten.
for axis in fig.axes:
    plt.sca(axis)
    plt.xticks(rotation=45)


The *LocationDesc* and *LocationAbbrev* columns each contain only one unique value. Therefore, these columns are constant.

#### Categorical data combinations

In [None]:
# Display the distinct rows of the categorical columns (result is not stored).
tobacco_categorical_data.drop_duplicates()

In [None]:
# Compare the number of rows with the number of distinct category rows.
unique_category_rows = tobacco_categorical_data.drop_duplicates()
total_data = tobacco_categorical_data.shape[0]
different_combinations = unique_category_rows.shape[0]

# The ratio is the average number of rows per distinct combination.
print(f"Total combinations of categories (rows): {total_data}")
print(f"Find {different_combinations} unique category combinations")
print(f"Relation: {total_data/different_combinations}")

In [None]:
# Number of distinct values in the Year column (a missing value, if any,
# is counted once, as it would be by `unique()`).
tobacco_data['Year'].nunique(dropna=False)

The categories in the dataframe are *repeated by year*. Each year has the same 13 combinations of categories.

### Numerical Data

In [None]:
# Keep only the integer and float columns for the numerical analysis.
tobacco_numerical_data = tobacco_data.select_dtypes(
    include=['int', 'float'],
)

# Report how many columns were kept, then preview the first ten rows.
print(f"Total Numerical Columns: {len(tobacco_numerical_data.columns)}")
tobacco_numerical_data.head(n=10)

In [None]:
# Pairwise correlation matrix of the numerical columns (pandas defaults),
# rendered as a heatmap with the coefficient written in each cell.
correlations = tobacco_numerical_data.corr()
sns.heatmap(data=correlations, annot=True)
plt.show()

*Per capita values* are strongly correlated with their corresponding normal (non-per-capita) values.

In [None]:
# Recompute the per-capita figure as Total / Population, rounded to one
# decimal, and subtract the reported "Total Per Capita" column from it.
total_over_population = tobacco_numerical_data["Total"] / tobacco_numerical_data["Population"]
relation_per_capita = total_over_population.round(1) - tobacco_numerical_data["Total Per Capita"]

# Median of the differences, rounded to three decimals.
round(relation_per_capita.median(), 3)


Per_capita_columns = (Normal_column)/(Population)

For further analysis, per capita columns are excluded.

*Domestic* and *Imports* have a strong correlation to *Total* column.

In [None]:
# Subtract Domestic and then Imports from Total, row by row.
difference_total = (
    tobacco_numerical_data["Total"]
    .sub(tobacco_numerical_data["Domestic"])
    .sub(tobacco_numerical_data["Imports"])
)

# Median of the row-wise remainders.
difference_total.median()

Total = Imports + Domestic

*Year* and *Population* have a strong correlation with each other, but a low correlation to tobacco values.