# EDA

## Working on dmbi-2019-taboola-challenge data

In [None]:
import scipy
import numpy as np
import pandas as pd
import missingno as msno
import seaborn as sns
import pandas_profiling
import matplotlib.pyplot as plt
from pathlib import Path

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Let pandas display up to 40 columns before truncating the output.
pd.set_option('display.max_columns', 40)

In [None]:
# Relative path to the input CSV (resolved against the current working directory).
file = r"../data/raw/preprocess.csv"
# Load the CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(file)

In [None]:
# Dimensions of the loaded data as (rows, columns).
df.shape

In [None]:
# Preview the first two rows.
df.head(2)

In [None]:
# List every column name in the dataset.
df.columns

In [None]:
# Columns treated as categorical features in the association analysis.
cols_categorical = ['campaign_language', 'quality_level', 'ad_type',
                    'source_item_type', 'browser_platform', 'os_family']
# Numeric feature columns, derived from the dtypes instead of being left
# empty: an empty list makes the correlation heatmap and the variance
# summary further down vacuous. The label and the categorical columns are
# excluded so that `['is_click'] + cols_numeric` has no duplicate names.
# NOTE(review): this also picks up numeric ID-like columns -- prune if needed.
_excluded = set(cols_categorical) | {'is_click'}
cols_numeric = [name for name in df.select_dtypes(include='number').columns
                if name not in _excluded]

In [None]:
# Count how often each value of the `is_click` label occurs (class balance).
df['is_click'].value_counts()

In [None]:
# Data type pandas inferred for each column.
df.dtypes

In [None]:
# Per-column non-null counts and dtypes, plus overall memory usage.
df.info()

In [None]:
# Summary statistics using pandas' default column selection.
df.describe()

In [None]:
# Cast every column to object so describe() summarises all of them as
# categorical data, then transpose to get one row per column.
df.astype("object").describe().T

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print the number of distinct (non-null) values found in each column.
for col in df.columns:
    distinct_count = df[col].nunique()
    print(f"{col}: {distinct_count}")

In [None]:
# Flag which columns contain at least one missing value.
df.isnull().any()

In [None]:
def cramers_v(confusion_matrix):
    """Return the bias-corrected Cramér's V for a contingency table.

    Measures categorical-categorical association, using the bias
    correction from Wicher Bergsma, "A bias-correction for Cramér's V and
    Tschuprow's T", Journal of the Korean Statistical Society 42 (2013):
    323-328.

    Parameters
    ----------
    confusion_matrix : array-like of shape (r, k)
        Observed counts, e.g. ``pd.crosstab(a, b).values``.

    Returns
    -------
    float
        The corrected statistic; 0.0 means no association. Degenerate
        tables (fewer than two rows or columns, or too few observations
        for the correction to be defined) yield 0.0 instead of NaN.

    Notes
    -----
    Errors raised by ``scipy.stats.chi2_contingency`` (for instance on a
    table whose expected frequencies contain zeros) are propagated.
    """
    # Imported locally so the function does not depend on ``import scipy``
    # implicitly exposing the ``scipy.stats`` submodule.
    from scipy.stats import chi2_contingency

    table = np.asarray(confusion_matrix)
    r, k = table.shape
    # A single row or column carries no association information.
    if min(r, k) < 2:
        return 0.0

    chi2 = chi2_contingency(table)[0]
    n = table.sum()
    phi2 = chi2 / n
    # Bias-corrected phi^2, clipped at zero, and corrected table dimensions.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denom = min(kcorr - 1, rcorr - 1)
    # With very few observations the corrected dimension collapses to <= 1;
    # report "no association" rather than dividing by zero.
    if denom <= 0:
        return 0.0
    return np.sqrt(phi2corr / denom)

# Cramér's V between the click label and each categorical feature.
results = {}
for col in cols_categorical:
    contingency = pd.crosstab(df["is_click"], df[col]).values
    results[col] = cramers_v(contingency)

# Report the features from the weakest to the strongest association.
for col in sorted(results, key=results.get):
    result = results[col]
    print(f'correlation between the label and {col} is: {result}')

### Heatmap for all attributes in the data provided. Color encodes the correlation between variables; with seaborn's default colormap, lighter cells mean higher correlation.

In [None]:
# Pairwise correlation between the label and the numeric columns,
# drawn as a heatmap whose color scale is capped at 0.9.
heatmap_columns = ['is_click'] + cols_numeric
corrmat = df[heatmap_columns].corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, ax=ax)

In [None]:
# Box plot of the `quality_level` column.
# The series is passed by keyword because seaborn releases disagree on the
# meaning of the first positional argument (`x` in older versions, `data`
# in newer ones); `x=` gives the same plot on all of them.
sns.boxplot(x=df['quality_level'])

In [None]:
# Scatter plot of the `empiric_clicks` column against `empiric_recs`.
fig, ax = plt.subplots(figsize=(16, 8))
ax.scatter(df['empiric_recs'], df['empiric_clicks'])
# Label the axes with the plotted column names instead of placeholders.
ax.set_xlabel('empiric_recs')
ax.set_ylabel('empiric_clicks')
plt.show()

## Detect Outliers

In [None]:
# Standard deviation of each numeric column. `numeric_only=True` is explicit
# because recent pandas versions no longer drop non-numeric columns silently
# and raise on them instead.
df.std(numeric_only=True)

In [None]:
# Interquartile range (Q3 - Q1) of each numeric column.
# `numeric_only=True` is explicit so non-numeric columns are skipped on
# every pandas version instead of raising on recent ones.
q1 = df.quantile(0.25, numeric_only=True)
q3 = df.quantile(0.75, numeric_only=True)
iqr = q3 - q1
iqr

In [None]:
# Show the rows that have no IQR outlier (Tukey's 1.5 * IQR fences).
# The comparison is restricted to the columns that have quantiles, so the
# frame and the fence Series are identically labelled; comparing the full
# frame against a shorter Series is rejected by recent pandas versions.
# The filtered frame is only displayed -- `df` itself is left unchanged.
# NOTE(review): for a column whose IQR is 0 (e.g. a heavily imbalanced 0/1
# column) every minority value is flagged -- confirm this is intended.
fence_columns = iqr.index
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
is_outlier = (df[fence_columns] < lower_fence) | (df[fence_columns] > upper_fence)
df[~is_outlier.any(axis=1)]

In [None]:
# Variance of each column listed in `cols_numeric`
# (the result is empty while that list is empty).
df[cols_numeric].var()

In [None]:
# Multi-column frequency count: number of rows for each combination of
# values across the columns in `cols_categorical`.
count = df.groupby(cols_categorical).size()
print(count)

In [None]:
# Distribution plot of the `hour` column with a normal distribution fitted
# through `fit=`.
# NOTE(review): `distplot` is deprecated in newer seaborn releases -- confirm
# the installed version still provides it.
sns.distplot(df['hour'], fit=scipy.stats.norm)

In [None]:
# Distribution plot of log(content_category + 1) with a fitted normal curve;
# the +1 keeps a value of 0 from producing log(0).
# NOTE(review): the column name suggests a category code -- confirm that a
# log transform and a normal fit are meaningful for it.
sns.distplot(np.log(df['content_category'] + 1), fit=scipy.stats.norm)

In [None]:
# For each `is_click` value, count the occurrences of every `user_prb`
# value and draw those counts as a red line on a 10x8 figure.
df.groupby('is_click')['user_prb'].value_counts().plot(kind='line', figsize=(10, 8), color='r')

In [None]:
# For every column, print the five most and the five least frequent values
# as relative frequencies. The frequency table is computed once per column
# and reused for both the head and the tail.
for col in df.columns:
    frequencies = df[col].value_counts(normalize=True)
    print(f"'{col}' head:\n{frequencies.head()}\n\n'{col}' tail:\n{frequencies.tail()}\n\n")

In [None]:
# Run value_counts on every column and combine the per-column results
# into a single frame.
df.apply(pd.Series.value_counts)

## pandas profiling

In [None]:
# Build a profiling report for the dataframe and write it to
# report_file.html (relative to the current working directory).
# NOTE(review): `profile_report` is presumably registered on DataFrame by the
# `pandas_profiling` import -- confirm against the installed version.
profile = df.profile_report()
profile.to_file(output_file='report_file.html')

In [None]:
# Ask the report for the variables it rejects at a threshold of 0.9 and
# display them.
# NOTE(review): presumably a correlation threshold -- confirm against the
# pandas-profiling version in use; this method may not exist in later releases.
rejected_variables = profile.get_rejected_variables(threshold=0.9)
rejected_variables

In [None]:
# Display the profiling report in the notebook with the `full_width` style
# option enabled.
df.profile_report(style={'full_width': True})

## Conclusion