---
# 🧭 Exploratory Data Analysis
## 🎯 Target

In [None]:
# Visualizations

def plot_target_eda(df, target, title = 'target distribution'):
    """
    Plot the distribution of a single target column.

    Float targets, and integer targets with more than 20 distinct values,
    are drawn as a histogram with a KDE overlay (at most 42 bins). Any
    other target is drawn as a count plot.

    Parameters
    ----------
    df : DataFrame that contains the target column.
    target : name of the column to plot.
    title : text placed above the plot.
    """
    values = df[target]
    # is_integer_dtype accepts every integer dtype, whereas `dtype == int`
    # only matched the platform default integer; this mirrors the general
    # integer check used when classifying features in plot_features_eda.
    is_continuous = pd.api.types.is_float_dtype(values) or (
        pd.api.types.is_integer_dtype(values) and values.nunique() > 20)
    if is_continuous:
        sns.histplot(values, bins = min(values.nunique(), 42), kde = True)
    else:
        sns.countplot(data=df, x=target)
    plt.title(title)
    plt.yticks([])  # counts are not labelled; only the shape is of interest
    plt.show()


def plot_features_eda(df, features, target, label, sample = 1000,
                      high_label = "Good", low_label = "Bad",
                      y_min = None, y_max = None):
    """
    Draw a three-panel EDA row for each feature (at most 20 features).

    A feature is treated as categorical when its dtype is object, bool or
    category, or when it is an integer column with fewer than 10 distinct
    non-null values. Categorical rows show: count plot, strip + point plot
    against the target, donut chart split by `label`. All other rows show:
    histogram, regression scatter against the target, box plot split by
    `label`. Dashed "tear lines" are drawn between consecutive rows.

    Parameters
    ----------
    df : DataFrame holding the features, the target and the label column.
    features : sequence of column names; only the first 20 are plotted.
    target : name of the target column used on the y axis of panel 2.
    label : name of the column used to split panel 3. The box plot also
        accepts None (single box); the donut requires a column.
    sample : approximate number of rows drawn in the relationship panels.
    high_label, low_label : captions written on panel 3 (box plot:
        top/bottom caption; donut: outer/inner caption).
    y_min, y_max : y limits of the relationship panels. When None they
        default to the target's min / max. An explicit 0 is honoured.

    Relies on the module-level names SEED and MY_PALETTE.
    """
    ### rotate (and, when very crowded, thin out) category tick labels
    def _label_crowded_xticks(ax, order):
        if len(order) > 20:
            # too many categories to read: keep every fifth label only
            tick_labels = [s if i % 5 == 0 else "" for i, s in enumerate(order)]
        elif len(order) > 8:
            tick_labels = order
        else:
            return
        ax.set_xticks(ax.get_xticks(), tick_labels, rotation=90)

    ### Histogram for distribution of numeric feature (num plot 0)
    def _plot_num_distribution(ax, feature):
        sns.histplot(df[feature], ax=ax, bins = 50)
        ax.set_title(f'{feature} distribution')
        ax.set_yticks([])
        ax.set_ylabel("Count")
        ax.set_xlabel("")

    ### Countplot for distribution of categorical feature (cat plot 0)
    def _plot_cat_distribution(ax, feature, order, color_map):
        # NOTE(review): recent seaborn releases warn when `palette` is passed
        # without `hue` -- confirm against the installed version.
        sns.countplot(data=df, x=feature, order=order, ax=ax,
                      palette=[color_map[val] for val in order])
        ax.set_title(f'{feature} distribution')
        ax.set_yticks([])
        ax.set_ylabel("Count")
        _label_crowded_xticks(ax, order)
        ax.set_xlabel("")

    ### scatterplot with trendline for numerical feature relationship to target (num plot 1)
    def _plot_num_relationship(ax, feature,  y_min=0, y_max=100):
        df_sampled = df.sample(n=min(sample, df.shape[0]), random_state=SEED)
        sns.regplot(data=df_sampled, x=feature, y=target, ax=ax,
                    scatter_kws={'alpha': 0.5, 's': 12},
                    line_kws={'color': 'xkcd:dusty rose', 'linestyle': "--", 'linewidth': 2})
        ax.set_title(f'{target} vs {feature}')
        ax.set_ylabel("")
        ax.set_ylim(y_min, y_max)
        ax.set_xlabel("")

    ### pseudo-scatterplot with trendline for categorical feature relationship to target (cat plot 1)
    def _plot_cat_relationship(ax, feature, order, color_map, y_min=0, y_max=100):
        # sample each category in proportion to its size, keeping at least one row
        frac = min(1.0, sample / len(df))
        sampled_dfs = [
            group.sample(n=max(1, int(frac * len(group))), random_state=SEED)
            for _, group in df.groupby(feature)
        ]
        df_sampled = pd.concat(sampled_dfs)
        sns.stripplot(data=df_sampled, x=feature, y=target, order=order, ax=ax, zorder = 1,
                      palette=[color_map[val] for val in order], alpha=0.5, jitter=True)
        # the point plot uses the full frame, not the sample
        sns.pointplot(data=df, x=feature, y=target, order=order, ax=ax, zorder = 2,
                      color=MY_PALETTE[-1], errorbar = None)

        # inter-quartile bars are drawn only when the target has more than 5 distinct values
        if len(df[target].unique()) > 5:
            for i, val in enumerate(order):
                subset = df[df[feature] == val][target].dropna()
                q25, q75 = subset.quantile([0.25, 0.75])
                ax.vlines(x=i, ymin=q25, ymax=q75, color=MY_PALETTE[-1], linewidth=2,  zorder = 3)

        ax.set_title(f'{target} vs {feature}')
        _label_crowded_xticks(ax, order)
        ax.set_ylabel("")
        ax.set_ylim(y_min, y_max)
        ax.set_xlabel("")

    ### boxplot shows outliers and limits by label  (num plot 2)
    def _plot_num_boxplot(ax, feature, label = None, top_label="", bottom_label=""):
        if label is None:
            sns.boxplot(x = df[feature], ax=ax)
            ax.set_title(f'{feature} outliers')
        else:
            sns.boxplot(x = df[feature], palette=MY_PALETTE , ax=ax, legend = False, gap = .1,
                        hue = df[label], hue_order = sorted(df[label].dropna().unique().tolist()))
            ax.set_title(f'{feature} by target cut')
            ax.set_xlabel("")
            ax.text(df[feature].min(), -0.45, top_label, ha='left', va='center', fontsize=8, color = 'black')
            ax.text(df[feature].min(), 0.45, bottom_label, ha='left', va='center', fontsize=8, color = 'black')
        ax.set_yticks([])

    ### donut shows variation in target by category  (cat plot 2)
    def _plot_cat_donut(ax, feature, label, order, color_map, inner_label="", outer_label=""):
        cats = sorted(df[label].dropna().unique().tolist())
        ring_width = 0.7 / len(cats)
        # one concentric ring per label value, outermost ring first
        for ring, cat in enumerate(cats):
            value_counts = df[df[label] == cat][feature].value_counts()
            sorted_counts = value_counts.reindex(order).dropna()
            if len(order) > 20:
                labels = [s if k % 5 == 0 else "" for k, s in enumerate(sorted_counts.index)]
            else:
                labels = sorted_counts.index
            slice_colors = [color_map[val] for val in sorted_counts.index]
            radius = 1 - ring_width * ring
            ax.pie(sorted_counts, radius=radius, colors=slice_colors,
                   wedgeprops=dict(width=ring_width, edgecolor='w'),
                   labels=labels if ring == 0 else None)
        # title and captions are drawn once rather than once per ring
        ax.set_title(f'{feature} by target cut')
        ax.text(0, 0, inner_label, ha='center', va='center', fontsize=8, color = 'xkcd:steel grey')
        ax.text(-1.3, -1.3, outer_label, ha='left', va='center', fontsize=8, color = 'xkcd:steel grey')

    ### build common cmap for categoricals
    def _set_color_map(order, clrs = 6, sats = 5):
        if len(order) <= len(MY_PALETTE):
            return dict(zip(order, MY_PALETTE[:len(order)]))
        elif len(order) <= clrs * sats:
            # extend the palette with progressively desaturated copies of its first `clrs` colours
            new_palette = [sns.desaturate(MY_PALETTE[j], 1-.2*i)
                           for j in range(clrs) for i in range(sats)]
            return dict(zip(order, new_palette[:len(order)]))
        else:
            cmap = mpl.colormaps['cividis'].resampled(len(order))
            new_palette = [cmap(i / len(order)) for i in range(len(order))]
            return dict(zip(order, new_palette))

    ### limit number of features plotted/size of plot
    f = len(features)
    if f == 0:
        # nothing to draw; a zero-row grid cannot be built
        print("No features to plot")
        return
    if f > 20:
        print("Plotting 20 features")
        f = 20
        features = features[:20]

    ### define limits of relationship plots
    # `is None` (not truthiness) so that an explicit limit of 0 is respected
    if y_min is None: y_min = df[target].min()
    if y_max is None: y_max = df[target].max()

    ### gridspec to build plot layout
    fig = plt.figure(figsize=(10, f * 3))
    gs = mpl.gridspec.GridSpec(f, 3, figure=fig, hspace=0.4)

    row_anchors = []
    for i, feature in enumerate(features):
        ### for each feature determine applicable plot selection
        is_cat = (df[feature].dtype == "O" or df[feature].dtype == bool or df[feature].dtype == "category" or
                  (np.issubdtype(df[feature].dtype, np.integer) and len(df[feature].dropna().unique()) < 10))
        ax0 = fig.add_subplot(gs[i, 0])
        row_anchors.append(ax0)
        if is_cat:
            order = sorted(df[feature].dropna().unique().tolist())
            color_map = _set_color_map(order)
            _plot_cat_distribution(ax0, feature, order, color_map)
            _plot_cat_relationship(fig.add_subplot(gs[i, 1]), feature, order, color_map, y_min=y_min, y_max=y_max)
            _plot_cat_donut(fig.add_subplot(gs[i, 2]), feature, label, order, color_map,
                           inner_label=low_label, outer_label=high_label)
        else:
            _plot_num_distribution(ax0, feature)
            _plot_num_relationship(fig.add_subplot(gs[i, 1]), feature, y_min=y_min, y_max=y_max)
            _plot_num_boxplot(fig.add_subplot(gs[i, 2]), feature, label,
                            top_label=high_label, bottom_label=low_label)

    ### add tear lines between features
    # each line sits midway between the bottom of one row and the top of the next
    for i in range(f - 1):
        bottom_y = row_anchors[i].get_position().y0
        top_y = row_anchors[i + 1].get_position().y1
        y_pos = (bottom_y + top_y) / 2
        line = mpl.lines.Line2D([0.05, 0.95], [y_pos, y_pos], transform=fig.transFigure,
                      color='black', linewidth=0.5, linestyle='--')
        fig.add_artist(line)

    plt.show()

def plot_pairplot(df, features, sample = 250, title = "", **kwargs):
    """
    Draw a seaborn pair plot comparing the given features with each other.

    At most `sample` rows are drawn (seeded with the module-level SEED).
    The diagonal shows KDEs and the lower triangle is overlaid with
    4-level KDE contours. Extra keyword arguments are forwarded to
    `sns.pairplot`. The title is right-aligned above the grid.
    """
    # separator line printed ahead of the figure
    print("=" * 69)

    subset = df[features]
    n_rows = min(sample, df.shape[0])
    subset = subset.sample(n = n_rows, random_state=SEED)

    grid = sns.pairplot(subset, diag_kind="kde", **kwargs)
    grid.map_lower(sns.kdeplot, levels=4, color="xkcd:slate")
    grid.figure.suptitle(title, x = 0.98, ha = 'right', y=1.01)
    plt.show()


In [None]:
# Summarize the target column over the rows where target_mask is True
summarize_data(XY.loc[XY["target_mask"].eq(True)], target)

In [None]:
# Distribution of the raw target over the rows where target_mask is True
plot_target_eda(
    XY.loc[XY["target_mask"].eq(True)],
    target,
    title=f'{target} distribution',
)

In [None]:
# Cast to integer useful for some target transformations
# NOTE(review): astype('int') truncates any fractional part rather than rounding -- confirm intended
target_int = f'{target}_int'
XY[target_int] = XY[target].astype('int')
targets.append(target_int)

# NOTE(review): get_transformed_target is defined elsewhere; from the way its
# result is unpacked it appears to return (frame, fitted transformer, targets)
# with the new target column name appended last -- confirm.

# Scale and transform target
XY, TargetTransformer_mms, targets = get_transformed_target(
    XY, target, targets,
    skl.preprocessing.MinMaxScaler((-1, 1)),
    "mms")
target_mms = targets[-1]

# Normal quantile cut emphasizes the clustering at extremes
XY, TargetTransformer_n, targets = get_transformed_target(
    XY, target, targets,
    skl.preprocessing.QuantileTransformer(
        n_quantiles=100,
        output_distribution="normal",
        subsample=100000),
    "qn")
target_n = targets[-1]

# Uniform quantile cut transforms to uniform distribution
XY, TargetTransformer_u, targets = get_transformed_target(
    XY, target, targets,
    skl.preprocessing.QuantileTransformer(
        n_quantiles=100,
        output_distribution="uniform",
        subsample=100000),
    "qu")
target_u = targets[-1]

# Binary classification targets
grade_a = 90
grade_f = 38
# vectorized comparisons replace the row-wise apply(lambda ...); result is still 0/1 int64
XY[f'{target}_a'] = (XY[target_int] > grade_a).astype('int64')
XY[f'{target}_f'] = (XY[target_int] < grade_f).astype('int64')
targets.extend([f'{target}_a', f'{target}_f'])

In [None]:
# Target distributions over the rows where target_mask is True
plot_target_eda(XY[XY.target_mask.eq(True)],
                target_u, title=f'{target_u} distribution')
plot_target_eda(XY[XY.target_mask.eq(True)],
                target_int, title=f'{target} distribution')

# The binary flag columns are addressed through `target`, matching how they
# were created (f'{target}_a' / f'{target}_f'), instead of a hard-coded name.
plot_target_eda(XY[XY.target_mask.eq(True) & XY[f'{target}_a'].eq(1)],
                target_int, title='High Score distribution')

plot_target_eda(XY[XY.target_mask.eq(True) & XY[f'{target}_f'].eq(1)],
                target_int, title='Low Score distribution')

#### 👀 Target observations and notes
- target is continuous over range 0 to 100
- target has a very slight right skew, mean score 62
- histogram grouping algorithm may generate apparent gaps/drops in data
    - algorithm artifact, data appears well shaped away from extremes
- n.b. significant peaks at high end (100) and low end (19);
    - approximately 2.45% score 100, vs 0.36% score 99
    - similarly 1.09% score 19 vs 0.14% score 20
---
## 🔬 Features

In [None]:
# Show feature stats
# Feature stats over all rows where target_mask is True
summarize_data(XY[XY.target_mask.eq(True)], features)

# Same stats restricted to rows whose integer target equals 100, then to rows
# flagged by the "_a" binary target. Columns are addressed through
# `target_int` / `target` rather than a hard-coded column name.
summarize_data(XY[XY.target_mask.eq(True) & XY[target_int].eq(100)],
                features)
summarize_data(XY[XY.target_mask.eq(True) & XY[f'{target}_a'].eq(1)],
                features)



In [None]:
# Clean boolean and ordinal features
# Clean boolean and ordinal features
# Assign the result back instead of calling replace(..., inplace=True) on the
# column accessor: the chained in-place form is deprecated in pandas and does
# not update XY once Copy-on-Write is enabled.
# NOTE(review): `feature` looks like a placeholder column name -- the feature
# notes in this notebook list internet_access (yes/no), sleep_quality
# (poor/average/good) and facility_rating (low/medium/high); confirm which
# column each mapping is meant for.
XY["feature"] = XY["feature"].replace({'yes': True, 'no': False})
XY["feature"] = XY["feature"].replace({'poor': 1, 'average': 3, 'good': 5})
XY["feature"] = XY["feature"].replace({'low': 1, 'medium': 3, 'high': 5})

# Clean and trim categorical strings
XY = clean_categoricals(XY, features, string_length = 4)

In [None]:
# Feature EDA over the rows where target_mask is True, split by 'cut_label'
plot_features_eda(
    XY.loc[XY["target_mask"].eq(True)],
    features,
    target,
    'cut_label',
    high_label="high score",
    low_label="low",
)

In [None]:
# Feature EDA restricted to rows flagged by the "_a" binary target; the flag
# column is addressed through `target` rather than a hard-coded column name.
plot_features_eda(XY[XY.target_mask.eq(True) & XY[f'{target}_a'].eq(1)],
                  features, target, 'cut_label',
                  high_label="high score", low_label="low")

In [None]:
#### 👀 Feature Observations and Notes
- 11 predictive features: 4 numeric, 1 boolean, and 6 categorical
- No missing data
- [x] DONE: clean and simplify categorical string values in support of display and encoding
- [x] DONE: map boolean to True/False
- [x] DONE: assign ordinal categories numeric mapping

<span style= "color:DarkSeaGreen; font-size:16px"><strong>
Numeric features:</strong></span>
- age: near uniform distribution, **no** significant predictive power
- study_hours: near uniform with **significant peaks at high/low extremes**, **strong positive** correlation to exam scores
- class_attendance: near uniform distribution with **peak at high attendance**, **positive** correlation to exam scores
- sleep_hours: near uniform distribution with periodic peaks (round numbers?), **weak positive** correlation to exam scores

<span style= "color:DarkSeaGreen; font-size:16px"><strong>
Boolean features: </strong></span>
- internet_access: 2 values [yes, no]-> 90/10 split with **very weak** predictive power

<span style= "color:DarkSeaGreen; font-size:16px"><strong>
Categorical (non-ordinal) features: </strong></span>
- gender: 3 values [female, male, other]-> uniform distribution with **no** significant independent predictive power 
- course: 7 values [b.sc, diploma, bca, b.com, ba, bba, b.tech] -> **weak** predictive power 
- study_method: 5 values [online videos, self-study, coaching, group study, mixed] -> uniform distribution with **weak** predictive power

<span style= "color:DarkSeaGreen; font-size:16px"><strong>
Categorical (ordinal) features: </strong></span>
- sleep_quality: 3 values [poor, average, good] -> uniform distribution with **strong** predictive power 
- facility_rating: 3 values [low, medium, high] -> uniform distribution with **strong** predictive power
- exam_difficulty: 3 values [easy, moderate, hard] -> symmetric distribution with **very weak** predictive power
---