In [36]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

class dslr:
    """Summary statistics and per-house plots for a student-score CSV.

    The CSV is loaded once into ``self.df``. The plotting methods require a
    'Hogwarts House' column; every numeric column other than 'Index' is
    treated as a feature.
    """

    # Name of the categorical column used to split and colour the plots.
    HOUSE_COLUMN = 'Hogwarts House'

    # Plot colour per house, shared by every plotting method. Houses missing
    # from this map fall back to matplotlib's default colour.
    HOUSE_COLORS = {'Gryffindor': 'red', 'Slytherin': 'green', 'Ravenclaw': 'blue', 'Hufflepuff': 'yellow'}

    def __init__(self, path) -> None:
        """Read the CSV located at ``path`` into ``self.df``."""
        self.df = pd.read_csv(path)

    def _numeric_features(self, exclude=('Index',)) -> pd.DataFrame:
        """Return the numeric columns of ``self.df`` minus those in ``exclude``.

        Names in ``exclude`` that are not present are ignored, so a dataset
        without an 'Index' column does not raise.
        """
        numeric = self.df.select_dtypes(include='number')
        return numeric.drop(columns=list(exclude), errors='ignore')

    def _houses(self) -> list:
        """Return the distinct non-null house labels in order of appearance.

        Missing labels are skipped because NaN never compares equal to a row
        value and has no entry in the colour map.
        """
        return list(self.df[self.HOUSE_COLUMN].dropna().unique())

    @staticmethod
    def _quantile(sorted_vals, q):
        """Return the ``q`` quantile of an ascending, non-empty sequence.

        The 1-based rank is ``(n + 1) * q`` with linear interpolation between
        the two neighbouring order statistics. The rank is clamped to
        ``[1, n]``: without the clamp, columns with one or two values would
        index past the end (IndexError) or wrap around to the last element
        through a negative index.
        """
        n = len(sorted_vals)
        position = min(max((n + 1) * q, 1.0), float(n))
        lower = int(np.floor(position))
        fraction = position - lower
        if fraction == 0:
            return sorted_vals[lower - 1]
        step = (sorted_vals[lower] - sorted_vals[lower - 1]) * fraction
        return sorted_vals[lower - 1] + step

    def describe(self) -> pd.DataFrame:
        """Compute count, mean, std, min, quartiles and max per numeric column.

        'Index' and columns that are entirely NaN are skipped; NaN values are
        ignored within each remaining column.

        Returns:
            A DataFrame with one column per feature and the rows "Count",
            "Mean", "Std", "Min", "25%", "50%", "75%" and "Max".

        Note:
            "Std" is the population standard deviation (divides by the count,
            not count - 1), unlike pandas' own ``describe()``, which uses the
            sample form.
        """
        features = self._numeric_features().dropna(axis=1, how='all')
        results = {}

        for column_name, column_data in features.items():
            sorted_values = np.sort(column_data.dropna().values)
            count = len(sorted_values)
            mean = np.sum(sorted_values) / count
            std = np.sqrt(np.sum((sorted_values - mean) ** 2) / count)

            results[column_name] = {
                "Count": count,
                "Mean": mean,
                "Std": std,
                "Min": sorted_values[0],
                "25%": self._quantile(sorted_values, 0.25),
                "50%": self._quantile(sorted_values, 0.5),  # the median
                "75%": self._quantile(sorted_values, 0.75),
                "Max": sorted_values[-1],
            }

        return pd.DataFrame(results)

    def histogram(self):
        """Show one histogram per course, with the houses overlaid.

        Every column except the identity columns is treated as a course.
        Missing scores are dropped per course, so a student lacking one
        course still contributes to the histograms of the others.
        """
        courses = self.df.drop(
            columns=['Index', 'Hogwarts House', 'First Name', 'Last Name', 'Birthday', 'Best Hand'],
            errors='ignore',
        )
        houses = self._houses()
        house_column = self.df[self.HOUSE_COLUMN]

        # Size the grid from the number of courses instead of assuming 5 x 3.
        n_cols = 3
        n_rows = max(1, -(-len(courses.columns) // n_cols))  # ceiling division

        # 16 inches for five rows; taller grids grow proportionally.
        plt.figure(figsize=(20, 16 * n_rows / 5))

        for position, course in enumerate(courses.columns, 1):
            plt.subplot(n_rows, n_cols, position)
            for house in houses:
                house_scores = courses.loc[house_column == house, course].dropna()
                plt.hist(house_scores, alpha=0.5, bins=15, label=house, color=self.HOUSE_COLORS.get(house))
            plt.title(course)
            plt.legend()
        plt.tight_layout()
        plt.show()

    def scatter_plot(self):
        """Show a scatter plot for every pair of numeric features.

        Points are coloured by house and each title carries the correlation
        of the pair as computed by ``DataFrame.corr()``.

        Raises:
            ValueError: if fewer than two numeric features are available.
        """
        numeric_df = self._numeric_features(exclude=('Index', self.HOUSE_COLUMN))
        features = list(numeric_df.columns)

        # Every unordered pair of distinct features, in column order.
        feature_pairs = [
            (first, second)
            for k, first in enumerate(features)
            for second in features[k + 1:]
        ]
        if not feature_pairs:
            raise ValueError("scatter_plot needs at least two numeric features")

        # Filter rows once per house rather than once per (pair, house).
        house_column = self.df[self.HOUSE_COLUMN]
        rows_by_house = {house: numeric_df[house_column == house] for house in self._houses()}

        num_plots = len(feature_pairs)
        num_columns = 3
        num_rows = -(-num_plots // num_columns)  # ceiling division

        # squeeze=False keeps a 2-D array of axes whatever the grid shape.
        fig, axs = plt.subplots(num_rows, num_columns, figsize=(15, num_rows * 5), squeeze=False)
        axs = axs.flatten()

        for ax, (feature1, feature2) in zip(axs, feature_pairs):
            for house, house_data in rows_by_house.items():
                ax.scatter(house_data[feature1], house_data[feature2], alpha=0.5, label=house,
                           color=self.HOUSE_COLORS.get(house))
            correlation = numeric_df[[feature1, feature2]].corr().iloc[0, 1]
            ax.set_title(f'{feature1} vs {feature2}\nCorrelation: {correlation:.2f}')
            ax.set_xlabel(feature1)
            ax.set_ylabel(feature2)
            ax.legend()

        # Hide the grid cells left over after the last pair.
        for unused_ax in axs[num_plots:]:
            unused_ax.axis('off')

        plt.tight_layout()
        plt.show()

    def pair_plot(self):
        """Show an n x n matrix of plots over the numeric features.

        Diagonal cells hold per-house histograms of one feature; the other
        cells hold per-house scatter plots (column feature on x, row feature
        on y). Tick labels are kept only on the bottom row and left column.

        Raises:
            ValueError: if there is no numeric feature to plot.
        """
        numeric_df = self._numeric_features(exclude=('Index', self.HOUSE_COLUMN))
        features = list(numeric_df.columns)
        n = len(features)
        if n == 0:
            raise ValueError("pair_plot needs at least one numeric feature")

        # Filter rows once per house rather than once per cell.
        house_column = self.df[self.HOUSE_COLUMN]
        rows_by_house = {house: numeric_df[house_column == house] for house in self._houses()}

        # squeeze=False keeps axs two-dimensional even when n == 1.
        fig, axs = plt.subplots(n, n, figsize=(2 * n, 2 * n), squeeze=False)

        for i, row_feature in enumerate(features):
            for j, col_feature in enumerate(features):
                ax = axs[i, j]
                if i == j:  # Diagonal: histograms
                    for house, house_data in rows_by_house.items():
                        ax.hist(house_data[row_feature].dropna(), bins=15, alpha=0.5,
                                color=self.HOUSE_COLORS.get(house), label=house)
                    ax.set_title(row_feature)
                else:  # Off-diagonal: scatter plots
                    for house, house_data in rows_by_house.items():
                        ax.scatter(house_data[col_feature], house_data[row_feature], alpha=0.5, label=house,
                                   color=self.HOUSE_COLORS.get(house))
                    if i == n - 1:  # Only label x-axis for bottom plots
                        ax.set_xlabel(col_feature)
                    if j == 0:  # Only label y-axis for left plots
                        ax.set_ylabel(row_feature)

                # Legend for the first subplot only to avoid repetition
                if i == 0 and j == 0:
                    ax.legend()

                # Hide ticks and labels for inner plots to reduce clutter
                if i != n - 1:
                    ax.set_xticklabels([])
                    ax.set_xticks([])
                if j != 0:
                    ax.set_yticklabels([])
                    ax.set_yticks([])

        plt.tight_layout()
        plt.show()

# Load the training dataset and compute its summary table. describe() is the
# last expression of the cell, so its returned DataFrame is the cell's value.
test = dslr("datasets/dataset_train.csv")
test.describe()
# Uncomment one of the calls below to render the corresponding figure.
# test.histogram()
# test.scatter_plot()
# test.pair_plot()

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
Count,1566.0,1568.0,1567.0,1569.0,1561.0,1565.0,1565.0,1557.0,1566.0,1570.0,1560.0,1600.0,1600.0
Mean,49634.570243,39.797131,1.14102,-0.387863,3.15391,-224.589915,495.74797,2.963095,1030.096946,5.950373,-0.053427,-243.374409,21.958012
Std,16674.479577,520.13233,5.218016,5.211132,4.15397,486.189433,106.251202,4.424353,44.111025,3.146852,0.971146,8.780895,97.601087
Min,-24370.0,-966.740546,-10.295663,-10.162119,-8.727,-1086.496835,283.869609,-8.858993,906.62732,-4.697484,-3.313676,-261.04892,-181.47
25%,38505.25,-489.666607,-4.312118,-5.26121,3.094,-577.822502,397.435312,2.215301,1026.166262,3.644182,-0.672828,-250.66326,-41.93
50%,49013.5,260.289446,3.469012,-2.589342,4.624,-419.164294,463.918305,4.378176,1045.506996,5.874837,-0.044811,-244.867765,-2.515
75%,60850.5,525.530343,5.421046,4.90575,5.6675,256.886191,597.598097,5.830729,1058.455395,8.250169,0.592937,-232.541935,50.78
Max,104956.0,1016.21194,11.612895,9.667405,10.032,1092.388611,745.39622,11.889713,1098.958201,13.536762,3.056546,-225.42814,279.07
