In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import math

# https://www.kaggle.com/datasets/lainguyn123/student-performance-factors
# Load the Kaggle "Student Performance Factors" CSV into a DataFrame.
# A forward slash is used as the path separator because Python accepts it on
# Windows, Linux and macOS alike, whereas a literal backslash only resolves
# on Windows.
df = pd.read_csv('archive/StudentPerformanceFactors.csv')

# Print the column names, dtypes and non-null counts as a quick sanity check.
df.info()

In [None]:
def descriptive_stats(df):
    """Compute summary statistics for every numerical column of *df*.

    Missing values are dropped per column before any statistic is computed.
    A numerical column with no non-missing values yields a row of NaN rather
    than raising.

    Args:
        df: DataFrame to summarise; non-numerical columns are ignored.

    Returns:
        DataFrame indexed by the numerical column names, with one column per
        statistic: Mean, Median, Mode, Standard Deviation, Variance, Min,
        Max, Range and the 25th/50th/75th percentiles. Standard deviation
        and variance use the pandas defaults (sample statistics, ddof=1).
    """
    # Select numerical columns only
    numerical_data = df.select_dtypes(include=[np.number])

    # Maps column name -> {statistic name: value}
    stats_dict = {}

    for col in numerical_data.columns:
        col_data = numerical_data[col].dropna()  # Drop NaN values

        # mode() returns every tied value in sorted order; evaluate it once
        # and report the first one, or NaN when the column is empty.
        modes = col_data.mode()
        mode = modes.iloc[0] if not modes.empty else np.nan

        min_value = col_data.min()
        max_value = col_data.max()

        # Series.quantile interpolates linearly by default (the same rule as
        # np.percentile) and returns NaN for an empty column instead of
        # failing.
        q25, q50, q75 = col_data.quantile([0.25, 0.5, 0.75]).to_numpy()

        # Store all statistics for this column
        stats_dict[col] = {
            'Mean': col_data.mean(),
            'Median': col_data.median(),
            'Mode': mode,
            'Standard Deviation': col_data.std(),
            'Variance': col_data.var(),
            'Min': min_value,
            'Max': max_value,
            'Range': max_value - min_value,
            '25th Percentile': q25,
            '50th Percentile (Median)': q50,
            '75th Percentile': q75,
        }

    # Transpose so that each analysed column becomes one row of the summary
    stats_df = pd.DataFrame(stats_dict).T
    return stats_df


# Summarise the numerical columns, then print the table under a heading line.
stats_summary = descriptive_stats(df)
print("Basic Descriptive Statistics:", stats_summary, sep="\n")


In [None]:
# VISUALIZATION

def plot_histograms(df):
    """Show a 10-bin histogram for each numerical column of *df*.

    Args:
        df: DataFrame to plot; non-numerical columns are ignored.
    """
    # DataFrame.hist draws one subplot per column on a figure it creates.
    df.select_dtypes(include=[np.number]).hist(figsize=(10, 8), bins=10)

    # Title the figure as a whole rather than any single subplot.
    plt.suptitle('Histograms of Numerical Variables')
    plt.show()

def plot_boxplots(df):
    """Show a box plot for each numerical column of *df* in a 3-wide grid.

    Args:
        df: DataFrame to plot; non-numerical columns are ignored.
    """
    numerical_data = df.select_dtypes(include=[np.number])

    # Three plots per row; round up so a partially filled last row still fits.
    plots_per_row = 3
    num_rows = math.ceil(len(numerical_data.columns) / plots_per_row)

    # DataFrame.plot(subplots=True) creates its own figure, so no separate
    # plt.figure() call is made here: one would only leave a stray empty
    # figure behind when the plots are shown.
    numerical_data.plot(
        kind='box',
        subplots=True,
        layout=(num_rows, plots_per_row),
        figsize=(15, num_rows * 5),
        sharex=False,
        sharey=False,
    )

    # The figure created by .plot() is the current one, so the title lands on it.
    plt.suptitle('Box Plots of Numerical Variables')
    plt.show()

def plot_correlation_matrix(df):
    """Show an annotated heatmap of the correlations between numerical columns.

    Args:
        df: DataFrame to analyse; non-numerical columns are ignored.
    """
    # Pairwise correlation of the numerical columns (pandas default method).
    correlations = df.select_dtypes(include=[np.number]).corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlations, cmap='coolwarm', annot=True, linewidths=0.5)
    plt.title('Correlation Matrix Heatmap')
    plt.show()


def plot_exam_score_by_parental_education(df):
    """Show a violin plot of exam scores grouped by parental education level.

    Args:
        df: DataFrame providing the "Parental_Education_Level" and
            "Exam_Score" columns.
    """
    plt.figure(figsize=(14, 8))

    # With no explicit axes given, seaborn draws on the current axes and
    # returns it.
    violin_axes = sns.violinplot(
        data=df,
        x="Parental_Education_Level",
        y="Exam_Score",
        palette="Set3",
    )

    violin_axes.set_title("Distribution of Exam Scores by Parental Education Level")
    violin_axes.set_xlabel("Parental Education Level")
    violin_axes.set_ylabel("Exam Score")

    # Tilt the category labels before tightening the layout around them.
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


def plot_categorical_impact(df):
    """Show box plots of exam score split by each of eight categorical factors.

    The plots are laid out on a 4x2 grid, filled row by row in the order the
    factors are listed.

    Args:
        df: DataFrame providing "Exam_Score" and the categorical columns
            named below.
    """
    categorical_factors = [
        "Parental_Involvement",
        "Access_to_Resources",
        "Extracurricular_Activities",
        "Internet_Access",
        "School_Type",
        "Peer_Influence",
        "Learning_Disabilities",
        "Gender",
    ]

    fig, axes = plt.subplots(4, 2, figsize=(20, 25))
    fig.suptitle("Impact of Categorical Factors on Exam Score", fontsize=16)

    # axes.flat walks the grid row by row, pairing each factor with its cell.
    for panel, factor in zip(axes.flat, categorical_factors):
        sns.boxplot(data=df, x=factor, y="Exam_Score", ax=panel)

        readable_name = factor.replace("_", " ")
        panel.set_title(f"Exam Score Distribution by {readable_name}")

        # Re-apply the existing tick labels, rotated so they do not overlap.
        tick_labels = panel.get_xticklabels()
        panel.set_xticklabels(tick_labels, rotation=45, ha="right")

    plt.tight_layout()
    plt.show()


# Render every visualisation in turn, each from the same DataFrame.
for draw_plot in (
    plot_histograms,
    plot_boxplots,
    plot_correlation_matrix,
    plot_exam_score_by_parental_education,
    plot_categorical_impact,
):
    draw_plot(df)

# Introduction

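The dataset under analysis, StudentPerformanceFactors.csv, is the Kaggle "Student Performance Factors" dataset (https://www.kaggle.com/datasets/lainguyn123/student-performance-factors), loaded from a local `archive` folder, and contains various factors affecting student performance. This dataset includes multiple numerical attributes related to student metrics such as grades, study hours, and other performance indicators, alongside categorical attributes such as parental involvement, access to resources, and school type.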

## Key Statistics

The following key statistics were computed for each numerical attribute in the dataset:

- Mean: The average value of the attribute.
- Median: The middle value when the data is ordered.
- Mode: The most frequently occurring value (when several values tie, the smallest of them is reported).
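- Standard Deviation: A measure of the amount of variation or dispersion in the attribute, computed here as the sample standard deviation (the square root of the variance below).
- Variance: The sample variance, i.e. the sum of the squared differences from the mean divided by n − 1 (the pandas default) rather than by n.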
- Min: The smallest value in the attribute.
- Max: The largest value in the attribute.
- Range: The difference between the maximum and minimum values.
- 25th Percentile: The value below which 25% of the data falls.
- 50th Percentile (Median): The value below which 50% of the data falls.
- 75th Percentile: The value below which 75% of the data falls.

These statistics provide a comprehensive summary of the central tendency, spread, and distribution of the data attributes.

# Insights from Descriptive Statistics

1. Central Tendency:

    - Mean vs. Median: A large difference between the mean and median of an attribute indicates skewness. For example, if the mean is higher than the median, the data might be right-skewed (positive skew); if it is lower, the data might be left-skewed (negative skew).

2. Dispersion:

    - Standard Deviation and Variance: High values suggest a wide spread in the data, indicating considerable variability in attributes.
    - Range: A large range may suggest significant differences within the data.

3. Percentiles:

    - The 25th, 50th, and 75th percentiles describe how the data is distributed. A small difference between the 25th and 75th percentiles (the interquartile range, IQR) suggests that the middle half of the data is tightly clustered around the median.

4. Mode:

    - The mode can reveal the most common values, useful for identifying frequent performance levels or conditions.

5. Outliers:

    - Extreme values that significantly differ from the rest of the data may be potential outliers, warranting further investigation.

# Visualizations and Interpretations

1. Histograms:

    - Histograms for numerical variables reveal the distribution of each attribute. A bell-shaped histogram suggests an approximately normal distribution, while a skewed histogram indicates a non-normal distribution.

2. Box Plots:

    - Box plots provide a visual summary of the median, quartiles, and potential outliers. Attributes with long whiskers or numerous outliers may require further scrutiny to understand unusual data points.

3. Correlation Matrix Heatmap:

    - The correlation heatmap shows the pairwise Pearson correlation between the numerical attributes. Coefficients close to 1 or -1 indicate a strong positive or negative linear relationship between a pair of attributes, which is critical for understanding factor interactions.

4. Exam Scores by Parental Education Level:

    - The violin plot illustrates the distribution of exam scores across different parental education levels. Variations in distribution patterns highlight how parental education may impact student performance.

5. Impact of Categorical Factors on Exam Score:

    - Box plots for categorical factors reveal the distribution of exam scores across the levels of each categorical attribute. These visualizations help identify how factors like parental involvement, access to resources, and others relate to exam scores; they show association, not proof of cause.

# Conclusion

The dataset provides a range of numerical and categorical attributes related to student performance. The descriptive statistics and visualizations offer insights into central tendency, variability, and relationships within the data. Key findings include the presence of skewness, variability in attributes, and potential outliers. The correlation matrix highlights the strongest relationships between numerical factors, while the additional plots reveal how exam scores differ across categorical factors.

### Next Steps:

1. Detailed Analysis: Investigate skewed attributes and outliers in more depth.
2. Correlation Analysis: Explore the implications of strong correlations between attributes for predictive modeling.
3. Further Visualization: Consider additional plots, such as pair plots or 3D scatter plots, to visualize interactions between multiple attributes.
4. Predictive Modeling: Build and evaluate models to predict student performance based on the identified factors.