In [None]:
import pandas as pd

Dataset source: https://www.kaggle.com/datasets/devansodariya/student-performance-data

In [None]:
# Load the dataset; the CSV file uses ';' as its field separator.
csv_path = "../../data/day_1/kaggle_mock_data.csv"
data = pd.read_csv(csv_path, sep=";")

In [None]:
# Preview only the first rows instead of rendering the whole frame as cell output.
data.head()

# Überprüfen der Daten auf Richtigkeit und Vollständigkeit

In [None]:
# Report the overall dimensions of the loaded frame.
n_rows, n_cols = data.shape
print(f"Dataset shape: {data.shape}")
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

In [None]:
# Count the rows flagged by DataFrame.duplicated(); `duplicates` is reused
# by the data quality summary further down.
duplicate_mask = data.duplicated()
duplicates = duplicate_mask.sum()
print(f"\nNumber of duplicate rows: {duplicates}")

In [None]:
# Number of missing entries in each column.
null_counts = data.isna().sum()
print(null_counts)

In [None]:
# Verify that every grade column stays inside the 0-20 range checked below.
grade_cols = ['G1', 'G2', 'G3']
for col in grade_cols:
    lowest, highest = data[col].min(), data[col].max()
    print(f"\nGrade {col}: Range from {lowest} to {highest}")
    out_of_range = lowest < 0 or highest > 20
    if out_of_range:
        print(f"Warning: {col} has values outside the expected range of 0-20")

In [None]:
# Show the dtype pandas assigned to each column when reading the CSV.
data.dtypes

In [None]:
# Calculate the distribution of values in percent for each column and flag
# rare values.
# A value counts as an "anomaly" when it occurs in fewer than
# ANOMALY_THRESHOLD_PCT percent of the non-missing rows of its column.
ANOMALY_THRESHOLD_PCT = 3.0

numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

distribution_percent = {}  # column name -> Series of value shares in percent
anomalies = []  # List to store anomalies (column, value, percentage)

# Categorical columns are processed first, then numeric ones, so the order of
# `anomalies` and `distribution_percent` is: all categorical, then all numeric.
for col in list(categorical_cols) + list(numeric_cols):
    distribution = data[col].value_counts(normalize=True) * 100
    distribution_percent[col] = distribution

    rare_values = distribution[distribution < ANOMALY_THRESHOLD_PCT]
    anomalies.extend(
        (col, value, percentage) for value, percentage in rare_values.items()
    )

# distribution_percent[col] holds the full percentage table of a column if a
# closer look is needed.

# Display every anomaly that was found
print(f"\nAnomalies (values with < {ANOMALY_THRESHOLD_PCT:g}% occurrence):")
print(f"Total anomalies found: {len(anomalies)}")
print("Format: (column, value, percentage)")
for anomaly in anomalies:
    print(anomaly)

Use `data["feature_name"].value_counts()` to take a closer look at any feature that looks unusual.

In [None]:
# Summary of findings: row count, number of columns containing at least one
# missing value, and the duplicate count computed in an earlier cell.
total_rows = data.shape[0]
columns_with_missing = data.isnull().any().sum()
for line in (
    "\nData Quality Summary:",
    f"- Total rows: {total_rows}",
    f"- Missing values: {columns_with_missing} columns with missing values",
    f"- Duplicate rows: {duplicates}",
):
    print(line)

# Analysieren der Zielvariable

In [None]:
# Distinct G1 values in ascending order.
sorted(set(data["G1"]))

In [None]:
# Distinct G2 values in ascending order.
sorted(set(data["G2"]))

In [None]:
# Distinct G3 values in ascending order.
sorted(set(data["G3"]))

In [None]:
# Arithmetic mean of each grade column.
for grade in ("G1", "G2", "G3"):
    print(f"{grade} mean: ", data[grade].mean())

In [None]:
# Median of each grade column.
for grade in ("G1", "G2", "G3"):
    print(f"{grade} median: ", data[grade].median())

In [None]:
# Standard deviation of each grade column.
for grade in ("G1", "G2", "G3"):
    print(f"{grade} standard deviation: ", data[grade].std())

Skewness measures the asymmetry of a probability distribution:

Positive skew: When the right tail is longer (data is concentrated on the left)

Zero skew: Symmetric distribution

Negative skew: When the left tail is longer (data is concentrated on the right)

In [None]:
# Skewness of each grade column.
for grade in ("G1", "G2", "G3"):
    print(f"{grade} skewness: ", data[grade].skew())

Kurtosis measures the "tailedness" of a distribution:

Higher kurtosis: Heavy tails, more outliers (leptokurtic)

Normal distribution: kurtosis of 3 (mesokurtic). Note that pandas' `kurt()` reports *excess* kurtosis (kurtosis − 3), so for pandas a normal distribution yields a value of about 0.

Lower kurtosis: Light tails, fewer outliers (platykurtic)

In [None]:
# Kurtosis (as computed by pandas' kurt()) of each grade column.
for grade in ("G1", "G2", "G3"):
    print(f"{grade} kurtosis: ", data[grade].kurt())

In [None]:
# Frequency table for each grade column; only the G1 table is printed here,
# the other two are printed in the following cells.
g1_counts, g2_counts, g3_counts = (
    data[grade].value_counts() for grade in ('G1', 'G2', 'G3')
)

print("G1 Value Counts:\n", g1_counts)

In [None]:
# Print the G2 frequency table computed together with the G1 counts.
print("\nG2 Value Counts:\n", g2_counts)

In [None]:
# Print the G3 frequency table computed together with the G1 counts.
print("\nG3 Value Counts:\n", g3_counts)

# Analysieren der Features

### kategorisch

In [None]:
# Check for unique values in categorical columns.
# Missing values are dropped before sorting: sorted() cannot order NaN against
# strings, and nunique() ignores NaN by default, so the printed count and the
# printed list describe the same set of values.
print("\nUnique values in categorical columns:")
for col in categorical_cols:
    distinct_values = sorted(data[col].dropna().unique())
    print(f"{col}: {data[col].nunique()} unique values - {distinct_values}")

### numerisch

In [None]:
# Summary statistics of the numeric columns, transposed so that each numeric
# column becomes one row of the printed table.
numeric_summary = data[numeric_cols].describe().T
print("\nStatistics for numeric columns:")
print(numeric_summary)

In [None]:
# 1. Analyze categorical features
# For every categorical column, keep its frequency table together with the
# first and last entry of that table (value_counts() orders by frequency,
# so these are the most and least common values) and their shares in percent.
categorical_analysis = {}
for name in categorical_cols:
    frequency = data[name].value_counts()
    share_pct = data[name].value_counts(normalize=True) * 100
    categorical_analysis[name] = dict(
        counts=frequency,
        unique_values=len(frequency),
        most_common=frequency.index[0],
        most_common_pct=share_pct.iloc[0],
        least_common=frequency.index[-1],
        least_common_pct=share_pct.iloc[-1],
    )

# Display summary of categorical variables
separator = "-" * 40
print("CATEGORICAL FEATURES ANALYSIS:")
print("-" * 80)
for name, summary in categorical_analysis.items():
    print(f"{name}:")
    print(f"  - Unique values: {summary['unique_values']}")
    print(f"  - Most common: '{summary['most_common']}' ({summary['most_common_pct']:.2f}%)")
    print(f"  - Least common: '{summary['least_common']}' ({summary['least_common_pct']:.2f}%)")
    print(separator)

In [None]:
# 2. Analyze numeric features
# Descriptive statistics for every numeric column except the grade columns
# listed in `grade_cols` (G1, G2, G3), which are treated as targets.
feature_numeric_cols = [name for name in numeric_cols if name not in grade_cols]
numeric_analysis = {
    name: {
        'min': data[name].min(),
        'max': data[name].max(),
        'mean': data[name].mean(),
        'median': data[name].median(),
        'std': data[name].std(),
        'skew': data[name].skew(),
        'kurt': data[name].kurt(),
        'unique_values': data[name].nunique(),
    }
    for name in feature_numeric_cols
}

# Display summary of numeric variables
print("\nNUMERIC FEATURES ANALYSIS:")
print("-" * 80)
for name, summary in numeric_analysis.items():
    value_span = summary['max'] - summary['min']
    print(f"{name}:")
    print(f"  - Range: {summary['min']} to {summary['max']} (span: {value_span})")
    print(f"  - Central tendency: mean={summary['mean']:.2f}, median={summary['median']}")
    print(f"  - Dispersion: std={summary['std']:.2f}")
    print(f"  - Distribution: skewness={summary['skew']:.2f}, kurtosis={summary['kurt']:.2f}")
    print(f"  - Unique values: {summary['unique_values']}")
    print("-" * 40)

### Alternative für Basis Statistiken der numerischen Features:

In [None]:
# Compact alternative: select the int64/float64 columns again and print
# describe() transposed, one row per numeric column.
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
stats_table = data.loc[:, numeric_cols].describe().transpose()
print("\nStatistics for numeric columns:")
print(stats_table)

# Suche nach Korrelation zwischen Zielvariable und Features

In [None]:
# Correlate every column (categoricals one-hot encoded) with the grade targets.
targets = ['G1', 'G2', 'G3']

# Convert categorical variables to numeric using one-hot encoding
data_encoded = pd.get_dummies(data)

# Pairwise correlation between all encoded columns
correlation_matrix = data_encoded.corr()

# Per target: correlations ordered from most positive to most negative
# (signed order, NOT absolute value). The target's own entry is removed because
# its self-correlation of 1.0 would otherwise always head the "Top 10" list.
correlations_with_targets = {
    target: correlation_matrix[target].drop(target).sort_values(ascending=False)
    for target in targets
}

# Display the most positive and the most negative correlations for each target
for target in targets:
    print(f"\n=== Top 10 Correlations with {target} ===")
    print(correlations_with_targets[target].head(10))
    print(f"\n=== Bottom 10 Correlations with {target} ===")
    print(correlations_with_targets[target].tail(10))

# Rank the non-target columns by their mean absolute correlation across the
# three targets and keep the 15 strongest.
target_corr = correlation_matrix[targets].drop(targets)
top_corr_features = target_corr.abs().mean(axis=1).sort_values(ascending=False)[:15].index
print("Top correlation features: ", top_corr_features)

# Suche nach Mustern in den Features

In [None]:
# Independent copy of `data`; the feature-combination and non-linearity checks
# in this cell operate on this copy rather than on the original frame.
data_copy = data.copy()

# Function to test feature combinations for stronger correlation than individual features
def test_feature_combination(df, feature1, feature2, target='G3'):
    """
    Test if a combination of features has stronger correlation with target
    than the individual features alone.
    """
    # Create the combined feature
    if isinstance(feature1, tuple):
        # If feature1 is already a condition
        condition1 = df.eval(feature1[0])
        name1 = feature1[1]
    else:
        # Create condition based on feature type
        if df[feature1].dtype == 'object':
            condition1 = df[feature1] == df[feature1].value_counts().index[0]
            name1 = f"{feature1}_{df[feature1].value_counts().index[0]}"
        else:
            threshold1 = df[feature1].median()
            condition1 = df[feature1] > threshold1
            name1 = f"{feature1}>{threshold1}"
    
    if isinstance(feature2, tuple):
        # If feature2 is already a condition
        condition2 = df.eval(feature2[0])
        name2 = feature2[1]
    else:
        # Create condition based on feature type
        if df[feature2].dtype == 'object':
            condition2 = df[feature2] == df[feature2].value_counts().index[0]
            name2 = f"{feature2}_{df[feature2].value_counts().index[0]}"
        else:
            threshold2 = df[feature2].median()
            condition2 = df[feature2] > threshold2
            name2 = f"{feature2}>{threshold2}"
    
    # Create the combined feature
    df[f"{name1}_AND_{name2}"] = condition1 & condition2
    
    # Calculate correlations
    corr1 = df[target].corr(condition1.astype(int))
    corr2 = df[target].corr(condition2.astype(int))
    corr_combined = df[target].corr(df[f"{name1}_AND_{name2}"].astype(int))
    
    # Calculate if combined correlation is stronger than individual ones
    is_stronger = abs(corr_combined) > max(abs(corr1), abs(corr2))
    
    # Drop the temporary column
    df.drop(f"{name1}_AND_{name2}", axis=1, inplace=True)
    
    return {
        'feature1': name1, 
        'feature2': name2,
        'corr1': corr1, 
        'corr2': corr2, 
        'corr_combined': corr_combined,
        'is_stronger': is_stronger
    }

# Feature groups to test. Each entry is either a column name or an
# (eval expression, label) tuple understood by test_feature_combination.
educational_features = ['studytime', 'failures', ('higher == "yes"', 'higher_yes')]
family_features = ['Medu', 'Fedu', 'famrel', ('famsup == "yes"', 'famsup_yes')]
lifestyle_features = [('internet == "yes"', 'internet_yes'), 'goout', 'Dalc', 'Walc', 'studytime']
school_features = [('school == "GP"', 'school_GP'), ('address == "U"', 'address_U'),
                  ('schoolsup == "yes"', 'schoolsup_yes'), ('higher == "yes"', 'higher_yes')]
negative_indicators = ['failures', 'absences', 'Dalc', 'Walc', 'goout', 'age']
gender_patterns = [('sex == "F"', 'sex_F'), ('romantic == "no"', 'romantic_no'), 'Dalc', 'Walc']

feature_groups = {
    'Educational': educational_features,
    'Family': family_features,
    'Lifestyle': lifestyle_features,
    'School': school_features,
    'Negative': negative_indicators,
    'Gender': gender_patterns
}

# Hand-picked pairs that span different groups
cross_combinations = [
    (('failures == 0', 'failures=0'), ('higher == "yes"', 'higher_yes')),
    ('studytime', ('higher == "yes"', 'higher_yes')),
    (('sex == "F"', 'sex_F'), ('romantic == "no"', 'romantic_no')),
    ('Dalc', ('goout < 3', 'low_goout')),
    ('absences', 'failures'),
    ('Medu', 'Fedu')
]

# One (header, pairs) section per feature group - every unordered pair inside
# the group, in index order - followed by the cross-group section.
test_sections = [
    (
        f"\n=== Testing {group_name} Feature Combinations ===",
        [
            (group_features[first], group_features[second])
            for first in range(len(group_features))
            for second in range(first + 1, len(group_features))
        ],
    )
    for group_name, group_features in feature_groups.items()
]
test_sections.append(("\n=== Testing Cross-Group Feature Combinations ===", cross_combinations))

# Store the results: every combination reported as stronger is collected here
strong_combinations = []

for header, feature_pairs in test_sections:
    print(header)
    for left, right in feature_pairs:
        result = test_feature_combination(data_copy, left, right)
        if not result['is_stronger']:
            continue
        strong_combinations.append(result)
        print(f"Strong combination found: {result['feature1']} AND {result['feature2']}")
        print(f"  Individual correlations: {result['corr1']:.3f}, {result['corr2']:.3f}")
        print(f"  Combined correlation: {result['corr_combined']:.3f}")

# Test for non-linear patterns
# For each feature, compare the plain correlation with G3 against the
# correlation of two indicator variables: "value lies strictly between the
# 25% and 75% quantiles" and "value lies at or beyond one of those quantiles".
# The indicators are kept as local Series, so no temporary column is added to
# or dropped from data_copy.
print("\n=== Testing Non-Linear Patterns ===")
nonlinear_features = ['age', 'absences', 'studytime', 'goout', 'Dalc', 'Walc']

for feature in nonlinear_features:
    values = data_copy[feature]
    # Quantiles are computed once per feature and reused for both indicators
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)

    # Test if medium values are better than extremes
    in_mid_range = (values > lower_quartile) & (values < upper_quartile)

    # Test if extremes are better than middle
    at_extremes = (values <= lower_quartile) | (values >= upper_quartile)

    # Calculate correlations
    corr_normal = data_copy['G3'].corr(values)
    corr_mid = data_copy['G3'].corr(in_mid_range.astype(int))
    corr_extreme = data_copy['G3'].corr(at_extremes.astype(int))

    # Check if transformation reveals stronger pattern
    if abs(corr_mid) > abs(corr_normal) or abs(corr_extreme) > abs(corr_normal):
        print(f"Non-linear pattern found for {feature}:")
        print(f"  Normal correlation: {corr_normal:.3f}")
        print(f"  Mid-range correlation: {corr_mid:.3f}")
        print(f"  Extremes correlation: {corr_extreme:.3f}")

# Count subgroups with extreme performance
print("\n=== Identifying Extreme Performance Groups ===")

# Extreme performance cut-offs: the 15% and 85% quantiles of the final grade G3
final_grades = data['G3']
bottom_threshold = final_grades.quantile(0.15)
top_threshold = final_grades.quantile(0.85)

# Function to find subgroups with unusual performance
def identify_extreme_groups(df, feature, target='G3', top_threshold=None, bottom_threshold=None):
    """Print subgroups of ``feature`` with an unusually large share of extreme results.

    ``object`` columns are split by distinct value; all other columns are split
    into four quartile bins Q1..Q4 (Q1: <= 25% quantile, Q2/Q3: the half-open
    ranges up to the 50% / 75% quantile, Q4: > 75% quantile). A subgroup is
    reported when more than 25% of its rows are at or above ``top_threshold``,
    or more than 25% are at or below ``bottom_threshold``.

    Parameters:
        df: DataFrame containing ``feature`` and ``target``.
        feature: name of the column used to form subgroups.
        target: name of the performance column.
        top_threshold: lower bound for "top performer"; defaults to the 85%
            quantile of ``df[target]``.
        bottom_threshold: upper bound for "bottom performer"; defaults to the
            15% quantile of ``df[target]``.

    Returns:
        None. Findings are printed.
    """
    if top_threshold is None:
        top_threshold = df[target].quantile(0.85)
    if bottom_threshold is None:
        bottom_threshold = df[target].quantile(0.15)

    values = df[feature]
    if values.dtype == 'object':
        # For categorical features: one subgroup per distinct value
        subgroups = [
            (f"= {category}", df[values == category])
            for category in values.unique()
        ]
    else:
        # For numeric features: four quartile bins, including the top bin
        # above the 75% quantile
        q25, q50, q75 = (values.quantile(q) for q in (0.25, 0.5, 0.75))
        subgroups = [
            ("Q1", df[values <= q25]),
            ("Q2", df[(values > q25) & (values <= q50)]),
            ("Q3", df[(values > q50) & (values <= q75)]),
            ("Q4", df[values > q75]),
        ]

    for label, subset in subgroups:
        # Bins can be empty when several quantiles coincide; nothing to report then
        if subset.empty:
            continue

        top_performers = (subset[target] >= top_threshold).mean() * 100
        bottom_performers = (subset[target] <= bottom_threshold).mean() * 100

        # Check if this subgroup has substantially different performance
        if top_performers > 25 or bottom_performers > 25:
            print(f"Extreme group found: {feature} {label}")
            print(f"  % in top performers: {top_performers:.1f}%")
            print(f"  % in bottom performers: {bottom_performers:.1f}%")
            print(f"  Sample size: {len(subset)}")

# Features inspected for subgroups with unusually many top or bottom results
important_features = [
    'failures', 'higher', 'school', 'studytime',
    'Medu', 'Fedu', 'Dalc', 'Walc',
    'absences', 'sex', 'romantic',
]

for feature_name in important_features:
    identify_extreme_groups(data, feature_name)

# Summary of findings
strong_total = len(strong_combinations)
print("\n=== Summary of Key Patterns ===")
print(f"Total strong feature combinations found: {strong_total}")