In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score
import plotly.graph_objects as go

def load_data(file_path):
    """
    Reads the master CSV and splits it into a feature matrix and a target vector.

    The "Subtest" column is dropped, the textual diagnosis labels are mapped
    to integer codes (0, 1, 2), rows 0-24 are taken as the features and
    row 25 as the target.

    Parameters:
    file_path (str): Path of the CSV file to read.

    Returns:
    tuple: (X, y) where X is a float array with one row per remaining CSV
    column and one column per feature row, and y is an integer array of
    class codes.
    """
    label_codes = {'No Dyslexia': 0, 'Mild': 1, 'Severe': 2}

    table = pd.read_csv(file_path)
    table = table.drop(columns=["Subtest"])
    table = table.replace(label_codes)

    feature_rows = table.iloc[0:25]
    target_row = table.iloc[25]

    # Each remaining CSV column becomes one row of X, hence the transpose
    X = feature_rows.values.T.astype(float)
    y = target_row.values.astype(int)
    return X, y

def create_pipeline():
    """
    Builds the unfitted preprocessing + classification pipeline.

    The steps run in this order: 'scaler' (StandardScaler), 'normalizer'
    (Normalizer), 'pca' (PCA) and 'classifier' (RandomForestClassifier with
    random_state=42).

    Returns:
    Pipeline: The assembled scikit-learn pipeline.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('normalizer', Normalizer()),
        ('pca', PCA()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
    return Pipeline(steps)

def ensure_numeric_data(data):
    """
    Coerces every column of a DataFrame to a numeric dtype, in place.

    Values that cannot be parsed as numbers become NaN.

    Parameters:
    data (pd.DataFrame): The DataFrame to convert; it is modified in place.

    Returns:
    pd.DataFrame: The same DataFrame object, with every column numeric.
    """
    column_labels = list(data.columns)
    for label in column_labels:
        coerced = pd.to_numeric(data[label], errors='coerce')
        data[label] = coerced
    return data

def calculate_overall_percentile(data_point):
    """
    Returns the overall percentile, defined as the mean of all attribute values.

    Parameters:
    data_point (np.array): Array of student attributes.

    Returns:
    float: The arithmetic mean over every element of data_point.
    """
    overall = np.mean(data_point)
    return overall

def train_model(X, y):
    """
    Tunes the pipeline with a grid search and evaluates it on a hold-out split.

    The data is split 80/20 with stratification, a grid search over the PCA
    component count and the number of trees is run with 5-fold stratified
    cross-validation scored by macro recall, and the best pipeline is then
    evaluated on the held-out 20%. The selected parameters, the hold-out
    macro recall and the classification report are printed.

    Parameters:
    X (np.array): Feature matrix with one row per sample.
    y (np.array): Class label for each sample.

    Returns:
    tuple: (best_pipeline, X_train, X_test, y_train, y_test)
    """
    # Stratify so every class keeps its proportion in both subsets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    pipeline = create_pipeline()

    # Keys are "<step name>__<parameter>" for the steps defined in create_pipeline()
    param_grid = {
        'pca__n_components': [2, 3, 4],
        'classifier__n_estimators': [10, 50, 100]
    }

    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=StratifiedKFold(n_splits=5),
        scoring='recall_macro',
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the winning configuration refitted on the full training set
    best_pipeline = grid_search.best_estimator_
    y_pred = best_pipeline.predict(X_test)

    # Report the hold-out evaluation instead of computing it and discarding it
    recall = recall_score(y_test, y_pred, average='macro')
    report = classification_report(y_test, y_pred)
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Hold-out macro recall: {recall:.3f}")
    print(report)

    # The classifier's feature_importances_ are intentionally not extracted:
    # the classifier is fitted on PCA outputs, so they would rank principal
    # components rather than the original attributes.

    return best_pipeline, X_train, X_test, y_train, y_test



def predict_student_class(pipeline, student_data, labels):
    """
    Returns the descriptive label predicted for a new student.

    Only the first prediction returned by the pipeline is used.

    Parameters:
    pipeline: A fitted object exposing predict().
    student_data (np.array): The student data passed to pipeline.predict().
    labels (dict): Mapping of class indices to descriptive labels.

    Returns:
    str: The label mapped to the first predicted class.
    """
    predicted_classes = pipeline.predict(student_data)
    first_class = predicted_classes[0]
    return labels[first_class]

def identify_best_worst_attributes(data_point, attribute_names):
    """
    Finds the attributes holding the highest and the lowest value.

    On ties, the first occurrence is the one reported.

    Parameters:
    data_point (np.array): Array of student attributes.
    attribute_names (list): Attribute names, indexed like data_point.

    Returns:
    tuple: (best attribute name, worst attribute name, best index, worst index).
    """
    best_index, worst_index = np.argmax(data_point), np.argmin(data_point)
    best_name = attribute_names[best_index]
    worst_name = attribute_names[worst_index]
    return best_name, worst_name, best_index, worst_index

def get_attribute_description(attribute, descriptions, best=True):
    """
    Looks up the positive or negative description text of an attribute.

    Parameters:
    attribute (str): The attribute name.
    descriptions (dict): Maps attribute names to dicts holding 'positive'
        and 'negative' texts.
    best (bool): True selects the 'positive' text, False the 'negative' one.

    Returns:
    str: The selected description.
    """
    if best:
        polarity = 'positive'
    else:
        polarity = 'negative'
    return descriptions[attribute][polarity]

def get_color(value):
    """
    Determines the color based on the percentile value.

    The color runs from red at 0, through yellow at 50, to green at 100.
    Values outside 0-100 are clamped to that range so that every channel
    stays within 0-255 and the returned string is always a valid color.

    Parameters:
    value (float): The percentile value.

    Returns:
    str: The RGB color string, e.g. "rgb(255, 127, 0)".
    """
    # Without clamping, e.g. 120 would give a negative red channel
    value = min(max(value, 0), 100)

    if value < 50:
        # Lower half: red stays at full intensity while green ramps up
        return f"rgb({255}, {int((value / 50) * 255)}, 0)"
    # Upper half: green stays at full intensity while red ramps down
    return f"rgb({int((1 - (value - 50) / 50) * 255)}, 255, 0)"

def create_percentile_gauge(title, value, domain):
    """
    Builds a Plotly gauge indicator showing a percentile on a 0-100 axis.

    The gauge background is made of 100 one-unit steps colored with
    get_color(), and the bar uses the color matching the displayed value.

    Parameters:
    title (str): The title of the gauge.
    value (float): The value to display on the gauge.
    domain (dict): The domain for positioning the gauge.

    Returns:
    go.Indicator: A Plotly Indicator for the gauge.
    """
    # One colored band per unit of the axis
    gradient_steps = []
    for lower in range(100):
        gradient_steps.append({'range': [lower, lower + 1], 'color': get_color(lower)})

    gauge_config = {
        'axis': {'range': [0, 100]},
        'bar': {'color': get_color(value)},
        'steps': gradient_steps,
        'bordercolor': 'black',
        'borderwidth': 2,
    }

    indicator = go.Indicator(
        mode="gauge+number",
        value=value,
        title={'text': title},
        gauge=gauge_config,
        domain=domain,
    )
    return indicator

def plot_student_performance(overall_percentile, best_attribute, best_percentile, worst_attribute, worst_percentile, best_desc, worst_desc, prediction_label):
    """
    Displays a figure summarizing a student's performance.

    The figure holds three gauges (overall, best attribute, worst attribute),
    one text annotation for each attribute description and a boxed
    annotation with the predicted diagnosis. The figure is shown, not returned.

    Parameters:
    overall_percentile (float): Overall percentile score.
    best_attribute (str): Name of the best attribute.
    best_percentile (float): Percentile score of the best attribute.
    worst_attribute (str): Name of the worst attribute.
    worst_percentile (float): Percentile score of the worst attribute.
    best_desc (str): Description of the best attribute.
    worst_desc (str): Description of the worst attribute.
    prediction_label (str): Predicted class label.
    """
    fig = go.Figure()

    # (title, value, domain) for each gauge, in drawing order
    gauge_specs = [
        ("Overall Percentile", overall_percentile, {'x': [0, 0.4], 'y': [0.5, 1]}),
        (f"Best Attribute: {best_attribute}", best_percentile, {'x': [0.6, 1], 'y': [0.5, 1]}),
        (f"Worst Attribute: {worst_attribute}", worst_percentile, {'x': [0.3, 0.7], 'y': [0, 0.4]}),
    ]
    for gauge_title, gauge_value, gauge_domain in gauge_specs:
        fig.add_trace(create_percentile_gauge(gauge_title, gauge_value, gauge_domain))

    # (x, y, text) for the best and worst attribute descriptions
    description_notes = [
        (1.03, 0.6, best_desc),
        (0.5, 0.03, worst_desc),
    ]
    for note_x, note_y, note_text in description_notes:
        fig.add_annotation(
            x=note_x, y=note_y,
            text=note_text,
            showarrow=False,
            font=dict(size=12),
            align="center"
        )

    layout_options = dict(
        title='Student Performance Summary',
        autosize=False,
        width=1000,
        height=800,
        margin=dict(l=50, r=50, b=50, t=50),
    )
    fig.update_layout(**layout_options)

    # Boxed diagnosis label, positioned in paper coordinates
    diagnosis_box = dict(
        x=0.001, y=1.01,
        text=f"<b>Dyslexia Diagnosis:</b> {prediction_label}",
        showarrow=False,
        font=dict(size=14, color="black"),
        align="center",
        xref="paper",
        yref="paper",
        bordercolor="black",
        borderwidth=1,
        borderpad=10,
        bgcolor="lightyellow",
        opacity=0.8,
    )
    fig.add_annotation(**diagnosis_box)

    fig.show()

def main():
    """
    Runs the full workflow: load the data, train the model, classify one
    example student and display the performance summary figure.
    """
    # NOTE(review): machine-specific absolute path; the script only runs where this file exists
    file_path = r"C:\Users\antho\OneDrive - University College Dublin\ACM20030 1\ALPACA\MASTER_DATA.csv"

    # Load the data; load_data() already returns X as a float array, so no
    # further numeric conversion is needed here
    X, y = load_data(file_path)

    # Train the model; only the fitted pipeline is needed below
    best_pipeline, *_ = train_model(X, y)

    # Example new data point for prediction
    new_student = np.array([[10, 20, 30, 40, 50, 60, 70, 80, 90, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 90, 80, 70, 60, 50, 40]])

    # Mapping of prediction to descriptive labels
    dyslexia_labels = {0: 'No Dyslexia', 1: 'Mild Dyslexia', 2: 'Severe Dyslexia'}

    # Predict the class for the new student
    prediction_label = predict_student_class(best_pipeline, new_student, dyslexia_labels)

    # Calculate the overall percentile
    overall_percentile = calculate_overall_percentile(new_student)

    # Attribute names
    # NOTE(review): 19 names are listed but the example student has 25 values,
    # so a best/worst value at index 19-24 would raise IndexError. The example
    # above has its extremes at indices 18 and 0. Confirm the intended mapping
    # between names and features.
    attribute_names = [
        'Similarities', 'Vocabulary', 'Comprehension', 'Block Design', 'Visual Puzzles',
        'Matrix Reasoning', 'Figure Weights', 'Digit Span', 'Picture Span', 'Coding',
        'Symbol Search', 'Phonological Processing', 'Word Decoding', 'Reading Fluency',
        'Reading Comprehension', 'Expressive Vocabulary', 'Receptive Vocabulary',
        'Teacher Reports', 'Behavioral Observations'
    ]

    # Define attribute descriptions and their implications
    attribute_descriptions = {
        'Similarities': {
            'description': 'Measures verbal reasoning and concept formation.',
            'positive': 'Strong skills in verbal reasoning and forming concepts.',
            'negative': 'Challenges in verbal reasoning and forming concepts.'
        },
        'Vocabulary': {
            'description': 'Assesses word knowledge and verbal concept formation.',
            'positive': 'Rich vocabulary and strong word knowledge.',
            'negative': 'Limited vocabulary and struggles with word knowledge.'
        },
        'Comprehension': {
            'description': 'Evaluates understanding of social situations and common sense.',
            'positive': 'Good understanding of social situations and common sense.',
            'negative': 'Difficulty understanding social situations and common sense.'
        },
        'Block Design': {
            'description': 'Tests spatial visualization and problem-solving skills.',
            'positive': 'Strong spatial visualization and problem-solving skills.',
            'negative': 'Challenges with spatial visualization and problem-solving.'
        },
        'Visual Puzzles': {
            'description': 'Measures nonverbal reasoning and problem-solving ability.',
            'positive': 'Excellent nonverbal reasoning and problem-solving abilities.',
            'negative': 'Struggles with nonverbal reasoning and problem-solving.'
        },
        'Matrix Reasoning': {
            'description': 'Assesses visual-spatial reasoning and fluid intelligence.',
            'positive': 'Strong visual-spatial reasoning and fluid intelligence.',
            'negative': 'Difficulty with visual-spatial reasoning and fluid intelligence.'
        },
        'Figure Weights': {
            'description': 'Tests quantitative reasoning and visual-perceptual skills.',
            'positive': 'Good quantitative reasoning and visual-perceptual skills.',
            'negative': 'Challenges in quantitative reasoning and visual-perceptual skills.'
        },
        'Digit Span': {
            'description': 'Measures attention, concentration, and mental control.',
            'positive': 'Strong attention, concentration, and mental control.',
            'negative': 'Difficulty with attention, concentration, and mental control.'
        },
        'Picture Span': {
            'description': 'Evaluates visual working memory.',
            'positive': 'Strong visual working memory.',
            'negative': 'Weak visual working memory.'
        },
        'Coding': {
            'description': 'Tests processing speed and short-term visual memory.',
            'positive': 'Fast processing speed and good short-term visual memory.',
            'negative': 'Slow processing speed and poor short-term visual memory.'
        },
        'Symbol Search': {
            'description': 'Measures processing speed and visual-motor coordination.',
            'positive': 'Strong processing speed and visual-motor coordination.',
            'negative': 'Challenges with processing speed and visual-motor coordination.'
        },
        'Phonological Processing': {
            'description': 'Assesses phonemic awareness, a key skill for reading.',
            'positive': 'Good phonemic awareness, aiding in reading skills.',
            'negative': 'Weak phonemic awareness, hindering reading skills.'
        },
        'Word Decoding': {
            'description': 'Tests ability to decode written words, essential for reading.',
            'positive': 'Strong word decoding skills, essential for reading.',
            'negative': 'Poor word decoding skills, affecting reading ability.'
        },
        'Reading Fluency': {
            'description': 'Measures speed and accuracy of reading aloud.',
            'positive': 'Fluent and accurate reading aloud.',
            'negative': 'Slow and inaccurate reading aloud.'
        },
        'Reading Comprehension': {
            'description': 'Assesses understanding of written text.',
            'positive': 'Good understanding of written text.',
            'negative': 'Difficulty understanding written text.'
        },
        'Expressive Vocabulary': {
            'description': 'Evaluates ability to use words to convey meaning.',
            'positive': 'Strong ability to use words effectively.',
            'negative': 'Challenges in using words to convey meaning.'
        },
        'Receptive Vocabulary': {
            'description': 'Measures understanding of spoken words.',
            'positive': 'Good understanding of spoken words.',
            'negative': 'Poor understanding of spoken words.'
        },
        'Teacher Reports': {
            'description': 'Observations from teachers regarding student behavior and performance.',
            'positive': 'Positive teacher reports indicating good behavior and performance.',
            'negative': 'Negative teacher reports indicating issues with behavior and performance.'
        },
        'Behavioral Observations': {
            'description': 'General behavioral observations during testing.',
            'positive': 'Positive behavioral observations indicating good behavior and focus.',
            'negative': 'Negative behavioral observations indicating issues with behavior and focus.'
        }
    }

    # Identify the best and worst attributes
    best_attribute, worst_attribute, best_index, worst_index = identify_best_worst_attributes(new_student[0], attribute_names)

    # The raw attribute values are used directly as the percentiles shown on the gauges
    best_attribute_percentile = new_student[0, best_index]
    worst_attribute_percentile = new_student[0, worst_index]

    # Best and worst attribute descriptions
    best_attribute_description = get_attribute_description(best_attribute, attribute_descriptions, best=True)
    worst_attribute_description = get_attribute_description(worst_attribute, attribute_descriptions, best=False)

    # Plot student performance
    plot_student_performance(
        overall_percentile,
        best_attribute,
        best_attribute_percentile,
        worst_attribute,
        worst_attribute_percentile,
        best_attribute_description,
        worst_attribute_description,
        prediction_label
    )

# Run main() only when this file is executed directly, not when it is imported
if __name__ == "__main__":
    main()


Fitting 5 folds for each of 9 candidates, totalling 45 fits
