In [1]:
# Data Processing
import pandas as pd
import numpy as np
# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
import pandas as pd
from scipy.stats import ttest_ind

def performFeatureSelect(grp1_name, grp2_name, df_group1, df_group2):
    """Run an independent two-sample t-test per column pair and save the results.

    The columns of the two frames are paired positionally (``zip``), so both
    frames are expected to share the same column layout; pairing stops at the
    shorter frame. The first column of each frame is skipped.
    NOTE(review): skipping column 0 presumably excludes a non-feature column
    (e.g. an index/ID) -- confirm it is not a real feature.

    Parameters
    ----------
    grp1_name, grp2_name : str
        Labels of the two groups; written into every result row and used to
        build the output file name.
    df_group1, df_group2 : pandas.DataFrame
        Samples of each group, one column per feature.

    Returns
    -------
    pandas.DataFrame
        One row per compared column pair with the t-statistic, the p-value and
        a "Yes"/"No" significance flag (alpha = 0.05). The same table is
        written to
        ``./Step-6 Feature Ttest/Speech/{grp1_name}_{grp2_name}_ttest.csv``.
    """
    # Local import so this function works even when the defining cell is run
    # on its own.
    import os

    # Significance level (alpha)
    alpha = 0.05

    # Feature columns to compare (everything except the first column)
    feature_columns_1 = df_group1.columns[1:]
    feature_columns_2 = df_group2.columns[1:]

    # One result record per compared column pair
    result_matrix = []

    for col_group1, col_group2 in zip(feature_columns_1, feature_columns_2):
        # nan_policy='omit' ignores missing samples instead of returning NaN
        stat, p_value = ttest_ind(
            df_group1[col_group1], df_group2[col_group2], nan_policy='omit')

        result_matrix.append({
            "Group_1": grp1_name,
            "Group_2": grp2_name,
            # Later cells look this column up as f"{grp1_name}_Group_1";
            # keep both key formats unchanged.
            f"{grp1_name}_Group_1": col_group1,
            f"{grp2_name}Column_Group_2": col_group2,
            "T-Statistic": stat,
            "P-Value": p_value,
            # A NaN p-value compares False and is therefore reported as "No"
            "Significant Difference": "Yes" if p_value < alpha else "No"
        })

    # Create DataFrame from result matrix
    result_df = pd.DataFrame(result_matrix)

    # Make sure the target directory exists; to_csv does not create it
    output_dir = "./Step-6 Feature Ttest/Speech"
    os.makedirs(output_dir, exist_ok=True)

    # Save DataFrame to CSV file
    result_df.to_csv(
        f"{output_dir}/{grp1_name}_{grp2_name}_ttest.csv", index=False)

    return result_df


In [3]:
import pandas as pd

# Run the per-feature t-test for every unordered pair of output classes.
path = "./Step-5 Combined CSV/Speech_data.csv"
data = pd.read_csv(path)

# Distinct class labels found in the 'Output' column
tests = data['Output'].unique()
print(tests)

for i, group1 in enumerate(tests):  # (i.e) Silent
    # Only pair with labels that come later, so each pair is tested once
    for group2 in tests[i + 1:]:  # (i.e) Water
        # Exact label match: a substring/regex match (str.contains) would
        # also pull in any other label that merely contains this one.
        df1 = data[data['Output'] == group1].drop(columns="Output")
        df2 = data[data['Output'] == group2].drop(columns="Output")
        performFeatureSelect(group1, group2, df1, df2)
print("T-test Completed")

['Doctor' 'Move' 'No' 'Pain' 'Silent' 'Toilet' 'Water' 'Yes']
T-test Completed


In [4]:
import os

# List, for every t-test result file, the features with no significant difference.
path = "./Step-6 Feature Ttest/Speech/"
# sorted() gives a stable listing order (os.listdir order is arbitrary)
for folders in sorted(os.listdir(path)):
    # Skip anything that is not a result CSV (e.g. checkpoints, hidden files)
    if not folders.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(path, folders))
    # File names look like "<group1>_<group2>_ttest.csv"
    name = folders.split('_')
    no_rows = df[df['Significant Difference'] == 'No']
    print(f"{name[0]} -> {name[1]} : ",
          no_rows[f'{name[0]}_Group_1'].to_list())

Doctor -> Move :  ['Beta_AF8']
Doctor -> No :  ['Alpha_AF7', 'Gamma_TP10']
Doctor -> Pain :  []
Doctor -> Silent :  []
Doctor -> Toilet :  []
Doctor -> Water :  ['Theta_TP9', 'Theta_AF7', 'Beta_AF8', 'Gamma_AF8']
Doctor -> Yes :  ['Theta_TP9', 'Alpha_TP9', 'Gamma_TP10']
Move -> No :  ['Alpha_TP10', 'Beta_AF7', 'Beta_TP10', 'Gamma_AF7', 'Gamma_AF8']
Move -> Pain :  ['Theta_TP9', 'Alpha_AF8', 'Alpha_TP10', 'Gamma_TP10']
Move -> Silent :  ['Alpha_AF7']
Move -> Toilet :  ['Beta_TP9', 'Beta_AF7', 'Beta_AF8', 'Gamma_TP9', 'Gamma_AF8']
Move -> Water :  ['Beta_AF8', 'Beta_TP10', 'Gamma_AF7']
Move -> Yes :  ['Delta_TP10', 'Theta_AF8', 'Theta_TP10', 'Alpha_TP10', 'Beta_AF7', 'Beta_TP10', 'Gamma_AF7']
No -> Pain :  ['Alpha_AF8', 'Alpha_TP10', 'Beta_AF7', 'Gamma_TP9']
No -> Silent :  ['Theta_AF7']
No -> Toilet :  ['Theta_AF7', 'Alpha_AF8', 'Beta_TP10', 'Gamma_AF8', 'Gamma_TP10']
No -> Water :  ['Theta_TP10', 'Alpha_TP10', 'Beta_TP9', 'Beta_TP10', 'Gamma_AF7']
No -> Yes :  ['Alpha_AF8', 'Alpha_TP10

In [6]:
import os
import pandas as pd

# Nested mapping: associations[group1][group2] -> list of feature names whose
# t-test row was marked "No" (no significant difference) for that pair.
associations = {}

path = "./Step-6 Feature Ttest/Speech/"
for folders in os.listdir(path):
    # Only t-test result CSV files are of interest
    if not folders.endswith('.csv'):
        continue

    df = pd.read_csv(os.path.join(path, folders))
    # File names look like "<group1>_<group2>_ttest.csv"
    name = folders.split('_')
    no_rows = df[df['Significant Difference'] == 'No']
    # setdefault creates the inner dict the first time group1 is seen
    associations.setdefault(name[0], {})[name[1]] = (
        no_rows[f"{name[0]}_Group_1"].tolist())

In [7]:
# Generate model based on important features on t-test data
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from joblib import dump

# Known class labels. generate_model no longer needs this list (it filters on
# the two requested labels directly); it is kept for any cell that still
# references it.
output_list = ['Move', 'Silent', 'Water', 'Yes', 'No', 'Toilet', 'Doctor', 'Pain']

def generate_model(grp1, grp2):
    """Train, evaluate and save a binary random-forest classifier for two classes.

    Reads ``./Step-5 Combined CSV/Speech_data.csv``, keeps only the rows whose
    'Output' label is exactly ``grp1`` or ``grp2``, drops the features listed
    in the module-level ``associations[grp1][grp2]``, trains a
    RandomForestClassifier on an 80/20 split (random_state=42), prints the
    test accuracy and dumps the model to
    ``./Step-7 Model Generation/Speech/{grp1}_{grp2}_randomforest_model.joblib``.

    Parameters
    ----------
    grp1, grp2 : str
        Class labels to discriminate; ``associations[grp1][grp2]`` must exist.

    Returns
    -------
    float
        Accuracy on the held-out test split.

    Raises
    ------
    KeyError
        If the pair is missing from ``associations`` or a listed feature is
        not a column of the data.
    """
    # Local import so this function works even when the defining cell is run
    # on its own.
    import os

    df = pd.read_csv("./Step-5 Combined CSV/Speech_data.csv")

    # Keep exactly the two requested classes. An exact match also discards
    # rows whose label is not in any predefined list, and cannot be fooled by
    # one label being a substring of another.
    df = df[df['Output'].isin([grp1, grp2])]

    # Remove the features excluded for this pair
    df = df.drop(columns=associations[grp1][grp2])

    # Extract features and target variable
    X = df.drop('Output', axis=1)
    y = df['Output']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the classifier
    rf_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = rf_classifier.predict(X_test)

    # Evaluate the performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {grp1}-{grp2}: {accuracy:.4f}')

    # Save the trained model to a file (create the directory if needed,
    # because dump does not create it)
    model_dir = "./Step-7 Model Generation/Speech"
    os.makedirs(model_dir, exist_ok=True)
    model_filename = f"{model_dir}/{grp1}_{grp2}_randomforest_model.joblib"
    dump(rf_classifier, model_filename)

    return accuracy

In [8]:
# start generate model here based on association
# Train one model per (group1, group2) pair recorded in `associations`.
for first_group, partners in associations.items():
    for second_group in partners:
        generate_model(first_group, second_group)

Accuracy for Doctor-Move: 0.9511
Accuracy for Doctor-No: 0.9414
Accuracy for Doctor-Pain: 0.9379
Accuracy for Doctor-Silent: 0.9681
Accuracy for Doctor-Toilet: 0.9393
Accuracy for Doctor-Water: 0.9173
Accuracy for Doctor-Yes: 0.9147
Accuracy for Move-No: 0.8928
Accuracy for Move-Pain: 0.9539
Accuracy for Move-Silent: 0.9621
Accuracy for Move-Toilet: 0.9043
Accuracy for Move-Water: 0.9320
Accuracy for Move-Yes: 0.9062
Accuracy for No-Pain: 0.9370
Accuracy for No-Silent: 0.9502
Accuracy for No-Toilet: 0.8863
Accuracy for No-Water: 0.9001
Accuracy for No-Yes: 0.8293
Accuracy for Pain-Silent: 0.9640
Accuracy for Pain-Toilet: 0.9393
Accuracy for Pain-Water: 0.9280
Accuracy for Pain-Yes: 0.9291
Accuracy for Silent-Toilet: 0.9520
Accuracy for Silent-Water: 0.9497
Accuracy for Silent-Yes: 0.9537
Accuracy for Toilet-Water: 0.9213
Accuracy for Toilet-Yes: 0.9185
Accuracy for Water-Yes: 0.8950


In [10]:
from joblib import load
import numpy as np
import pandas as pd
from collections import Counter

# Classify a new recording with the saved Pain-vs-Water model and report how
# many rows were assigned to each class.
model_filename = './Step-7 Model Generation/Speech/Pain_Water_randomforest_model.joblib'
# Load the saved model.
# NOTE: joblib.load unpickles the file -- only load model files you trust.
loaded_model = load(model_filename)

# Features removed before prediction.
# NOTE(review): presumably the columns that were excluded when the Pain/Water
# model was trained -- verify against associations['Pain']['Water'].
dropped_features = ['Delta_AF7', 'Delta_TP10', 'Alpha_TP9']

# preprocess test dataset
new_data = pd.read_csv("sushant_new_water.csv")
# new_data = new_data.iloc[15:-15]
new_data = (
    new_data
    .reset_index(drop=True)
    .replace([np.inf, -np.inf], np.nan)  # treat infinities as missing
    .dropna(axis=0)                      # drop rows with any missing value
)
# Drop rows containing an exact 0.0 in any column, then exact duplicate rows
new_data = new_data[~(new_data == 0.0).any(axis=1)].drop_duplicates()

# Raises KeyError if one of the columns is missing
new_data = new_data.drop(columns=dropped_features)

# Round float features to 6 decimals via the same '.6f' formatting as before,
# but convert back to float so the model receives numeric data instead of
# strings (the resulting values are the same).
float_columns = new_data.select_dtypes(include=['float64']).columns
new_data = new_data.assign(**{
    col: new_data[col].map(lambda x: float(f'{x:.6f}'))
    for col in float_columns
})

# Make predictions using the loaded model
predictions = loaded_model.predict(new_data)

# Count occurrences of each class in predictions
class_counts = Counter(predictions)
# Sort the counts in descending order
sorted_counts = class_counts.most_common()
print(sorted_counts)

[('Water', 82), ('Pain', 2)]
