In [1]:
# import libraries
import os
import datetime

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense

# Helper Functions

In [2]:
def get_dataframe(language):
  # Set the directory path
  directory_path = 'C:\Research\labeled_features\{}'.format(language)

  # Create an empty list to store the dataframes
  dataframes = []

  # Iterate through all the files in the directory
  for file in os.listdir(directory_path):
    # Check if the file is a CSV file
    if file.endswith('.csv'):
      # Read the CSV file into a Pandas dataframe
      df = pd.read_csv(os.path.join(directory_path, file))

      # Drop rows where the 'label' column is not what was expected
      df = df[df['label'].isin([0, 1, 2, 3])]

      # Append the dataframe to the list
      dataframes.append(df)

  # Concatenate all the dataframes into a single dataframe
  df_all = pd.concat(dataframes)

  # Model was predicting negative values, so I had to remove the negative values
  df_all = df_all[df_all['label'].isin([0, 1, 2, 3])]

  # Add column with the index of each row, reset the indices
  df_all = df_all.reset_index(drop=True)

  # Create a list where at each index is the sample_id of that row
  sample_ids = df_all['sample_id'].tolist()

  # Drop the 'sample_id' column
  return pd.DataFrame(df_all.drop(columns=['sample_id'])), sample_ids

In [3]:
def get_dataframes_knn(language):
  # Set the directory path
  directory_path = 'C:\Research\labeled_features\{}'.format(language)

  # Create an empty list to store the train dataframes
  train_set = []

  # Create an empty list to store the test dataframes
  test = []

  # Number of files in directory
  num_files = len(os.listdir(directory_path))

  # Index of the last file that is lower than 80% of the total number of files rounded down
  first_train_file = int(np.floor(num_files * 0.8))

  # Iterate through all the files in the directory
  for i, file in enumerate(os.listdir(directory_path)):
    # Check if the file is a CSV file
    if file.endswith('.csv'):
      # Read the CSV file into a Pandas dataframe
      df = pd.read_csv(os.path.join(directory_path, file))

      # Drop rows where the 'label' column is not what was expected
      df = df[df['label'].isin([0, 1, 2, 3])]

      # Append the dataframe to the corresponding list
      if i < first_train_file:
        train_set.append(df)
      else:
        test.append(df)

  # Concatenate the test dataframes into a single dataframe
  df_test = pd.concat(test).drop(columns=['sample_id'])

  # Concatenate the train dataframes into a single dataframe
  df_train = pd.concat(train_set).drop(columns=['sample_id'])

  # Split the train dataframe into X and y
  X = df_train.iloc[:, :-1]
  y = df_train['label']

  # Split the test dataframe into X and y
  X_test = df_test.iloc[:, :-1]
  y_test = df_test['label']

  return X, y, X_test, y_test

In [4]:
def get_dataframes_divided(language):
  # Set the directory path
  directory_path = 'C:\Research\labeled_features\{}'.format(language)

  # Create an empty list to store the train dataframes
  train_set = []

  # Create an empty list to store the test dataframes
  test = []

  # Number of files in directory
  num_files = len(os.listdir(directory_path))

  # Index of the last file that is lower than 80% of the total number of files rounded down
  first_train_file = int(np.floor(num_files * 0.8))

  # Iterate through all the files in the directory
  for i, file in enumerate(os.listdir(directory_path)):
    # Check if the file is a CSV file
    if file.endswith('.csv'):
      # Read the CSV file into a Pandas dataframe
      df = pd.read_csv(os.path.join(directory_path, file))

      # Drop rows where the 'label' column is not what was expected
      df = df[df['label'].isin([0, 1, 2, 3])]

      # Append the dataframe to the corresponding list
      if i < first_train_file:
        train_set.append(df)
      else:
        test.append(df)

  # Concatenate the test dataframes into a single dataframe
  df_test = pd.concat(test).drop(columns=['sample_id'])

  # Concatenate the train dataframes into a single dataframe
  df_train = pd.concat(train_set).drop(columns=['sample_id'])

  # Split the train dataframe into X and y
  X = df_train.iloc[:, :-1]
  y = df_train['label']

  # Split the test dataframe into X and y
  X_test = df_test.iloc[:, :-1]
  y_test = df_test['label']

  return X, y, X_test, y_test


In [5]:
# # Set dataframe x to include only the features at the specified indices by list 'indices'
# X = pd.concat([df_all_backup.iloc[:, 5:10], df_all_backup.iloc[:, 14], df_all_backup.iloc[:, 54]], axis=1)
# X

In [6]:
def create_model(df_all):
    # Split the data into features and target
    X = df_all.iloc[:, :-1]
    df_all_backup = df_all
    y = df_all['label']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create a linear regression model
    model = LinearRegression()

    # Fit the model to the training data
    model.fit(X_train, y_train)

    return model, X_train, y_train, X_test, y_test

In [7]:
def get_predictions(model, X_test):
    # Make predictions on the testing data
    unrounded_predictions = model.predict(X_test)

    # Round the predictions to the nearest integer
    predictions = np.round(unrounded_predictions)

    # Make all negative predictions 0
    predictions[predictions < 0] = 0

    return unrounded_predictions, predictions

In [8]:
def plot_conf_matrix(conf_matrix, language_model, language_test):
    # Convert the confusion matrix to a Pandas dataframe
    cm_df = pd.DataFrame(conf_matrix)#, index=['true 0', 'true 1', 'true 2', 'true 3'], columns=['pred 0', 'pred 1', 'pred 2', 'pred 3'])

    # Create a heatmap of the confusion matrix
    fig = px.imshow(cm_df, title='Confusion Matrix', text_auto=True)
    fig.update_layout(title='Confusion Matrix Predicted with {} model, on {} data'.format(language_model, language_test), xaxis_title='Predicted', yaxis_title='Expected')
    fig.show()

In [9]:
def plot_correlations(df_all, language_model, language_test):
    # Calculate the correlations between the columns and the label
    correlations = df_all.corr()['label'].iloc[:-1]

    # Create a scatter plot of the correlations
    fig = px.scatter(x=correlations.index, y=correlations, title='Correlations')
    fig.update_layout(title='Correlations between features and label, predicted with {} model, on {} data'.format(language_model, language_test), xaxis_title='Feature', yaxis_title='Correlation')
    fig.show()

    # Create a dataframe from the correlations
    correlations_df = pd.DataFrame(correlations)

    # correlations_df.style.background_gradient(cmap ='viridis')\
    # .set_properties(**{'font-size': '20px'})

In [10]:
def compute_correlation(unrounded_predictions, y_test):
    expected_predicted_df = pd.DataFrame()
    expected_predicted_df['expected'] = y_test
    expected_predicted_df['predicted'] = unrounded_predictions
    corr = expected_predicted_df.corr()['predicted'].iloc[0]
    return expected_predicted_df, corr

In [11]:
def plot_expected_vs_predicted(expected_predicted_df, language_model, language_test):
  fig = go.Figure()

  labels = expected_predicted_df['expected'].unique()

  for label in labels:
    fig.add_trace(go.Violin(
    x=expected_predicted_df['expected'][expected_predicted_df['expected'] == label],
    y=expected_predicted_df['predicted'][expected_predicted_df['expected'] == label],
    name=label,
    box_visible=True,
    meanline_visible=False,
    ))

  fig.update_layout(title='Expected vs Predicted, predicted with {} model, on {} data'.format(language_model, language_test), xaxis_title='Expected', yaxis_title='Predicted')
  fig.show()

In [12]:
def save_coefficients(model, X, corr, language):

    #Switch case based on the type of the model
    if type(model) == LinearRegression:
        model_type = 'linear_regression'
    # elif type(model) == sklearn.neighbors._classification.KNeighborsClassifier:
    #     model_type = 'knn_classifier'
        

    # Create a dataframe with the the coefficients and the intercept
    coefficients_df = pd.DataFrame()
    coefficients_df['feature'] = X.columns
    coefficients_df['coefficient'] = model.coef_

    # Add row in column 'feature' for the intercept and set the value to 'intercept'
    coefficients_df.loc[-1] = ['intercept', model.intercept_]

    # Add row for the correlation between the label and the features
    coefficients_df.loc[-2] = ['correlation', corr]

    filename = 'C:\Research\Results\{lang}\{model_type}coefficients_{date}_corr_{corr}.csv'.format(model_type=model_type, lang=language, date=datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), corr=corr)

    # Write the dataframe to a CSV file including the date and time in the filename, if the file already exists, throw an error
    coefficients_df.to_csv(filename, index=False)

In [13]:

def print_class_accuray(conf_matrix):
    # Calculate for each of the expected classes, what percentage of the predictions are correct
    correct_percentage_0 = conf_matrix[0,0] / np.sum(conf_matrix[0,:]) # 0
    correct_percentage_1 = conf_matrix[1,1] / np.sum(conf_matrix[1,:]) # 1
    correct_percentage_2 = conf_matrix[2,2] / np.sum(conf_matrix[2,:]) # 2
    correct_percentage_3 = conf_matrix[3,3] / np.sum(conf_matrix[3,:]) # 3

    # Print the percentages
    print("Percentage of correct predictions for class 0: {:.2f}%".format(correct_percentage_0 * 100))
    print("Percentage of correct predictions for class 1: {:.2f}%".format(correct_percentage_1 * 100))
    print("Percentage of correct predictions for class 2: {:.2f}%".format(correct_percentage_2 * 100))
    print("Percentage of correct predictions for class 3: {:.2f}%".format(correct_percentage_3 * 100))

    # Calculate the acuracy of predicting 0 or 1
    correct_percentage_0_1 = (conf_matrix[0,0] + conf_matrix[1,1]) / np.sum(conf_matrix[0:2,:]) # 0 or 1

    # Calculate the acuracy of predicting 2 or 3
    correct_percentage_2_3 = (conf_matrix[2,2] + conf_matrix[3,3]) / np.sum(conf_matrix[2:4,:]) # 2 or 3

    # Print the percentages
    print("Percentage of correct predictions for class 0 or 1: {:.2f}%".format(correct_percentage_0_1 * 100))
    print("Percentage of correct predictions for class 2 or 3: {:.2f}%".format(correct_percentage_2_3 * 100))



In [14]:
def duplicate_rows(df, label, times):
    # Get the rows where the label is 'label
    df_label = df[df['label'] == label]

    # Duplicate the rows
    df_label = pd.concat([df_label] * times)

    # Add the duplicated rows to the original dataframe
    df = pd.concat([df, df_label])

    return df

In [19]:
X_train_ES

Unnamed: 0,tl -250 to -100 self,tl -100 to -20 self,tl -20 to 20 self,tl 20 to 100 self,tl 100 to 250 self,th -250 to -100 self,th -100 to -20 self,th -20 to 20 self,th 20 to 100 self,th 100 to 250 self,...,tf -100 to -20 self,tf -20 to 20 self,tf 20 to 100 self,tf 100 to 250 self,tm -250 to -100 self,tm -100 to -20 self,tm -20 to 20 self,tm 20 to 100 self,tm 100 to 250 self,spectral_tilt
10570,0.172540,0.134069,0.058092,0.000000,0.030189,0.040897,0.000000,0.000000,0.006941,0.000000,...,0.000,0.00,0.0,0.000000,0.0,0.000,0.00,0.0,0.0,-3.009495e-06
86678,0.203638,0.358805,0.204471,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.155825,...,0.125,0.00,0.0,0.714286,0.0,0.000,0.00,0.0,0.0,-9.470217e-05
90586,0.029215,0.019095,0.051572,0.000000,0.002519,0.047591,0.176011,0.000000,0.000000,0.082482,...,0.250,0.00,0.0,1.000000,0.0,0.375,0.00,0.0,0.0,-5.795464e-07
57404,0.000000,0.079502,0.000000,0.000000,0.000000,0.000000,0.027136,0.095908,0.014032,0.000000,...,0.000,0.00,0.0,0.000000,0.0,0.000,0.00,0.0,0.0,-5.473439e-05
73581,0.000000,0.000000,0.000000,0.000000,0.058151,0.199344,0.234850,0.389273,0.289673,0.028884,...,0.125,0.25,1.0,0.571429,0.0,0.500,0.75,0.0,0.0,-1.232499e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0.105020,0.274729,0.000000,0.000000,0.000000,0.022756,0.000000,0.000000,0.000000,0.000000,...,0.375,0.00,0.0,0.000000,0.0,0.000,0.00,0.0,0.0,-2.901345e-07
54886,0.142836,0.130844,0.000000,0.000000,0.141108,0.000000,0.008045,0.000000,0.000000,0.071378,...,0.000,0.00,0.0,0.000000,0.0,0.000,0.00,0.0,0.0,-2.783941e-04
76820,0.000000,0.062464,0.000000,0.000000,0.000000,0.124501,0.000196,0.104713,0.324687,0.367437,...,1.000,1.00,1.0,0.500000,0.2,0.000,0.00,0.0,0.0,-5.218985e-04
860,0.093173,0.000000,0.049787,0.239506,0.135211,0.027402,0.044153,0.000000,0.120860,0.033874,...,0.000,0.25,0.0,0.000000,0.0,0.000,0.00,0.0,0.0,-2.518978e-05


# Get dataframes

In [21]:
# Get the data
df_all_ES, sample_ids = get_dataframe("ES")
# Create the model
model_ES, X_train_ES, y_train_ES, X_test_ES, y_test_ES = create_model(df_all_ES)

# Get the data
df_all_EN, sample_ids = get_dataframe("EN")

# Create the model
model_EN, X_train_EN, y_train_EN ,X_test_EN, y_test_EN = create_model(df_all_EN)

# Get the data
X_train_knn_ES, y_train_knn_ES, X_test_knn_ES, y_test_knn_ES = get_dataframes_knn("ES")

# Get the data
X_train_knn_EN, y_train_knn_EN, X_test_knn_EN, y_test_knn_EN = get_dataframes_knn("EN")

In [32]:
# Object to z-standarize data
scaler = StandardScaler()

# Get the data
df, sample_ids = get_dataframe("ES")

df_all_ES = scaler.fit_transform(df.iloc[:, :-1])
df_all_ES = pd.DataFrame(df_all_ES, columns=df.columns[:-1])
df_all_ES[df.columns[-1]] = df[df.columns[-1]]

# Duplicate rows where the label is 3
# df_all_ES = duplicate_rows(df_all_ES, 3, 2)

# Create the model
model_ES, X_train_ES, y_train_ES, X_test_ES, y_test_ES = create_model(df_all_ES)

# Get the data
df, sample_ids = get_dataframe("EN")

df_all_EN = scaler.fit_transform(df.iloc[:, :-1])
df_all_EN = pd.DataFrame(df_all_EN, columns=df.columns[:-1])
df_all_EN[df.columns[-1]] = df[df.columns[-1]]

# Duplicate rows where the label is 3
# df_all_EN = duplicate_rows(df_all_EN, 3, 2)

# Correlations

## ES

In [18]:
language_model = 'ES' # 'EN' or 'ES'
language_test = 'ES' # 'EN' or 'ES'

plot_correlations(df_all_ES, language_model, language_test)

## EN

In [22]:
language_model = 'EN' # 'EN' or 'ES'
language_test = 'EN' # 'EN' or 'ES'

plot_correlations(df_all_EN, language_model, language_test)

# Baseline

## ES

In [60]:
# Predict the test set as the most common label
y_test_pred_most_common = np.full(y_test_ES.shape, y_train_ES.mode()[0])

# Calculate the accuracy
accuracy = accuracy_score(y_test_ES, y_test_pred_most_common)

# Print the accuracy
print("Accuracy of the most common label model: {:.2f}%".format(accuracy * 100))

Accuracy of the most common label model: 38.09%


## EN

In [59]:
# Predict the test set as the most common label
y_test_pred_most_common = np.full(y_test_EN.shape, y_train_EN.mode()[0])

# Calculate the accuracy
accuracy = accuracy_score(y_test_EN, y_test_pred_most_common)

# Print the accuracy
print("Accuracy of the most common label model: {:.2f}%".format(accuracy * 100))

Accuracy of the most common label model: 38.24%


# Linear Regression

## ES

In [68]:
language_model = 'ES' # 'EN' or 'ES'
language_test = 'ES' # 'EN' or 'ES'

# Get the predictions
unrounded_predictions, predictions = get_predictions(model_ES, X_test_ES)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test_ES, predictions)

plot_conf_matrix(conf_matrix, language_model, language_test)

print_class_accuray(conf_matrix)

# Get the accuracy
accuracy = accuracy_score(y_test_ES, predictions)

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy * 100))

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_ES)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, language_model, language_test)


# Save the coefficients to a CSV file "C:\Research\Results\{lang}\coefficients_{date}_corr_{corr}.csv"
save_coefficients(model_ES, X_test_ES, corr, language_model)

Percentage of correct predictions for class 0: 0.36%
Percentage of correct predictions for class 1: 99.38%
Percentage of correct predictions for class 2: 0.79%
Percentage of correct predictions for class 3: 0.00%
Accuracy: 36.89%
Correlation between expected and predicted: 0.16893192688379732


## EN

In [69]:
language_model = 'EN' # 'EN' or 'ES'
language_test = 'EN' # 'EN' or 'ES'

# Get the predictions
unrounded_predictions, predictions = get_predictions(model_EN, X_test_EN)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test_EN, predictions)

plot_conf_matrix(conf_matrix, language_model, language_test)

print_class_accuray(conf_matrix)

# Get the accuracy
accuracy = accuracy_score(y_test_EN, predictions)

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy * 100))

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_EN)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, language_model, language_test)

# Get the accuracy
accuracy = accuracy_score(y_test_EN, predictions)

# Save the coefficients to a CSV file "C:\Research\Results\{lang}\coefficients_{date}_corr_{corr}.csv"
save_coefficients(model_EN, X_test_EN, corr, language_model)

Percentage of correct predictions for class 0: 0.62%
Percentage of correct predictions for class 1: 91.12%
Percentage of correct predictions for class 2: 14.03%
Percentage of correct predictions for class 3: 0.09%
Accuracy: 34.28%
Correlation between expected and predicted: 0.24331494551829383


## ES predicts EN

In [70]:
language_model = 'ES' # 'EN' or 'ES'
language_test = 'EN' # 'EN' or 'ES'

# Get the predictions
unrounded_predictions, predictions = get_predictions(model_ES, X_test_EN)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test_EN, predictions)

print_class_accuray(conf_matrix)

# Get the accuracy
accuracy = accuracy_score(y_test_EN, predictions)

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy * 100))

plot_conf_matrix(conf_matrix, language_model, language_test)

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_EN)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, language_model, language_test)

Percentage of correct predictions for class 0: 0.54%
Percentage of correct predictions for class 1: 99.38%
Percentage of correct predictions for class 2: 0.74%
Percentage of correct predictions for class 3: 0.00%
Accuracy: 34.56%


Correlation between expected and predicted: 0.16355322559135016


## EN predicts ES

In [71]:
language_model = 'EN' # 'EN' or 'ES'
language_test = 'ES' # 'EN' or 'ES'

# Get the predictions
unrounded_predictions, predictions = get_predictions(model_EN, X_test_ES)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test_ES, predictions)

print_class_accuray(conf_matrix)

# Get the accuracy
accuracy = accuracy_score(y_test_ES, predictions)

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy * 100))

plot_conf_matrix(conf_matrix, language_model, language_test)

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_ES)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, language_model, language_test)

Percentage of correct predictions for class 0: 0.44%
Percentage of correct predictions for class 1: 90.16%
Percentage of correct predictions for class 2: 13.36%
Percentage of correct predictions for class 3: 0.00%
Accuracy: 36.91%


Correlation between expected and predicted: 0.12295387875512082


# KNN Models

## EN

In [58]:
language_model = 'EN' # 'EN' or 'ES'
language_test = 'EN' # 'EN' or 'ES'

# Instantiate the KNeighborsClassifier object with the number of neighbors you want to consider
knn_model_EN = KNeighborsClassifier(n_neighbors=2)

# Fit the model to the training data
knn_model_EN.fit(X_train_knn_EN, y_train_knn_EN)
unrounded_predictions, predictions = get_predictions(knn_model_EN, X_test_knn_EN)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test_knn_EN, predictions)

print_class_accuray(conf_matrix)

# Get the accuracy
accuracy = accuracy_score(y_test_knn_EN, predictions)

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy * 100))

plot_conf_matrix(conf_matrix, language_model, language_test)

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_knn_EN)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, language_model, language_test)


# Save the coefficients to a CSV file "C:\Research\Results\{lang}\coefficients_{date}_corr_{corr}.csv"
# save_coefficients(knn_model_EN, X_test_EN, corr, language_model)

Percentage of correct predictions for class 0: 46.64%
Percentage of correct predictions for class 1: 37.55%
Percentage of correct predictions for class 2: 14.35%
Percentage of correct predictions for class 3: 3.37%
Accuracy: 35.72%


Correlation between expected and predicted: 0.014550766860974937


## ES

In [19]:
language_model = 'ES' # 'EN' or 'ES'
language_test = 'ES' # 'EN' or 'ES'

# Instantiate the KNeighborsClassifier object with the number of neighbors you want to consider
knn_model_ES = KNeighborsClassifier(n_neighbors=1)

# Fit the model to the training data
knn_model_ES.fit(X_train_knn_ES, y_train_knn_ES)
unrounded_predictions, predictions = get_predictions(knn_model_ES, X_test_knn_ES)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test_knn_ES, predictions)

print_class_accuray(conf_matrix)

# Get the accuracy
accuracy = accuracy_score(y_test_knn_ES, predictions)

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

plot_conf_matrix(conf_matrix, language_model, language_test)

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_knn_ES)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, language_model, language_test)

# Save the coefficients to a CSV file "C:\Research\Results\{lang}\coefficients_{date}_corr_{corr}.csv"
# save_coefficients(knn_model_ES, X_test_ES, corr, language_model)

Percentage of correct predictions for class 0: 31.41%
Percentage of correct predictions for class 1: 40.99%
Percentage of correct predictions for class 2: 28.80%
Percentage of correct predictions for class 3: 2.21%
Percentage of correct predictions for class 0 or 1: 36.65%
Percentage of correct predictions for class 2 or 3: 26.68%
Accuracy: 33.36%


Correlation between expected and predicted: 0.012312268725170538


# Data Exploration

In [58]:
# Function that given a number of miliseconds, prints the time in seconds
def print_time(t): # 60300
    seconds = t // 1000 # 60
    minutes = seconds // 60 # 1
    seconds = seconds % 60 # 0
    ms = (t % 1000) # 300
    # Print the time in seconds and fraction of seconds rounded to 2 decimals
    print("Time: {}:{}.{}".format(minutes, seconds, round(ms, 4)))

def minutes_to_milliseconds(minutes):
    return minutes * 60 * 1000

In [149]:
window_num = 5727
print_time(window_num * 10)


Time: 0:57.270


In [158]:
expected = 2
lower_bound = 0.4
upper_bound = 0.45

# Get row of dataframe where column expected has value 'expected' and predicted is between 'lower_bound' and 'upper_bound'
ids = [sample_ids[id] for id in (list(expected_predicted_df[(expected_predicted_df['expected'] == expected) & (expected_predicted_df['predicted'] > lower_bound) & (expected_predicted_df['predicted'] < upper_bound)].index))]

display(ids)

['ES_022_5727_right']