In [43]:
import os
import datetime

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

In [2]:
def get_dataframe(language):
  # Set the directory path
  directory_path = 'C:\Research\labeled_features\{}'.format(language)

  # Create an empty list to store the dataframes
  dataframes = []

  # Iterate through all the files in the directory
  for file in os.listdir(directory_path):
    # Check if the file is a CSV file
    if file.endswith('.csv'):
      # Read the CSV file into a Pandas dataframe
      df = pd.read_csv(os.path.join(directory_path, file))

      # Drop rows where the 'label' column is not what was expected
      df = df[df['label'].isin([0, 1, 2, 3])]

      # Append the dataframe to the list
      dataframes.append(df)

  # Concatenate all the dataframes into a single dataframe
  df_all = pd.concat(dataframes)

  # Model was predicting negative values, so I had to remove the negative values
  df_all = df_all[df_all['label'].isin([0, 1, 2, 3])]

  # Add column with the index of each row, reset the indices
  df_all = df_all.reset_index(drop=True)

  # Create a list where at each index is the sample_id of that row
  sample_ids = df_all['sample_id'].tolist()

  # Drop the 'sample_id' column
  return df_all.drop(columns=['sample_id']), sample_ids

In [21]:
def get_dataframes_knn(language):
  # Set the directory path
  directory_path = 'C:\Research\labeled_features\{}'.format(language)

  # Create an empty list to store the train dataframes
  train_set = []

  # Create an empty list to store the test dataframes
  test = []

  # Number of files in directory
  num_files = len(os.listdir(directory_path))

  # Index of the last file that is lower than 80% of the total number of files rounded down
  first_train_file = int(np.floor(num_files * 0.8))

  # Iterate through all the files in the directory
  for i, file in enumerate(os.listdir(directory_path)):
    # Check if the file is a CSV file
    if file.endswith('.csv'):
      # Read the CSV file into a Pandas dataframe
      df = pd.read_csv(os.path.join(directory_path, file))

      # Drop rows where the 'label' column is not what was expected
      df = df[df['label'].isin([0, 1, 2, 3])]

      # Append the dataframe to the corresponding list
      if i < first_train_file:
        train_set.append(df)
      else:
        test.append(df)

  # Concatenate the test dataframes into a single dataframe
  df_test = pd.concat(test).drop(columns=['sample_id'])

  # Concatenate the train dataframes into a single dataframe
  df_train = pd.concat(train_set).drop(columns=['sample_id'])

  # Split the train dataframe into X and y
  X = df_train.iloc[:, :-1]
  y = df_train['label']

  # Split the test dataframe into X and y
  X_test = df_test.iloc[:, :-1]
  y_test = df_test['label']

  return X, y, X_test, y_test


In [3]:
# # Set dataframe x to include only the features at the specified indices by list 'indices'
# X = pd.concat([df_all_backup.iloc[:, 5:10], df_all_backup.iloc[:, 14], df_all_backup.iloc[:, 54]], axis=1)
# X

In [4]:
def create_model(df_all):
    # Split the data into features and target
    X = df_all.iloc[:, :-1]
    df_all_backup = df_all
    y = df_all['label']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create a linear regression model
    model = LinearRegression()

    # Fit the model to the training data
    model.fit(X_train, y_train)

    return model, X_train, y_train, X_test, y_test

In [5]:
def get_predictions(model, X_test):
    # Make predictions on the testing data
    unrounded_predictions = model.predict(X_test)

    # Round the predictions to the nearest integer
    predictions = np.round(unrounded_predictions)

    # Make all negative predictions 0
    predictions[predictions < 0] = 0

    return unrounded_predictions, predictions

In [6]:
def plot_conf_matrix(y_test, predictions, language_model, language_test):
    # Calculate the confusion matrix
    cm = confusion_matrix(y_test, predictions)

    # Convert the confusion matrix to a Pandas dataframe
    cm_df = pd.DataFrame(cm)#, index=['true 0', 'true 1', 'true 2', 'true 3'], columns=['pred 0', 'pred 1', 'pred 2', 'pred 3'])

    # Create a heatmap of the confusion matrix
    fig = px.imshow(cm_df, title='Confusion Matrix', text_auto=True)
    fig.update_layout(title='Confusion Matrix Predicted with {} model, on {} data'.format(language_model, language_test), xaxis_title='Predicted', yaxis_title='Expected')
    fig.show()

In [7]:
def plot_correlations(df_all, language_model, language_test):
    # Calculate the correlations between the columns and the label
    correlations = df_all.corr()['label'].iloc[:-1]

    # Create a scatter plot of the correlations
    fig = px.scatter(x=correlations.index, y=correlations, title='Correlations')
    fig.update_layout(title='Correlations between features and label, predicted with {} model, on {} data'.format(language_model, language_test), xaxis_title='Feature', yaxis_title='Correlation')
    fig.show()

    # Create a dataframe from the correlations
    correlations_df = pd.DataFrame(correlations)

    # correlations_df.style.background_gradient(cmap ='viridis')\
    # .set_properties(**{'font-size': '20px'})

In [8]:
def compute_correlation(unrounded_predictions, y_test):
    expected_predicted_df = pd.DataFrame()
    expected_predicted_df['expected'] = y_test
    expected_predicted_df['predicted'] = unrounded_predictions
    corr = expected_predicted_df.corr()['predicted'].iloc[0]
    return expected_predicted_df, corr

In [9]:
def plot_expected_vs_predicted(expected_predicted_df, language_model, language_test):
  fig = go.Figure()

  labels = expected_predicted_df['expected'].unique()

  for label in labels:
    fig.add_trace(go.Violin(
    x=expected_predicted_df['expected'][expected_predicted_df['expected'] == label],
    y=expected_predicted_df['predicted'][expected_predicted_df['expected'] == label],
    name=label,
    box_visible=True,
    meanline_visible=False,
    ))

  fig.update_layout(title='Expected vs Predicted, predicted with {} model, on {} data'.format(language_model, language_test), xaxis_title='Expected', yaxis_title='Predicted')
  fig.show()

In [10]:
def save_coefficients(model, X, corr, language):

    #Switch case based on the type of the model
    if type(model) == LinearRegression:
        model_type = 'linear_regression'
    # elif type(model) == sklearn.neighbors._classification.KNeighborsClassifier:
    #     model_type = 'knn_classifier'
        

    # Create a dataframe with the the coefficients and the intercept
    coefficients_df = pd.DataFrame()
    coefficients_df['feature'] = X.columns
    coefficients_df['coefficient'] = model.coef_

    # Add row in column 'feature' for the intercept and set the value to 'intercept'
    coefficients_df.loc[-1] = ['intercept', model.intercept_]

    # Add row for the correlation between the label and the features
    coefficients_df.loc[-2] = ['correlation', corr]

    filename = 'C:\Research\Results\{lang}\{model_type}coefficients_{date}_corr_{corr}.csv'.format(model_type=model_type, lang=language, date=datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), corr=corr)

    # Write the dataframe to a CSV file including the date and time in the filename, if the file already exists, throw an error
    coefficients_df.to_csv(filename, index=False)

# KNN

In [11]:
from sklearn.metrics import pairwise_distances

In [38]:
# Calculate pairwise Euclidean distances between all samples in the test set
distances = pairwise_distances(X_test, metric='euclidean')

# Set the diagonal to a large value to exclude the sample itself
np.fill_diagonal(distances, np.inf)

# Find the indices of the closest sample in the test set to each sample
closest_samples = np.argmin(distances, axis=1)

# Driver

In [12]:
language_model = 'ES' # 'EN' or 'ES'
language_test = 'ES' # 'EN' or 'ES'

# Get the data
df_all, sample_ids = get_dataframe(language_model)
# Create the model
model_ES, X_train_ES, y_train_ES, X_test_ES, y_test_ES = create_model(df_all)
# Get the predictions
unrounded_predictions, predictions = get_predictions(model_ES, X_test_ES)

plot_conf_matrix(y_test_ES, predictions, language_model, language_test)
plot_correlations(df_all, language_model, language_test)

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_ES)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, sample_ids, language_model, language_test)

# Save the coefficients to a CSV file "C:\Research\Results\{lang}\coefficients_{date}_corr_{corr}.csv"
save_coefficients(model_ES, X_test_ES, corr, language_model)

Correlation between expected and predicted: 0.13872989235374736


In [13]:
# Create the model
model_EN, X_train_EN, y_train_EN ,X_test_EN, y_test_EN = create_model(df_all)
# Get the predictions
unrounded_predictions, predictions = get_predictions(model_EN, X_test_EN)

plot_conf_matrix(y_test_EN, predictions, language_model, language_test)
plot_correlations(df_all, language_model, language_test)

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_EN)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, sample_ids, language_model, language_test)

# Save the coefficients to a CSV file "C:\Research\Results\{lang}\coefficients_{date}_corr_{corr}.csv"
save_coefficients(model_EN, X_test_EN, corr, language_model)

Correlation between expected and predicted: 0.21506412010982734


In [14]:
language_model = 'ES' # 'EN' or 'ES'
language_test = 'EN' # 'EN' or 'ES'

# Get the predictions
unrounded_predictions, predictions = get_predictions(model_ES, X_test_EN)

plot_conf_matrix(y_test_EN, predictions, language_model, language_test)
plot_correlations(df_all, language_model, language_test)

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_EN)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, sample_ids, language_model, language_test)

Correlation between expected and predicted: 0.15659200293006778


In [15]:
language_model = 'EN' # 'EN' or 'ES'
language_test = 'ES' # 'EN' or 'ES'

# Get the predictions
unrounded_predictions, predictions = get_predictions(model_EN, X_test_ES)

plot_conf_matrix(y_test_ES, predictions, language_model, language_test)
plot_correlations(df_all, language_model, language_test)

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_ES)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, sample_ids, language_model, language_test)

Correlation between expected and predicted: 0.11311212397303713


# KNN Models

In [26]:
print(X_train_knn_EN.shape)
print(y_train_knn_EN.shape)
print(X_test_knn_EN.shape)
print(y_test_knn_EN.shape)

(105318, 61)
(105318,)
(19569, 61)
(19569,)


In [38]:
language_model = 'EN' # 'EN' or 'ES'
language_test = 'EN' # 'EN' or 'ES'

# Get the data
X_train_knn_EN, y_train_knn_EN, X_test_knn_EN, y_test_knn_EN = get_dataframes_knn(language_model)

# Instantiate the KNeighborsClassifier object with the number of neighbors you want to consider
knn_model_EN = KNeighborsClassifier(n_neighbors=2)

# Fit the model to the training data
knn_model_EN.fit(X_train_knn_EN, y_train_knn_EN)
unrounded_predictions, predictions = get_predictions(knn_model_EN, X_test_knn_EN)

plot_conf_matrix(y_test_knn_EN, predictions, language_model, language_test)

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_knn_EN)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, sample_ids, language_model, language_test)

# Save the coefficients to a CSV file "C:\Research\Results\{lang}\coefficients_{date}_corr_{corr}.csv"
# save_coefficients(knn_model_EN, X_test_EN, corr, language_model)

Correlation between expected and predicted: 0.014550766860974937


In [39]:
# Calculate accuracy of the model
accuracy = knn_model_EN.score(X_test_knn_EN, y_test_knn_EN)
accuracy

0.3571976084623639

In [17]:
language_model = 'ES' # 'EN' or 'ES'
language_test = 'ES' # 'EN' or 'ES'

# Instantiate the KNeighborsClassifier object with the number of neighbors you want to consider
knn_model_ES = KNeighborsClassifier(n_neighbors=1)

# Fit the model to the training data
knn_model_ES.fit(X_train_ES, y_train_ES)
unrounded_predictions, predictions = get_predictions(knn_model_ES, X_test_ES)

plot_conf_matrix(y_test_ES, predictions, language_model, language_test)

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_ES)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, sample_ids, language_model, language_test)

# Save the coefficients to a CSV file "C:\Research\Results\{lang}\coefficients_{date}_corr_{corr}.csv"
# save_coefficients(knn_model_ES, X_test_ES, corr, language_model)

Correlation between expected and predicted: 0.8045132518180177


# Feedforward Neural Network

In [40]:
# Create feed forward model that works with a classification problem
model_ffnn_EN = Sequential()

# Add the input layer
model_ffnn_EN.add(Dense(61, input_dim=61, activation='relu'))

# Add the hidden layer
model_ffnn_EN.add(Dense(61, activation='relu'))

# Add the hidden layer
model_ffnn_EN.add(Dense(61, activation='relu'))

# Add the hidden layer
model_ffnn_EN.add(Dense(61, activation='relu'))

# Add the hidden layer
model_ffnn_EN.add(Dense(4, activation='relu'))

# Add the output layer
model_ffnn_EN.add(Dense(1, activation='sigmoid'))

# Compile the model
model_ffnn_EN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
batch_size = 32
epochs = 5
model_ffnn_EN.fit(X_train_EN, y_train_EN, batch_size=batch_size, epochs=epochs, validation_data=(X_test_EN, y_test_EN))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x20ad002f0a0>

In [44]:
model = Sequential()

# Add a 1D convolutional layer with 64 filters and a kernel size of 3
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(None, 61)))

# Add a max pooling layer with a pool size of 2
model.add(MaxPooling1D(pool_size=2))

# Add a 1D convolutional layer with 32 filters and a kernel size of 3
model.add(Conv1D(32, kernel_size=3, activation='relu'))

# Add a max pooling layer with a pool size of 2
model.add(MaxPooling1D(pool_size=2))

# Add a 1D convolutional layer with 16 filters and a kernel size of 3
model.add(Conv1D(16, kernel_size=3, activation='relu'))

# Flatten the output of the convolutional layers
model.add(Flatten())

# Add a dropout layer with a rate of 0.2 to prevent overfitting
model.add(Dropout(0.2))

# Add a dense layer with 64 units and a 'relu' activation
model.add(Dense(64, activation='relu'))

# Add a dropout layer with a rate of 0.2 to prevent overfitting
model.add(Dropout(0.2))

# Add the output layer
model.add(Dense(4, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

ValueError: The last dimension of the inputs to a Dense layer should be defined. Found None. Full input shape received: (None, None)

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from sklearn.model_selection import train_test_split

# Prepare the data
X = ... # input data, a numpy array of shape (num_samples, num_time_steps, num_features)
y = ... # labels, a numpy array of shape (num_samples,)
y = to_categorical(y) # one-hot encode the labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the model
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(None, 61)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(32, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(16, kernel_size=3, activation='relu'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(4, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))

# Evaluate the model on the test data
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print('Test accuracy:', test_acc)


In [63]:
# Get the predictions
unrounded_predictions, predictions = get_predictions(model_ffnn_EN, X_test_EN)

plot_conf_matrix(y_test_EN, predictions, language_model, language_test)
plot_correlations(df_all, language_model, language_test)

expected_predicted_df, corr = compute_correlation(unrounded_predictions, y_test_EN)
print("Correlation between expected and predicted: {}".format(corr))

plot_expected_vs_predicted(expected_predicted_df, sample_ids, language_model, language_test)



Correlation between expected and predicted: 0.3690639298163541


# Data Exploration

In [58]:
# Function that given a number of miliseconds, prints the time in seconds
def print_time(t): # 60300
    seconds = t // 1000 # 60
    minutes = seconds // 60 # 1
    seconds = seconds % 60 # 0
    ms = (t % 1000) # 300
    # Print the time in seconds and fraction of seconds rounded to 2 decimals
    print("Time: {}:{}.{}".format(minutes, seconds, round(ms, 4)))

def minutes_to_milliseconds(minutes):
    return minutes * 60 * 1000

In [149]:
window_num = 5727
print_time(window_num * 10)


Time: 0:57.270


In [158]:
expected = 2
lower_bound = 0.4
upper_bound = 0.45

# Get row of dataframe where column expected has value 'expected' and predicted is between 'lower_bound' and 'upper_bound'
ids = [sample_ids[id] for id in (list(expected_predicted_df[(expected_predicted_df['expected'] == expected) & (expected_predicted_df['predicted'] > lower_bound) & (expected_predicted_df['predicted'] < upper_bound)].index))]

display(ids)

['ES_022_5727_right']