In [19]:
import os
import datetime

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from sklearn.model_selection import train_test_split

In [2]:
def get_dataframe(language):
  # Set the directory path
  directory_path = 'C:\Research\labeled_features\{}'.format(language)

  # Create an empty list to store the dataframes
  dataframes = []

  # Iterate through all the files in the directory
  for file in os.listdir(directory_path):
    # Check if the file is a CSV file
    if file.endswith('.csv'):
      # Read the CSV file into a Pandas dataframe
      df = pd.read_csv(os.path.join(directory_path, file))

      # Drop rows where the 'label' column is not what was expected
      df = df[df['label'].isin([0, 1, 2, 3])]

      # Append the dataframe to the list
      dataframes.append(df)

  # Concatenate all the dataframes into a single dataframe
  df_all = pd.concat(dataframes)

  # Model was predicting negative values, so I had to remove the negative values
  df_all = df_all[df_all['label'].isin([0, 1, 2, 3])]

  # Add column with the index of each row, reset the indices
  df_all = df_all.reset_index(drop=True)

  # Create a list where at each index is the sample_id of that row
  sample_ids = df_all['sample_id'].tolist()

  # Drop the 'sample_id' column
  return df_all.drop(columns=['sample_id'])

In [3]:
def get_predictions(model, X_test):
    # Make predictions on the testing data
    unrounded_predictions = model.predict(X_test)

    # Round the predictions to the nearest integer
    predictions = np.round(unrounded_predictions)

    # Make all negative predictions 0
    predictions[predictions < 0] = 0

    return unrounded_predictions, predictions

In [4]:
def plot_conf_matrix(y_test, predictions, language_model, language_test):
    # Calculate the confusion matrix
    cm = confusion_matrix(y_test, predictions)

    # Convert the confusion matrix to a Pandas dataframe
    cm_df = pd.DataFrame(cm)#, index=['true 0', 'true 1', 'true 2', 'true 3'], columns=['pred 0', 'pred 1', 'pred 2', 'pred 3'])

    # Create a heatmap of the confusion matrix
    fig = px.imshow(cm_df, title='Confusion Matrix', text_auto=True)
    fig.update_layout(title='Confusion Matrix Predicted with {} model, on {} data'.format(language_model, language_test), xaxis_title='Predicted', yaxis_title='Expected')
    fig.show()

In [5]:
def plot_correlations(df_all, language_model, language_test):
    # Calculate the correlations between the columns and the label
    correlations = df_all.corr()['label'].iloc[:-1]

    # Create a scatter plot of the correlations
    fig = px.scatter(x=correlations.index, y=correlations, title='Correlations')
    fig.update_layout(title='Correlations between features and label, predicted with {} model, on {} data'.format(language_model, language_test), xaxis_title='Feature', yaxis_title='Correlation')
    fig.show()

    # Create a dataframe from the correlations
    correlations_df = pd.DataFrame(correlations)

    # correlations_df.style.background_gradient(cmap ='viridis')\
    # .set_properties(**{'font-size': '20px'})

In [6]:
def compute_correlation(unrounded_predictions, y_test):
    expected_predicted_df = pd.DataFrame()
    expected_predicted_df['expected'] = y_test
    expected_predicted_df['predicted'] = unrounded_predictions
    corr = expected_predicted_df.corr()['predicted'].iloc[0]
    return expected_predicted_df, corr

In [7]:
def plot_expected_vs_predicted(expected_predicted_df, language_model, language_test):
  fig = go.Figure()

  labels = expected_predicted_df['expected'].unique()

  for label in labels:
    fig.add_trace(go.Violin(
    x=expected_predicted_df['expected'][expected_predicted_df['expected'] == label],
    y=expected_predicted_df['predicted'][expected_predicted_df['expected'] == label],
    name=label,
    box_visible=True,
    meanline_visible=False,
    ))

  fig.update_layout(title='Expected vs Predicted, predicted with {} model, on {} data'.format(language_model, language_test), xaxis_title='Expected', yaxis_title='Predicted')
  fig.show()

In [14]:

def print_class_accuray(conf_matrix):
    # Calculate for each of the expected classes, what percentage of the predictions are correct
    correct_percentage_0 = conf_matrix[0,0] / np.sum(conf_matrix[0,:]) # 0
    correct_percentage_1 = conf_matrix[1,1] / np.sum(conf_matrix[1,:]) # 1
    correct_percentage_2 = conf_matrix[2,2] / np.sum(conf_matrix[2,:]) # 2
    correct_percentage_3 = conf_matrix[3,3] / np.sum(conf_matrix[3,:]) # 3

    # Print the percentages
    print("Percentage of correct predictions for class 0: {:.2f}%".format(correct_percentage_0 * 100))
    print("Percentage of correct predictions for class 1: {:.2f}%".format(correct_percentage_1 * 100))
    print("Percentage of correct predictions for class 2: {:.2f}%".format(correct_percentage_2 * 100))
    print("Percentage of correct predictions for class 3: {:.2f}%".format(correct_percentage_3 * 100))

    # Calculate the acuracy of predicting 0 or 1
    correct_percentage_0_1 = (conf_matrix[0,0] + conf_matrix[1,1]) / np.sum(conf_matrix[0:2,:]) # 0 or 1

    # Calculate the acuracy of predicting 2 or 3
    correct_percentage_2_3 = (conf_matrix[2,2] + conf_matrix[3,3]) / np.sum(conf_matrix[2:4,:]) # 2 or 3

    # Print the percentages
    print("Percentage of correct predictions for class 0 or 1: {:.2f}%".format(correct_percentage_0_1 * 100))
    print("Percentage of correct predictions for class 2 or 3: {:.2f}%".format(correct_percentage_2_3 * 100))



In [8]:
# Get the dataframe
df_english = get_dataframe('EN')

In [21]:
# Create a TensorBoard object with a log directory
logdir = "C:/Users/carlo/Research/reduction-detection/logs/fit" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=logdir)

# Create the EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

In [25]:
X = df_english.iloc[:, :-1]
y = df_english['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the model
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(61,1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(32, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(16, kernel_size=3, activation='relu'))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))


# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test), callbacks=[early_stopping, tensorboard_callback])

# Evaluate the model on the test data
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.7160660624504089


In [26]:
language_model = 'EN'
language_test = 'EN'

unrounded_predictions, predictions = get_predictions(model, X_test)

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)

print_class_accuray(conf_matrix)

# Get the accuracy
accuracy = accuracy_score(y_test, predictions)

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

# Create a heatmap of the confusion matrix
fig = px.imshow(conf_matrix, title='Confusion Matrix', text_auto=True)
fig.update_layout(title='Confusion Matrix Predicted with {} model, on {} data'.format(language_model, language_test), xaxis_title='Predicted', yaxis_title='Expected')
fig.show()

Percentage of correct predictions for class 0: 2.40%
Percentage of correct predictions for class 1: 97.31%
Percentage of correct predictions for class 2: 5.05%
Percentage of correct predictions for class 3: 0.00%
Percentage of correct predictions for class 0 or 1: 52.09%
Percentage of correct predictions for class 2 or 3: 3.95%
Accuracy: 39.11%


In [41]:
# Calculate what percentage of the predictions are correct
accuracy = accuracy_score(y_test_class, y_pred_class)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Calculate for each of the expected classes, what percentage of the predictions are correct
correct_percentage_0 = conf_mat[0, 0] / np.sum(conf_mat[0, :]) # 0
correct_percentage_1 = conf_mat[1, 1] / np.sum(conf_mat[1, :]) # 1
correct_percentage_2 = conf_mat[2, 2] / np.sum(conf_mat[2, :]) # 2
correct_percentage_3 = conf_mat[3, 3] / np.sum(conf_mat[3, :]) # 3

print('Correct percentage for class 0: {:.2f}%'.format(correct_percentage_0 * 100))
print('Correct percentage for class 1: {:.2f}%'.format(correct_percentage_1 * 100))
print('Correct percentage for class 2: {:.2f}%'.format(correct_percentage_2 * 100))
print('Correct percentage for class 3: {:.2f}%'.format(correct_percentage_3 * 100))

Accuracy: 59.53%
Correct percentage for class 0: 58.10%
Correct percentage for class 1: 80.49%
Correct percentage for class 2: 34.76%
Correct percentage for class 3: 20.47%
