In [1]:
import os
import datetime

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [2]:
# Dataset language selector; this value is formatted into the input directory
# path and into the output CSV path used later in the notebook.
language = 'ES' # 'EN' or 'ES'

In [3]:
# Directory holding the labeled-feature CSV files for the selected language.
# A raw string keeps the Windows backslashes literal instead of relying on
# Python tolerating invalid escape sequences such as '\R' or '\l'.
directory_path = r'C:\Research\labeled_features\{}'.format(language)

# Label values that are kept; rows carrying any other label are dropped.
valid_labels = [0, 1, 2, 3]

# One filtered dataframe per CSV file
dataframes = []

# os.listdir returns entries in arbitrary order, so sort them: the row order of
# the combined frame then stays stable from run to run.
for file in sorted(os.listdir(directory_path)):
  # Only CSV files are data files
  if not file.endswith('.csv'):
    continue

  df = pd.read_csv(os.path.join(directory_path, file))

  # Drop rows where the 'label' column is not what was expected
  df = df[df['label'].isin(valid_labels)]

  dataframes.append(df)

# Fail with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the directory contains no CSV files.
if not dataframes:
  raise FileNotFoundError('No CSV files found in {}'.format(directory_path))

# Concatenate everything into a single dataframe. ignore_index gives the result
# a fresh unique index rather than repeating each file's own row numbers.
df_all = pd.concat(dataframes, ignore_index=True)

# Drop the first column of the combined frame itself (not of the last file
# read, whose column order could differ).
# NOTE(review): presumably this is the index column saved with the CSVs - confirm.
df_all = df_all.drop(columns=df_all.columns[0])

In [4]:
# Split the data into features and target. The features are selected by
# dropping 'label' by name, so the target cannot leak into X even if 'label'
# is not the last column of df_all.
X = df_all.drop(columns='label')
y = df_all['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an ordinary least-squares regression of the label on the features
model = LinearRegression()
model.fit(X_train, y_train)

# Continuous predictions on the held-out rows
unrounded_predictions = model.predict(X_test)

# Convert the continuous outputs to class labels: round to the nearest integer,
# then clip to the label range present in the data. A linear model is unbounded,
# so rounding alone can yield classes that do not exist (e.g. -1 or 4).
predictions = np.clip(np.round(unrounded_predictions), y.min(), y.max()).astype(int)


In [10]:
# Classes shown in the confusion matrix. Passing them explicitly fixes the
# matrix at 4x4 even when a class is absent from y_test/predictions; without
# this the matrix shape follows the observed values and the DataFrame
# construction below can fail. Predictions outside these classes are not counted.
class_labels = [0, 1, 2, 3]

# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Convert the confusion matrix to a Pandas dataframe with readable row/column names
cm_df = pd.DataFrame(
  cm,
  index=['true {}'.format(c) for c in class_labels],
  columns=['pred {}'.format(c) for c in class_labels],
)

# Create a heatmap of the confusion matrix
fig = px.imshow(cm_df, title='Confusion Matrix', text_auto=True)
fig.show()

# Correlation of every column with the label; the label's self-correlation is
# removed by name rather than by assuming 'label' is the last column.
correlations = df_all.corr()['label'].drop('label')

# Create a scatter plot of the correlations
fig = px.scatter(x=correlations.index, y=correlations, title='Correlations')
fig.show()

# Create a dataframe from the correlations
correlations_df = pd.DataFrame(correlations)

# Last expression of the cell: the styled table is what gets displayed
correlations_df.style.background_gradient(cmap ='viridis')\
.set_properties(**{'font-size': '20px'})

Unnamed: 0,label
tl -250 to -100 self,-0.050459
tl -100 to -20 self,-0.044225
tl -20 to 20 self,-0.041581
tl 20 to 100 self,-0.0374
tl 100 to 250 self,-0.027388
th -250 to -100 self,0.042472
th -100 to -20 self,0.041694
th -20 to 20 self,0.042494
th 20 to 100 self,0.050878
th 100 to 250 self,0.066693


In [18]:
# Show the fitted model's parameters: the coefficient array, then the intercept.
summary_template = "Coefficients: {} \nIntercept: {}"
print(summary_template.format(model.coef_, model.intercept_))

Coefficients: [-1.80722353e-01  2.82964199e-02 -1.19970821e-01  1.18085660e-01
  2.86790150e-02  1.03822700e-01  1.67223170e-02  6.89077546e-02
  1.89025959e-01  3.06206292e-01  7.57603947e-02  2.89924165e-02
 -4.74192179e-02  6.82613399e-02  1.17086596e-01 -2.53257483e-03
 -1.46761708e-04  7.43635514e-04  1.36431903e-03 -2.18229461e-03
 -9.34491857e-04  3.48276012e-04 -2.38498265e-04  3.05867134e-04
  1.09431750e-03  7.86779402e-02  1.32132867e-02 -4.18540557e-05
  2.23196877e-02  3.27635103e-03 -2.72372075e-03 -4.40657471e-03
 -8.87935234e-04 -1.11157700e-02 -6.63915420e-03  8.63145226e-02
  3.19184690e-02  4.00836610e-02  3.73187560e-02  1.31591328e-02
  2.36402634e-02 -8.19956456e-03  1.91060786e-02 -3.26314429e-03
 -3.35243782e-02  3.27360790e-03  3.74902942e-02  1.60102582e-01
  1.67409526e-03  1.68766698e-01 -4.14628545e-03 -6.18346112e-04
  1.14599037e-02 -1.07671548e-03  4.42154839e-03 -4.11604126e+02
 -1.22411149e+03 -1.13299428e+03 -1.55624037e+03 -5.74458424e+03] 
Intercept

In [19]:
# Pair each held-out true label with the model's continuous (unrounded) prediction.
expected_predicted_df = pd.DataFrame({
  'expected': y_test,
  'predicted': unrounded_predictions,
})

# Pearson correlation between true labels and predictions, looked up by
# row/column name instead of by position in the correlation matrix.
corr = expected_predicted_df.corr().loc['expected', 'predicted']
print("Correlation between expected and predicted: {}".format(corr))

Correlation between expected and predicted: 0.14013283923150366


In [20]:
fig = go.Figure()

expected_col = expected_predicted_df['expected']
predicted_col = expected_predicted_df['predicted']

# Distinct expected labels, in order of first appearance
labels = expected_col.unique()

# One violin per expected label, showing the distribution of the predicted
# values for the rows carrying that label.
for label in labels:
  is_label = expected_col == label
  fig.add_trace(go.Violin(
    x=expected_col[is_label],
    y=predicted_col[is_label],
    name=label,
    box_visible=True,
    meanline_visible=False,
  ))

fig.show()

In [21]:
# Mean squared error between the 'expected' and 'predicted' columns
expected_values = expected_predicted_df['expected']
predicted_values = expected_predicted_df['predicted']
mse = mean_squared_error(expected_values, predicted_values)
print("MSE: {}".format(mse))

MSE: 0.6796406289608673


In [22]:
# Tabulate the fitted model: one row per feature column with its coefficient.
coefficients_df = pd.DataFrame({
  'feature': X.columns,
  'coefficient': model.coef_,
})

# Append two summary rows under negative index labels, which cannot collide
# with the 0..n-1 labels of the feature rows.
# Row -1: the model intercept.
coefficients_df.loc[-1] = ['intercept', model.intercept_]

# Row -2: the value of `corr`, i.e. the expected-vs-predicted correlation
# (a single number for the whole model, not a per-feature correlation).
coefficients_df.loc[-2] = ['correlation', corr]

In [23]:
# Display the coefficient table (bare last expression, so the notebook renders it).
coefficients_df

Unnamed: 0,feature,coefficient
0,tl -250 to -100 self,-0.180722
1,tl -100 to -20 self,0.028296
2,tl -20 to 20 self,-0.119971
3,tl 20 to 100 self,0.118086
4,tl 100 to 250 self,0.028679
...,...,...
57,pd -20 to 20 self,-1132.994283
58,pd 20 to 100 self,-1556.240372
59,pd 100 to 250 self,-5744.584243
-1,intercept,0.963488


In [25]:
# Output path encodes the language, the current date/time and the correlation
# value. A raw string keeps the Windows backslashes literal instead of relying
# on Python tolerating invalid escape sequences such as '\R' or '\c'.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = r'C:\Research\Results\{lang}\coefficients_{date}_corr_{corr}.csv'.format(lang=language, date=timestamp, corr=corr)

# to_csv silently overwrites by default, so enforce the intended
# "error if the file already exists" behavior explicitly.
if os.path.exists(filename):
  raise FileExistsError('Refusing to overwrite existing file: {}'.format(filename))

# Write the dataframe to a CSV file (the index is not written)
coefficients_df.to_csv(filename, index=False)

In [None]:
# TODO: train a neural network with TensorFlow on X_train, y_train
