In [1]:
import os
import datetime

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [2]:
language = 'EN' # 'EN' or 'ES'

In [3]:
# Set the directory path
directory_path = 'C:\Research\labeled_features\{}'.format(language)

# Create an empty list to store the dataframes
dataframes = []

# Iterate through all the files in the directory
for file in os.listdir(directory_path):
  # Check if the file is a CSV file
  if file.endswith('.csv'):
    # Read the CSV file into a Pandas dataframe
    df = pd.read_csv(os.path.join(directory_path, file))

    # Drop rows where the 'label' column is not what was expected
    df = df[df['label'].isin([0, 1, 2, 3])]

    # Append the dataframe to the list
    dataframes.append(df)

# Concatenate all the dataframes into a single dataframe
df_all = pd.concat(dataframes)
df_all = df_all.drop(df.columns[0], axis=1)

# Model was predicting negative values, so I had to remove the negative values
df_all = df_all[df_all['label'].isin([0, 1, 2, 3])]

In [4]:
# Split the data into features and target
X = df_all.iloc[:, :-1]
y = df_all['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
unrounded_predictions = model.predict(X_test)

# Round the predictions to the nearest integer
predictions = np.round(unrounded_predictions)


In [5]:
# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Convert the confusion matrix to a Pandas dataframe
cm_df = pd.DataFrame(cm, index=['true 0', 'true 1', 'true 2', 'true 3'], columns=['pred 0', 'pred 1', 'pred 2', 'pred 3'])

# Create a heatmap of the confusion matrix
fig = px.imshow(cm_df, title='Confusion Matrix', text_auto=True)
fig.show()

ValueError: Shape of passed values is (5, 5), indices imply (4, 4)

In [6]:
# Calculate the correlations between the columns and the label
correlations = df_all.corr()['label'].iloc[:-1]

# Create a scatter plot of the correlations
fig = px.scatter(x=correlations.index, y=correlations, title='Correlations')
fig.show()

# Create a dataframe from the correlations
correlations_df = pd.DataFrame(correlations)

correlations_df.style.background_gradient(cmap ='viridis')\
.set_properties(**{'font-size': '20px'})

Unnamed: 0,label
tl -100 to -20 self,-0.059721
tl -20 to 20 self,-0.048853
tl 20 to 100 self,-0.05094
tl 100 to 250 self,-0.050754
th -250 to -100 self,0.115741
th -100 to -20 self,0.128921
th -20 to 20 self,0.119998
th 20 to 100 self,0.128869
th 100 to 250 self,0.133758
vo -250 to -100 self,0.029369


In [60]:
print("Coefficients: {} \nIntercept: {}".format(model.coef_, model.intercept_))

Coefficients: [ 1.86538406e-01  2.03263505e-02  1.74239826e-01  2.43200069e-01
  7.73824423e-01  4.69897622e-01  6.68665825e-02  3.64627456e-01
  7.09616785e-01  7.75325561e-02  8.22206595e-02  4.93763766e-02
  1.56919130e-02  1.90471570e-01  1.42821359e-03  3.54657742e-03
 -3.68647547e-03  4.78254645e-03  1.13238240e-02  4.81648336e-04
 -6.62785925e-04 -2.97025028e-04  7.87540614e-04  5.22032380e-03
  5.66037026e-03 -6.52439519e-02 -4.59829098e-02  3.57434087e-02
  3.04246832e-01 -6.12583269e-03 -8.59126053e-03  5.04481459e-03
 -1.05826935e-02 -2.76600706e-02  8.62237834e-02  7.69505393e-02
  7.09739745e-02  6.87237362e-02  8.77058331e-02 -1.65130389e-02
 -3.35241303e-02 -1.84029456e-03 -2.68936480e-02 -1.88809632e-02
  9.61588113e-02  4.92489759e-02  1.61863937e-01  1.11315618e-01
  2.11371203e-01 -1.06517528e-02 -1.31776588e-02 -1.51565593e-02
 -8.83123802e-03 -3.14435432e-03 -3.68522690e+02  4.55910090e+02
  1.07018732e+03  1.59520816e+03 -3.77551271e+01] 
Intercept: 1.135087332152

In [61]:
expected_predicted_df = pd.DataFrame()
expected_predicted_df['expected'] = y_test
expected_predicted_df['predicted'] = unrounded_predictions
expected_predicted_df.corr()
corr = expected_predicted_df.corr()['predicted'].iloc[0]
print("Correlation between expected and predicted: {}".format(corr))

Correlation between expected and predicted: 0.23467662577068293


In [62]:
fig = go.Figure()

labels = expected_predicted_df['expected'].unique()

for label in labels:
  fig.add_trace(go.Violin(x=expected_predicted_df['expected'][expected_predicted_df['expected'] == label],
  y=expected_predicted_df['predicted'][expected_predicted_df['expected'] == label],
  name=label,
  box_visible=True,
  meanline_visible=False))

fig.show()

In [63]:
# Calculate the MSE between the 'expected' and 'predicted' columns
mse = mean_squared_error(expected_predicted_df['expected'], expected_predicted_df['predicted'])
print("MSE: {}".format(mse))

MSE: 0.724673200020911


In [64]:
# Create a dataframe with the the coefficients and the intercept
coefficients_df = pd.DataFrame()
coefficients_df['feature'] = X.columns
coefficients_df['coefficient'] = model.coef_

# Add row in column 'feature' for the intercept and set the value to 'intercept'
coefficients_df.loc[-1] = ['intercept', model.intercept_]

# Add row for the correlation between the label and the features
coefficients_df.loc[-2] = ['correlation', corr]

In [65]:
coefficients_df

Unnamed: 0,feature,coefficient
0,tl -100 to -20 self,0.186538
1,tl -20 to 20 self,0.020326
2,tl 20 to 100 self,0.174240
3,tl 100 to 250 self,0.243200
4,th -250 to -100 self,0.773824
...,...,...
56,pd -20 to 20 self,1070.187317
57,pd 20 to 100 self,1595.208158
58,pd 100 to 250 self,-37.755127
-1,intercept,1.135087


In [66]:
filename = 'C:\Research\Results\{lang}\coefficients_{date}_corr_{corr}.csv'.format(lang=language, date=datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), corr=corr)

# Write the dataframe to a CSV file including the date and time in the filename, if the file already exists, throw an error
coefficients_df.to_csv(filename, index=False)