In [15]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Directory holding the labeled feature CSVs. A raw string keeps the Windows
# backslashes literal: '\R', '\l' and '\E' are invalid escape sequences that
# trigger a SyntaxWarning on Python 3.12+ when written in a normal string.
directory_path = r'C:\Research\labeled_features\ES'

# Class labels kept for modelling; rows with any other label value are discarded
valid_labels = [0, 1, 2, 3]

# Collect one filtered dataframe per CSV file
dataframes = []

# Sort the listing: os.listdir returns entries in arbitrary order, and the
# concatenated row order determines which rows land in a seeded train/test split.
for file in sorted(os.listdir(directory_path)):
  # Skip anything that is not a CSV file
  if not file.endswith('.csv'):
    continue

  # Read the CSV file into a Pandas dataframe
  df = pd.read_csv(os.path.join(directory_path, file))

  # Keep only the rows whose 'label' is one of the valid class labels
  dataframes.append(df[df['label'].isin(valid_labels)])

# Fail with an explicit message rather than pd.concat's generic
# "No objects to concatenate" (same exception type, ValueError)
if not dataframes:
  raise ValueError(f'No CSV files found in {directory_path!r}')

# Concatenate everything into a single dataframe; ignore_index gives each row a
# unique index instead of repeating every file's 0..n-1 range
df_all = pd.concat(dataframes, ignore_index=True)

# Drop the first column, looked up on df_all itself rather than on the loop
# variable left over from the last file.
# NOTE(review): presumably this is the saved row-index column - confirm against the CSVs.
df_all = df_all.drop(columns=df_all.columns[0])

In [3]:
# Split the data into features and target.
# Features are selected by position (the first 25 columns); 'label' is removed
# explicitly so the target cannot leak into X if it sits inside that range.
X = df_all.iloc[:, :25].drop(columns='label', errors='ignore')
y = df_all['label']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an ordinary least-squares regression on the numeric class labels
model = LinearRegression()
model.fit(X_train, y_train)

# Continuous predictions for the held-out rows
predictions = model.predict(X_test)

# Turn the regression output into class labels: round to the nearest integer,
# then clip to the observed label range. Without the clip, a rounded value such
# as -1 or 4 would show up as an extra class in any confusion matrix built from
# these predictions.
predictions = np.clip(np.round(predictions), y.min(), y.max()).astype(int)


In [22]:
# Classes shown in the confusion matrix. Passing them explicitly pins the matrix
# to 4x4 even when a class is absent from y_test/predictions; otherwise the
# matrix shape would vary and the DataFrame construction below could fail.
class_labels = [0, 1, 2, 3]

# Calculate the confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Convert the confusion matrix to a labelled Pandas dataframe
cm_df = pd.DataFrame(
    cm,
    index=[f'true {c}' for c in class_labels],
    columns=[f'pred {c}' for c in class_labels],
)

# Create a heatmap of the confusion matrix
fig = px.imshow(cm_df, title='Confusion Matrix', text_auto=True)
fig.show()

# Correlation of each of the first 25 columns with the label.
# select_dtypes keeps corr() from raising on non-numeric columns (pandas >= 2.0),
# and 'label' itself is dropped so its trivial self-correlation of 1.0 is not
# reported alongside the features.
correlations = (
    df_all.select_dtypes(include='number')
    .corr()['label']
    .iloc[:25]
    .drop('label', errors='ignore')
)

# Create a scatter plot of the correlations
fig = px.scatter(x=correlations.index, y=correlations, title='Correlations')
fig.update_layout(xaxis_title='feature', yaxis_title='correlation with label')
fig.show()

# Create a dataframe from the correlations
correlations_df = pd.DataFrame(correlations)

# Last expression of the cell: render the table with a colour gradient and larger font
(
    correlations_df.style
    .background_gradient(cmap='viridis')
    .set_properties(**{'font-size': '20px'})
)

Unnamed: 0,label
lp -250 to -100,-0.052713
lp -100 to -20,-0.041275
lp -20 to 20,-0.036436
lp 20 to 100,-0.028906
lp 100 to 250,-0.010725
hp -250 to -100,0.023462
hp -100 to -20,0.030942
hp -20 to 20,0.034571
hp 20 to 100,0.045891
hp 100 to 250,0.068894


In [17]:
# Display the plain correlation table (the Styler built earlier was never assigned back)
correlations_df

Unnamed: 0,label
lp -250 to -100,-0.052713
lp -100 to -20,-0.041275
lp -20 to 20,-0.036436
lp 20 to 100,-0.028906
lp 100 to 250,-0.010725
hp -250 to -100,0.023462
hp -100 to -20,0.030942
hp -20 to 20,0.034571
hp 20 to 100,0.045891
hp 100 to 250,0.068894
