In [84]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [2]:
# Set the directory path
directory_path = 'C:\Research\labeled_features\ES'

# Create an empty list to store the dataframes
dataframes = []

# Iterate through all the files in the directory
for file in os.listdir(directory_path):
  # Check if the file is a CSV file
  if file.endswith('.csv'):
    # Read the CSV file into a Pandas dataframe
    df = pd.read_csv(os.path.join(directory_path, file))

    # Drop rows where the 'label' column is not what was expected
    df = df[df['label'].isin([0, 1, 2, 3])]

    # Append the dataframe to the list
    dataframes.append(df)

# Concatenate all the dataframes into a single dataframe
df_all = pd.concat(dataframes)
df_all = df_all.drop(df.columns[0], axis=1)

In [36]:
# Split the data into features and target
X = df_all.iloc[:, :25]
y = df_all['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
unrounded_predictions = model.predict(X_test)

# Round the predictions to the nearest integer
predictions = np.round(unrounded_predictions)


In [37]:
# Calculate the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Convert the confusion matrix to a Pandas dataframe
cm_df = pd.DataFrame(cm, index=['true 0', 'true 1', 'true 2', 'true 3'], columns=['pred 0', 'pred 1', 'pred 2', 'pred 3'])

# Create a heatmap of the confusion matrix
fig = px.imshow(cm_df, title='Confusion Matrix', text_auto=True)
fig.show()

# Calculate the correlations between the columns and the label
correlations = df_all.corr()['label'].iloc[:25]

# Create a scatter plot of the correlations
fig = px.scatter(x=correlations.index, y=correlations, title='Correlations')
fig.show()

# Create a dataframe from the correlations
correlations_df = pd.DataFrame(correlations)

correlations_df.style.background_gradient(cmap ='viridis')\
.set_properties(**{'font-size': '20px'})

Unnamed: 0,label
lp -250 to -100,-0.052709
lp -100 to -20,-0.041273
lp -20 to 20,-0.036414
lp 20 to 100,-0.028861
lp 100 to 250,-0.010676
hp -250 to -100,0.023455
hp -100 to -20,0.030936
hp -20 to 20,0.034568
hp 20 to 100,0.04589
hp 100 to 250,0.068893


In [64]:
df = pd.DataFrame(y_test)
df['predicted'] = pd.DataFrame(predictions)
nan_count = df.shape[0] - df['predicted'].isnull().sum()
print("NaN count: {}".format(df['predicted'].isnull().sum()))

NaN count: 5869


In [39]:
print("Coefficients: {} \nIntercept: {}".format(model.coef_, model.intercept_))


Coefficients: [-8.04386806e-02 -4.95772444e-03 -3.69438130e-02 -3.43945485e-02
 -7.66144802e-02  7.45918887e-02 -3.17246480e-02  3.80047098e-02
 -4.42108033e-02  1.13908377e-02 -3.83621198e-03 -2.97248679e-05
  9.18671767e-04  1.13002848e-03 -6.15041666e-03 -1.57060026e-03
  5.53977009e-04 -2.26573943e-04  7.09223698e-04  8.99296306e-05
  3.07097328e-02  1.69608120e-02 -1.97222536e-02  3.51513531e-02
  1.84542999e-01] 
Intercept: 0.9267186409263226


In [40]:
df

Unnamed: 0,label,predicted
33863,0.0,
916,1.0,1.0
25955,0.0,
15859,2.0,1.0
6284,1.0,1.0
...,...,...
11568,1.0,1.0
32020,1.0,
6758,0.0,1.0
13585,0.0,1.0


In [41]:
# Create a scatter plot of the predictions
fig = px.scatter(x=y_test, y=unrounded_predictions, title='Predictions')

# Set the range of the y-axis to 0-4
fig.update_yaxes(range=[0, 3])

# Set the labels for the x-axis and y-axis
fig.update_layout(xaxis_title='True values', yaxis_title='Predicted values')

# Show the plot
fig.show()

In [85]:
df_list = [("X_test", X_test), ("y_test", y_test), ("predictions", pd.DataFrame(predictions))]

for df_tup in df_list:

    print("Counting NaN values in all columns in {}".format(df_tup[0]))
    # Count the number of NaN values in each column
    nan_count = df_tup[1].isnull().sum()

    # Print the columns that have NaN values and their count
    print(pd.DataFrame(nan_count[nan_count > 0]), "\n")


Counting NaN values in all columns in X_test
Empty DataFrame
Columns: [0]
Index: [] 

Counting NaN values in all columns in y_test
Empty DataFrame
Columns: [0]
Index: [] 

Counting NaN values in all columns in predictions
Empty DataFrame
Columns: [0]
Index: [] 



In [87]:
expected_predicted_df = pd.DataFrame()
expected_predicted_df['expected'] = y_test
expected_predicted_df['predicted'] = unrounded_predictions
expected_predicted_df.corr()

Unnamed: 0,expected,predicted
expected,1.0,0.110567
predicted,0.110567,1.0


In [88]:
# Calculate the MSE between the 'expected' and 'predicted' columns
mse = mean_squared_error(expected_predicted_df['expected'], expected_predicted_df['predicted'])

# Print the MSE
print(mse)


0.6847114405470652


In [89]:
expected_predicted_df

Unnamed: 0,expected,predicted
33863,0.0,0.981468
916,1.0,1.080301
25955,0.0,1.044336
15859,2.0,1.213580
6284,1.0,1.038314
...,...,...
11568,1.0,0.939985
32020,1.0,0.878597
6758,0.0,1.081496
13585,0.0,0.969260
