In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.cm as cm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sns

In [None]:
# Load the raw measurements; header=None keeps the first row as data,
# so the columns get integer labels until they are named in the next cell.
raw_csv_path = 'fulldata_new1.csv'
df = pd.read_csv(raw_csv_path, header=None)
df

In [None]:
# Give the six columns readable names, then remove the "npStruct_" text
# from the structure identifiers.
column_names = ['Structure', 'index', 'relaxed', 't_half', 't2', 't3']
df.columns = column_names
structure_names = df["Structure"].str.replace("npStruct_","")
df["Structure"] = structure_names
df

In [None]:
# How many rows have no t_half value?
t_half_missing = df['t_half'].isna()
t_half_missing.to_numpy().sum()

In [None]:
# Keep only the rows that have a t_half value, then re-count the missing
# values to confirm the filter worked (expected result: 0).
has_t_half = df['t_half'].notna()
df = df[has_t_half]
df['t_half'].isna().to_numpy().sum()

In [None]:
# t_half of every row, grouped along x by structure and coloured by 'relaxed'.
plt.figure(figsize=(16,10))
points = plt.scatter(df['Structure'], df['t_half'], c=df['relaxed'], cmap='viridis', alpha=0.05)
plt.xticks(rotation='vertical')
plt.grid()
# plt.legend() had no labelled artists to show (it only produced a warning);
# a colour bar is the matching key for a colormapped scatter.
colour_bar = plt.colorbar(points, label='relaxed')
# The points are almost transparent (alpha=0.05); keep the bar itself opaque.
colour_bar.solids.set(alpha=1)
plt.ylabel('t_half')
plt.xlabel('Structure')

In [None]:
# Per-structure median of the numeric columns, with 'Structure' restored as a
# regular column.
# numeric_only=True keeps this cell re-runnable after the object-dtype
# 'colour_dict' column has been added to df (pandas >= 2 raises otherwise).
# Chaining reset_index() avoids mutating the frame in place.
median = df.groupby('Structure').median(numeric_only=True).reset_index()
median

In [None]:
# Assign one colour from the 'Spectral' colormap to every structure.
# plt.get_cmap replaces cm.get_cmap, which is deprecated and removed in
# recent matplotlib releases.
cmap = plt.get_cmap('Spectral')
# Seeded generator so each structure gets the same colour on every run.
rng = np.random.default_rng(0)
colour_dict = pd.Series(
    {structure: cmap(rng.random()) for structure in df['Structure'].unique()},
    name='colour_dict',
)
# map + assign builds a new frame and overwrites any existing 'colour_dict'
# column, so re-running this cell does not create duplicated columns.
df = df.assign(colour_dict=df['Structure'].map(colour_dict))
df

In [None]:
# 3D scatter of the three timing columns, one point per row, coloured by the
# per-structure colour stored in 'colour_dict'.
fig = plt.figure(figsize=(16, 12))
ax = fig.add_subplot(1, 1, 1, projection='3d')

point_colours = df['colour_dict']
ax.scatter(df['t_half'], df['t2'], df['t3'], c=point_colours)

# NOTE(review): the y/z axes plot columns 't2'/'t3' but are labelled
# 't 1/3'/'t 2/3' — confirm that this is what those columns hold.
ax.set_xlabel('t half')
ax.set_ylabel('t 1/3')
ax.set_zlabel('t 2/3')
plt.show()

In [None]:
# Clustering input: the per-structure medians without the identifier and the
# columns that are not used as features.
non_feature_columns = ['Structure', 'index', 'relaxed', 't2', 't3']
X = median.drop(columns=non_feature_columns)
X.head()

In [None]:
# Feature values as a plain NumPy array
data = X.to_numpy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the clustering features; the scaler is fitted on, and applied
# to, the same frame.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(X)

In [None]:
# Elbow method: k-means inertia for k = 1..9 on the scaled features.
cluster_counts = range(1, 10)
inertias = []

for n_clusters in cluster_counts:
    # Fixed seed and explicit n_init: the curve is identical on every run and
    # does not depend on the scikit-learn version's default n_init.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    kmeans.fit(data_scaled)
    inertias.append(kmeans.inertia_)

plt.plot(cluster_counts, inertias, marker='o')
plt.title('Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Fit one k-means model per k = 1..41. Seeded, with explicit n_init, so the
# silhouette curve is reproducible across runs and scikit-learn versions.
# NOTE(review): the upper bound is hard-coded; KMeans needs at least k samples
# and silhouette_score needs k <= n_samples - 1 — confirm the data has enough rows.
kmeans_per_k = [KMeans(n_clusters=k, n_init=10, random_state=0).fit(data_scaled)
                for k in range(1,42)]

# The k=1 model is skipped: silhouette_score requires at least two clusters.
silhouette_scores = [silhouette_score(data_scaled, model.labels_)
                     for model in kmeans_per_k[1:]]

In [None]:
# Silhouette score against the number of clusters.
# The first score belongs to k=2; deriving the x values from the score list
# keeps this plot in step with the cell that computes the scores instead of
# repeating its hard-coded range.
scored_k = range(2, 2 + len(silhouette_scores))
plt.plot(scored_k, silhouette_scores, marker='o', label = 'silhouette curve')
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Silhouette score", fontsize=14)
plt.ylim(bottom=0)
plt.grid()

In [None]:
# Cluster the scaled per-structure medians into 3 groups.
# A fixed seed keeps the cluster numbering (0/1/2) stable between runs, so
# the colours and groups in the later plots mean the same thing every time.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)

# Fit the model and get the cluster label of every row in one step
labels = kmeans.fit_predict(data_scaled)

# Store the labels next to the features (rows are in the same order as data_scaled)
X['cluster'] = labels
X.head()

In [None]:
# Median t_half per structure, coloured by the assigned k-means cluster.
plt.figure(figsize=(12.8, 8))
cluster_ids = X['cluster']
plt.scatter(median['Structure'], median['t_half'], c=cluster_ids, cmap='viridis')
plt.grid()
plt.xticks(rotation='vertical')
plt.xlabel('Structure')
plt.ylabel('t_half')
plt.title('median t-half cluster plot')

In [None]:
# Attach the cluster label to the per-structure medians (joined on the row index).
cluster_column = X['cluster']
df1 = median.join(cluster_column)
df1.head()

In [None]:
# Drop the measurement columns so only the remaining columns
# (structure name and cluster label) are merged back onto the rows.
measurement_columns = ['index', 'relaxed', 't_half', 't2', 't3']
df1 = df1.drop(columns=measurement_columns)
df1.head()

In [None]:
# Attach each structure's cluster label to every individual row.
# The join key is spelled out instead of relying on "all shared column names",
# and validate= fails loudly if df1 ever holds a structure more than once,
# which would silently duplicate rows.
inner = pd.merge(df, df1, how='inner', on='Structure', validate='many_to_one')
inner

In [None]:
# Every row's t_half per structure, coloured by the structure's cluster.
cluster_colours = inner['cluster']
inner.plot.scatter(
    x='Structure',
    y='t_half',
    c=cluster_colours,
    cmap='viridis',
    alpha=0.01,
    grid=True,
    figsize=(16, 10),
    title='x intercept for different structures',
    xlabel='Structures',
    ylabel='log t half',
)
plt.xticks(rotation='vertical')

# Classification

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

In [None]:
# Display the merged frame (rows of df with the cluster label attached)
inner

In [None]:
# Features and target taken from the merged frame.
# NOTE: the next cell reassigns both X and y from 'fulldata.csv', so these
# values are only in effect if that cell is skipped.
excluded_columns = ['Structure','index', 'relaxed', 't2', 't3', 'colour_dict', 'cluster']
y = inner['Structure']
X = inner.drop(columns=excluded_columns)

In [None]:
# Load the classification data under its own name. Reusing `df` here would
# replace the labelled frame built at the top of the notebook with an
# unlabelled one, so re-running any earlier cell would fail.
full_df = pd.read_csv('fulldata.csv', header=None)
X = full_df.iloc[:, 3:]  # features: every column from position 3 onward
y = full_df.iloc[:, 0]   # target: first column (presumably the structure name — confirm)

In [None]:
# Random 80/20 train/test split; the fixed random_state makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Summary statistics of the feature columns before scaling
X.describe()

In [None]:
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap the scaled arrays in DataFrames (default integer index and column labels).
X_train_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(split)) for split in (X_train, X_test)
)
X_train_scaled.describe().round(2)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
# NOTE: despite its name, `logreg` is a random forest, not a logistic
# regression. The name is kept because the following cells refer to it.
logreg = RandomForestClassifier(random_state=1)
logreg.fit(X_train_scaled, y_train)  # train the random forest on the scaled training split
train_accuracy = logreg.score(X_train_scaled, y_train)
test_accuracy = logreg.score(X_test_scaled, y_test)
print('train accuracy =', train_accuracy)
print('test accuracy =', test_accuracy)

In [None]:
# 3-fold cross-validated accuracy on the training split (one score per fold)
cv_accuracy = cross_val_score(logreg, X_train_scaled, y_train, scoring='accuracy', cv=3)
cv_accuracy

In [None]:
from sklearn.model_selection import cross_val_predict
# Out-of-fold predictions for every training row (3-fold), compared with the
# true labels in a confusion matrix.
y_train_pred = cross_val_predict(logreg, X_train_scaled, y_train, cv=3)
conf_mx = metrics.confusion_matrix(y_train, y_train_pred)
plt.matshow(conf_mx, cmap=plt.cm.gray)
# plt.show was referenced without being called, so it never ran
plt.show()

In [None]:
# Error-only view of the confusion matrix: every row is divided by its total
# and the diagonal (correct predictions) is zeroed so only mistakes stand out.
fig = plt.figure(figsize=(42,42))
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
np.fill_diagonal(norm_conf_mx, 0)
# Without fignum, matshow opens a new default-sized figure and the large
# figure created above stays empty; pass its number to draw into it.
plt.matshow(norm_conf_mx, cmap=plt.cm.gray, fignum=fig.number)
plt.show()

In [None]:
# Per-class precision / recall / F1 for the out-of-fold training predictions;
# zero_division=0 reports 0 for metrics whose denominator is zero.
report = metrics.classification_report(y_train, y_train_pred, zero_division=0)
print(report)

In [None]:
# Distribution of t_half within each k-means cluster
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(data=inner, x='cluster', y='t_half', ax=ax)