# Esplorazione

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans

In [None]:
# Load the superconductors table from a CSV file in the working directory.
df = pd.read_csv('superconductors.csv')

In [None]:
# Preview the first rows of the raw table.
df.head()

In [None]:
# Print the column dtypes and non-null counts of the raw table.
df.info()

In [None]:
# Number of missing values per column, sorted in ascending order.
df.isnull().sum().sort_values()

In [None]:
# Remove rows that are exact duplicates (all columns compared) in place,
# then print the frame summary again to see how many rows remain.
df.drop_duplicates(inplace=True)
df.info()

In [None]:
# 50-bin histograms of the columns of the raw table on a 40x40 figure.
df.hist(bins=50,figsize=(40,40))
plt.show()

## Scaling delle features

In [None]:
from sklearn.preprocessing import RobustScaler

# Rescale every column of df (the 'critical_temp' target included) with
# RobustScaler and rebuild a DataFrame with the original column names.
# The new frame gets a fresh default index.
scaler = RobustScaler()
x_scaled = scaler.fit(df).transform(df)
df_scaled = pd.DataFrame(data=x_scaled, columns=df.columns.tolist())


In [None]:
# Same histograms as for the raw table, now on the rescaled columns.
df_scaled.hist(bins=50,figsize=(40,40))
plt.show()

In [None]:
# Pairwise correlation matrix of the scaled columns (pandas default method).
cor = df_scaled.corr()

In [None]:
# Annotated heatmap of the full correlation matrix.
plt.figure(figsize=(50,40))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Correlation of every feature with the 'critical_temp' target, as a bar chart.
feature_target_corr = df_scaled.drop(columns='critical_temp').corrwith(df_scaled['critical_temp'])
feature_target_corr.plot.bar(figsize=(20, 10), title="Corr", fontsize=10, grid=True)

## Rimozione delle features poco correlate con la temperatura critica o fortemente correlate tra loro

In [None]:
#sns.set(style="ticks")
#sns.pairplot(df[["std_ThermalConductivity","range_atomic_radius","range_ThermalConductivity","wtd_std_ThermalConductivity","critical_temp"]], corner=True)

In [None]:
#corr_matrix = cor.abs()
#upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
#to_drop = [column for column in upper.columns if ( any(upper[column] > 0.90) or upper['critical_temp'][column] < 0.1 )]
#df_scaled.drop(to_drop, axis=1, inplace=True)

In [None]:
# Feature pruning on the absolute correlation matrix:
#   * a feature whose |correlation| with 'critical_temp' is below 0.1 is dropped;
#   * for every pair of features with |correlation| above 0.9, the member that
#     is less correlated with 'critical_temp' is dropped.
# The selected columns are then removed from df_scaled in place.
corr_matrix = cor.abs()
# Keep only the strict upper triangle so that each pair is inspected once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Read the correlations with the target from the full matrix: the masked
# triangle only holds them when the target happens to be the last column.
target_corr = corr_matrix['critical_temp']
to_drop=[]
for column in upper.columns:
    # The target itself must never become a drop candidate.
    if column == 'critical_temp':
        continue
    if target_corr[column] < 0.1:
        to_drop.append(column)
    else:
        for column1 in upper.columns:
            if column1 == 'critical_temp':
                continue
            # NaN entries (outside the upper triangle) compare as False here.
            if upper[column][column1] > 0.9:
                if target_corr[column] > target_corr[column1]:
                    to_drop.append(column1)
                else:
                    to_drop.append(column)
# A column can be selected several times; keep one entry each, in first-seen order.
to_drop = list(dict.fromkeys(to_drop))
df_scaled.drop(to_drop, axis=1, inplace=True)
                
        

In [None]:
# Recompute the correlation matrix on the current df_scaled and plot it as
# an annotated heatmap.
plt.figure(figsize=(50,40))
cor1 = df_scaled.corr()
sns.heatmap(cor1, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Bar chart of the correlation between each column of df_scaled (target
# excluded) and 'critical_temp'.
is_feature = df_scaled.columns != 'critical_temp'
feature_target_corr = df_scaled.loc[:, is_feature].corrwith(df_scaled['critical_temp'])
feature_target_corr.plot.bar(figsize=(20, 10), title="Corr", fontsize=10, grid=True)

In [None]:
# Absolute correlation of each feature with the target, sorted ascending;
# tail(50) keeps at most the 50 largest values.
abs(df_scaled.iloc[:, df_scaled.columns != 'critical_temp'].corrwith(df_scaled['critical_temp'])).sort_values().tail(50)

# Analisi globale

In [None]:
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import neighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target vector from the feature matrix.
y = df_scaled['critical_temp']
X = df_scaled.drop(columns='critical_temp')

In [None]:
# Regressors to compare, keyed by the short label used in the results table.
# All three are created with scikit-learn's default hyper-parameters.
models = {
    'LR': LinearRegression(),
    'KNN': neighbors.KNeighborsRegressor(),
    'RF': RandomForestRegressor(),
}

In [None]:
def model_performance(X,y,i):
    """Fit every regressor in the module-level ``models`` dict and score it.

    The data is split 70/30 into a training and a test part; each model is
    trained on the former and evaluated on the latter.

    Args:
        X: Feature matrix accepted by the estimators in ``models``.
        y: Target values aligned with ``X``.
        i: Seed forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A DataFrame with one row per model and the columns ``model``,
        ``RMSE`` and ``R2 score``, both metrics computed on the test part.

    Note:
        The estimator objects stored in ``models`` are fitted in place, so
        every call overwrites the state left by the previous one.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)

    rows = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        rows.append({
            'model': name,
            # Take the square root right away so the column never holds MSE.
            'RMSE': np.sqrt(mean_squared_error(y_test, pred)),
            'R2 score': r2_score(y_test, pred),
        })
    # Explicit columns keep the table layout stable even when `models` is empty.
    return pd.DataFrame(rows, columns=['model', 'RMSE', 'R2 score'])

In [None]:
# Compare the models on the whole data set, using seed 42 for the split.
model_performance(X,y,42)

In [None]:
# Train a stand-alone random forest (default hyper-parameters) on a 70/30
# split made with the same test_size and seed as model_performance(X,y,42).
RF = RandomForestRegressor()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
RF.fit(X_train,y_train)

In [None]:
# Predictions of the fitted forest on both splits, wrapped as Series.
y_pred_test = pd.Series(RF.predict(X_test))
y_pred_train = pd.Series(RF.predict(X_train))

# Test-set metrics, rounded to 4 decimals for display.
rmse = round(np.sqrt(mean_squared_error(y_test,y_pred_test)),4)
# r2_score expects (y_true, y_pred) and is not symmetric in its arguments,
# so the true values must come first.
r2 = round(r2_score(y_test,y_pred_test),4)

In [None]:
from scipy import stats
# Discard missing values, if any, from the predictions and the targets.
y_pred_test = y_pred_test.dropna()
y_pred_train = y_pred_train.dropna()
y_test = y_test.dropna()
y_train = y_train.dropna()

# Least-squares line of predicted against actual test values.
slope, intercept, r_value, p_value, std_err = stats.linregress(y_test,y_pred_test)
line = slope*y_test+intercept
plt.figure(figsize=(8,8))
# Fitted line in red on top of the scatter of actual vs predicted values.
plt.plot(y_test,line,color='red')
plt.scatter(y_test,y_pred_test)
plt.ylabel('Predicted Temp',fontsize=20)
plt.xlabel('Actual Temp',fontsize=20)
# Annotation with the RMSE and r2 values held in the variables `rmse` and `r2`,
# placed at the fixed point (-0.1, 1.7).
text = r'$\pm'+str(rmse)+'$'+'\n r2 score: ' +str(r2)
plt.text(-0.1,1.7,'RMSE: '+text,fontsize=15)
plt.title('Actual Temperature vs Predicted Temp',fontsize=20)
plt.show()

# Clusters

In [None]:
# Columns kept for the clustering step: the target plus four features.
lista = ['critical_temp', 'wtd_std_ThermalConductivity', 'range_atomic_radius', 'wtd_mean_Valence', 'wtd_entropy_atomic_mass' ]

In [None]:
# Work on an explicit copy of the selected columns: columns added to `data`
# afterwards then never touch df_scaled and do not trigger pandas'
# SettingWithCopyWarning.
data = df_scaled[lista].copy()

Una clusterizzazione più naturale sarebbe dividere i campioni in "lowT" e "highT"; tuttavia la distribuzione è talmente confusa che i campioni ad alta T vengono separati solo creando 3 cluster. Il primo contiene i campioni a bassissima T, il secondo è un cluster di transizione e il terzo contiene i campioni ad alta T.

In [None]:
# Partition the samples into three clusters with k-means and store each
# sample's label in a 'cluster3' column of `data`.
# A fixed random_state makes the clustering reproducible across runs.
kmeans3 = KMeans(n_clusters=3, random_state=42)
# Leave out a 'cluster3' column left by a previous run of this cell, so that
# re-running it does not cluster on the old labels.
cluster_input = data.drop(columns='cluster3', errors='ignore')
# fit_predict() fits the model itself, so no separate fit() call is needed.
clusters3 = kmeans3.fit_predict(cluster_input)
data['cluster3']=clusters3
# Histogram of the target inside each cluster.
data['critical_temp'].hist(by=data['cluster3'], bins= 50)

In [None]:
# For each cluster label 0, 1, 2 collect the mean and then the median of the
# target, giving a flat list [mean0, median0, mean1, median1, mean2, median2].
list1 = []
for i in range(3):
    in_cluster = data["cluster3"] == i
    a = data.loc[in_cluster, 'critical_temp'].mean()
    c = data.loc[in_cluster, 'critical_temp'].median()
    list1 += [a, c]

list1

In [None]:
# k-means label numbers are arbitrary, so rank the three clusters by their
# mean critical temperature instead of hard-coding which label is which.
cluster_order = data.groupby('cluster3')['critical_temp'].mean().sort_values().index
low_label, middle_label, high_label = cluster_order
data_lowT = data[(data["cluster3"] == low_label)]
data_middleT = data[(data["cluster3"] == middle_label)]
data_highT = data[(data["cluster3"] == high_label)]

In [None]:
# Model comparison restricted to the data_lowT subset. Only the target is
# removed from the features, so the 'cluster3' column stays among them.
y_1 = data_lowT['critical_temp']
X_1 = data_lowT.drop(columns='critical_temp')
model_performance(X_1, y_1, 42)

In [None]:
# Model comparison restricted to the data_middleT subset. Only the target is
# removed from the features, so the 'cluster3' column stays among them.
y_2 = data_middleT['critical_temp']
X_2 = data_middleT.drop(columns='critical_temp')
model_performance(X_2, y_2, 42)

In [None]:
# Model comparison restricted to the data_highT subset. Only the target is
# removed from the features, so the 'cluster3' column stays among them.
y_3 = data_highT['critical_temp']
X_3 = data_highT.drop(columns='critical_temp')
model_performance(X_3, y_3, 42)

La clusterizzazione migliora significativamente le performance nei primi due cluster, ma le peggiora nel terzo. Questo perché i primi due cluster sono più concentrati attorno a un valore, mentre il terzo cluster ha una distribuzione molto larga. A causa della riduzione significativa del numero di variabili predittive, il valore di r2 subisce un drastico abbassamento in tutti i cluster.