# Model Training and Evaluation for Perovskite Materials

In [None]:

# Step 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
    

In [None]:

# Step 2: Load and Preprocess Data
# Read the CSV (path is relative to the working directory) into a DataFrame.
file_path = "../data/HighthroughputDFTcalculations.csv"
df = pd.read_csv(file_path)

# (Insert preprocessing steps, e.g., encoding, cleaning, dropping missing values)

# Separate the label column from the remaining columns, which act as features.
target_column = "LowestDist"
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
    

In [None]:

# Step 3: Train Logistic Regression Model
# Fit on the training split, allowing the solver up to 1000 iterations;
# fit() returns the estimator itself, so the calls can be chained.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred_lr = lr.predict(X_test)

# Logistic Regression Performance
# Per-class precision/recall/F1 of the predictions against the true test labels.
lr_report = classification_report(y_test, y_pred_lr)
print("Logistic Regression Classification Report:")
print(lr_report)
    

In [None]:

# Step 4: Visualize Logistic Regression Results
# Fix the class order explicitly (sorted union of true and predicted labels)
# so the matrix rows/columns and the heatmap tick labels are guaranteed to
# refer to the same classes.
class_labels = sorted(set(y_test) | set(y_pred_lr))

# Rows of the matrix are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred_lr, labels=class_labels)

# Show class names on the ticks instead of positional indices, and label the
# axes so the reader can tell true from predicted.
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix - Logistic Regression")
plt.show()
    

In [None]:

# Step 5: Train Random Forest Model
# Seed the forest so its bootstrap samples and feature sub-sampling are
# repeatable; this matches the seeded train/test split used for the data and
# keeps the reported metrics stable across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred_rf = rf.predict(X_test)

# Per-class precision/recall/F1 of the predictions against the true test labels.
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
    