# Linear Regression Model

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, root_mean_squared_error, r2_score, confusion_matrix, classification_report



# Load the feature table from the CSV file.
df = pd.read_csv("pairs_with_features.csv")

# Predictor columns fed to the model; the response is the "label" column.
features_columns = [
    "abs_len_diff",
    "align_len",
    "gap_count",
    "gap_fraction",
]
y = df["label"]
X = df[features_columns]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression model on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out split and compute regression metrics.
y_pred = model.predict(X_test)
# root_mean_squared_error returns the RMSE (square root of the MSE), so the
# value is named and labelled as RMSE rather than MSE.
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Linear Regression Results")
print("RMSE:", rmse)
print("R^2:", r2)
# Show the first ten raw (continuous) predictions.
print("\nSample predictions:", y_pred[:10])

# Turn the continuous regression output into 0/1 class predictions by
# thresholding at 0.5.
binary_y_pred = (y_pred >= 0.5).astype(int)

# Text summary of the thresholded predictions against the true labels.
print("\nClassification Report:")
print(classification_report(y_test, binary_y_pred))

# Confusion matrix of the same predictions, drawn with a blue colormap.
cm = confusion_matrix(y_test, binary_y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
# Title the axes that the display just drew on, then render the figure.
disp.ax_.set_title('Confusion Matrix for Linear Regression')
plt.show()