In [25]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_curve, auc
import ast

In [27]:
# Load data
train_data = pd.read_csv('data/raw/train.csv')
test_data = pd.read_csv('data/raw/test.csv')

# Ensure connections are lists
train_data['Connections'] = train_data['Connections'].apply(ast.literal_eval)
test_data['Connections'] = test_data['Connections'].apply(ast.literal_eval)

In [28]:
# Create a mapping from ID to index
all_ids = pd.concat([train_data['ID'], test_data['ID']]).unique()
id_to_index = {id: idx for idx, id in enumerate(all_ids)}

# Convert IDs to indices
train_data['Index'] = train_data['ID'].map(id_to_index)
test_data['Index'] = test_data['ID'].map(id_to_index)

# Convert connections to index-based
train_data['Connections'] = train_data['Connections'].apply(lambda x: [id_to_index[neighbor] for neighbor in x])
test_data['Connections'] = test_data['Connections'].apply(lambda x: [id_to_index[neighbor] for neighbor in x])

In [33]:
# Create a sparse adjacency matrix for the entire dataset
row_indices = []
col_indices = []
for idx, row in pd.concat([train_data, test_data]).iterrows():
    for neighbor in row['Connections']:
        row_indices.append(id_to_index[row['ID']])
        col_indices.append(neighbor)

In [34]:
adjacency_matrix = csr_matrix((np.ones(len(row_indices)), (row_indices, col_indices)), shape=(len(all_ids), len(all_ids)))

In [35]:
# Function to compute neighbor features for a given subset of data
def compute_neighbor_features(data_subset, adjacency_matrix, infected_array, age_array):
    indices = data_subset['Index'].values
    infected_neighbors = adjacency_matrix[indices].dot(infected_array)
    avg_age_neighbors = adjacency_matrix[indices].dot(age_array) / np.maximum(adjacency_matrix[indices].sum(axis=1).A1, 1)
    return infected_neighbors, avg_age_neighbors

In [40]:
# Infected and age arrays for the entire dataset
infected_array = np.zeros(len(all_ids))
infected_array[train_data['Index']] = train_data['Infected']

In [37]:
# Compute neighbor features for train data
train_data['infected_neighbors'], train_data['avg_age_neighbors'] = compute_neighbor_features(train_data, adjacency_matrix, infected_array, age_array)

In [38]:
# Normalize features
scaler = StandardScaler()
train_data[['Age', 'Constitution', 'infected_neighbors', 'avg_age_neighbors']] = scaler.fit_transform(train_data[['Age', 'Constitution', 'infected_neighbors', 'avg_age_neighbors']])

In [None]:
print(train_data.head())

In [41]:
# Check if 'Behavior' column exists before encoding
if 'Behavior' in train_data.columns:
    train_data['Behavior'] = train_data['Behavior'].map({0: 0, 1: 1, 2: 2})

In [42]:
# Split data
X = train_data[['Age', 'Constitution', 'Behavior', 'infected_neighbors', 'avg_age_neighbors']]
y = train_data['Infected']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

KeyError: "['Behavior'] not in index"

In [2]:
# Train model
model = LogisticRegression()
model.fit(X_train, y_train, verbose=1)

KeyboardInterrupt: 

In [None]:
# Evaluate model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# ROC Curve
y_pred_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

import matplotlib.pyplot as plt
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()