In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline churn model: load the customer data, engineer a few features, fit a
# random forest, report its metrics and save the engineered dataset.

# Input / output locations.
# NOTE(review): input is read from /content but output is written to
# /mnt/data — confirm that directory exists in the target environment.
file_path = "/content/data_for_predictions.csv"
cleaned_file_path = "/mnt/data/cleaned_data.csv"

# Column configuration.
columns_to_drop = ["id", "date_end", "date_modif_prod", "date_renewal"]
date_columns = ["date_activ"]
price_columns = [
    "var_year_price_off_peak",
    "var_6m_price_peak",
    "var_6m_price_off_peak",
]
target_column = "churn"


def drop_existing_columns(frame, columns):
    """Return a copy of *frame* without *columns*, ignoring absent names.

    A plain ``DataFrame.drop`` raises ``KeyError`` as soon as one label is
    missing (the failure this script used to hit on the date columns), so
    missing labels are skipped instead of aborting the run.
    """
    return frame.drop(columns=list(columns), errors="ignore")


def add_date_features(frame, columns):
    """Parse each date column that is present and add year/month/day features.

    For a column named ``date_<suffix>`` the new columns are
    ``year_<suffix>``, ``month_<suffix>`` and ``day_<suffix>``. Values that
    cannot be parsed become ``NaT`` (and therefore ``NaN`` in the derived
    features). Columns missing from *frame* are skipped.
    """
    for column in columns:
        if column not in frame.columns:
            continue
        parsed = pd.to_datetime(frame[column], errors="coerce")
        suffix = column[len("date_"):] if column.startswith("date_") else column
        frame = frame.assign(
            **{
                column: parsed,
                f"year_{suffix}": parsed.dt.year,
                f"month_{suffix}": parsed.dt.month,
                f"day_{suffix}": parsed.dt.day,
            }
        )
    return frame


def add_usage_and_price_features(frame, price_columns):
    """Add ``energy_usage_ratio`` and ``avg_price_variation`` to *frame*.

    ``energy_usage_ratio`` is ``cons_12m`` divided by ``forecast_cons_12m``;
    a tiny epsilon in the denominator avoids division by zero.
    ``avg_price_variation`` is the row-wise mean of *price_columns*.
    """
    usage_ratio = frame["cons_12m"] / (frame["forecast_cons_12m"] + 1e-9)
    price_variation = frame[list(price_columns)].mean(axis=1)
    return frame.assign(
        energy_usage_ratio=usage_ratio,
        avg_price_variation=price_variation,
    )


def split_features_and_target(frame, target_column):
    """Return ``(X, y)`` where ``X`` excludes the target and datetime columns.

    Raw datetime columns are left out of ``X`` because the estimator needs
    numeric input; their year/month/day features carry that information.
    """
    datetime_columns = frame.select_dtypes(
        include=["datetime", "datetimetz"]
    ).columns
    features = frame.drop(columns=[target_column, *datetime_columns])
    return features, frame[target_column]


if __name__ == "__main__":
    # Load the dataset and engineer the features.
    df = pd.read_csv(file_path)
    df = drop_existing_columns(df, columns_to_drop)
    df = add_date_features(df, date_columns)
    df = add_usage_and_price_features(df, price_columns)

    # Prepare data for modeling.
    X, y = split_features_and_target(df, target_column)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Train a basic model.
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions and evaluate them.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Model Accuracy: {accuracy:.4f}")
    print(report)

    # Save the cleaned dataset (still includes the parsed date columns).
    df.to_csv(cleaned_file_path, index=False)

    print(f"Cleaned dataset saved to: {cleaned_file_path}")

KeyError: "['date_end', 'date_modif_prod', 'date_renewal'] not found in axis"