In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hot-lead classification script: load the train/test CSVs, clean and encode
# the features, fit a random forest, report validation metrics and write the
# test-set predictions to "submission.csv".


def encode_with_classes(values, classes):
    """Map each entry of ``values`` to its position in ``classes``.

    Args:
        values: pandas Series of raw category values.
        classes: ordered iterable of known categories (for example the
            ``classes_`` attribute of a fitted ``LabelEncoder``).

    Returns:
        Integer Series with the same index as ``values``; entries that do not
        appear in ``classes`` are encoded as -1.
    """
    # One dict lookup per row instead of one encoder.transform call per row.
    code_by_class = {cls: idx for idx, cls in enumerate(classes)}
    return values.map(code_by_class).fillna(-1).astype(int)


def preprocess_features(df_train, df_test, target_col, clip_limit=1e6):
    """Clean and encode the train/test frames with train-derived statistics.

    The input frames are not modified; cleaned copies are returned.

    Args:
        df_train: training frame; must contain ``target_col``.
        df_test: test frame; must contain every feature column of
            ``df_train`` (it may or may not contain ``target_col``).
        target_col: name of the label column.
        clip_limit: numeric feature values are clipped to
            ``[-clip_limit, clip_limit]``.

    Returns:
        Tuple ``(train, test, label_encoders)`` where ``test`` holds exactly
        the feature columns of ``train`` in the same order, and
        ``label_encoders`` maps each encoded column name to its fitted
        ``LabelEncoder``.

    Raises:
        ValueError: if ``df_test`` lacks one of the training feature columns.
    """
    feature_cols = [col for col in df_train.columns if col != target_col]
    missing_in_test = [col for col in feature_cols if col not in df_test.columns]
    if missing_in_test:
        raise ValueError(
            f"Test data is missing feature columns: {missing_in_test}"
        )

    df_train = df_train.copy()
    df_test = df_test.copy()

    # Turn string-typed columns into floats, but only when BOTH frames convert
    # cleanly; otherwise the column stays categorical in both of them.
    for col in df_train.columns:
        if df_train[col].dtype != "object":
            continue
        in_test = col in df_test.columns  # the target is usually absent
        try:
            train_as_float = df_train[col].astype(float)
            test_as_float = df_test[col].astype(float) if in_test else None
        except ValueError:
            continue
        df_train[col] = train_as_float
        if in_test:
            df_test[col] = test_as_float

    # Infinite values are treated as missing.
    df_train = df_train.replace([np.inf, -np.inf], np.nan)
    df_test = df_test.replace([np.inf, -np.inf], np.nan)

    # Rows without a label cannot be used for training. Imputing the label
    # with a mean would be truncated by the later integer cast.
    df_train = df_train.dropna(subset=[target_col])

    # Cap extreme values on the numeric features only (never on the label,
    # which the test frame normally does not have).
    numeric_cols = df_train[feature_cols].select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        df_train[numeric_cols] = df_train[numeric_cols].clip(-clip_limit, clip_limit)
        df_test[numeric_cols] = df_test[numeric_cols].clip(-clip_limit, clip_limit)

    # Impute missing values: mode for categorical, mean for numeric. The fill
    # value always comes from the training data so both frames are treated
    # identically.
    for col in feature_cols:
        if df_train[col].dtype == "object":
            modes = df_train[col].mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else "missing"
        else:
            fill_value = df_train[col].mean()
            if pd.isna(fill_value):  # column is entirely missing in train
                fill_value = 0.0
        # Plain assignment: chained fillna(inplace=True) is deprecated and is
        # a no-op under pandas Copy-on-Write.
        df_train[col] = df_train[col].fillna(fill_value)
        df_test[col] = df_test[col].fillna(fill_value)

    # Label-encode every remaining string column; test values never seen in
    # training are encoded as -1.
    label_encoders = {}
    categorical_cols = df_train.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        encoder = LabelEncoder()
        df_train[col] = encoder.fit_transform(df_train[col])
        label_encoders[col] = encoder
        if col in df_test.columns:
            df_test[col] = encode_with_classes(df_test[col], encoder.classes_)

    # Keep only the training features, in training order.
    df_test = df_test[feature_cols]
    return df_train, df_test, label_encoders


# Names assigned inside this guard are still module-level globals when the
# file runs as a script or notebook cell; the guard only keeps an import of
# this file from reading the CSVs and training a model.
if __name__ == "__main__":
    # Load train & test data
    df_train = pd.read_csv("train.csv")
    df_test = pd.read_csv("test.csv")
    company_id = df_test["company_id"]  # kept aside for the submission file

    # Ensure the target column exists
    target_col = "is_hot_lead"
    if target_col not in df_train.columns:
        raise ValueError(f"Target column '{target_col}' not found in training data!")

    # Clean, impute and encode both frames consistently
    df_train, df_test, label_encoders = preprocess_features(
        df_train, df_test, target_col
    )

    # Feature matrix and integer labels
    X = df_train.drop(columns=[target_col]).astype(np.float32)
    y = df_train[target_col].astype(np.int32)

    # Hold out 20% of the training rows for validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model (random forest)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the validation set
    y_val_pred = model.predict(X_val)

    f1 = f1_score(y_val, y_val_pred)  # Primary metric
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)

    print(f"✅ Model Evaluation:")
    print(f"F1 Score: {f1:.4f}")  # Primary metric
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    # Predict on the test set
    predictions = model.predict(df_test)

    # Save predictions to CSV
    # NOTE(review): the header is written as 'Company_id' although the source
    # column is 'company_id' - confirm which one the submission format expects.
    output = pd.DataFrame({'Company_id':company_id, target_col: predictions})
    output.to_csv("submission.csv", index=False)

    print("✅ Model training complete! Predictions saved to 'submission.csv'.")
