In [None]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn libraries for preprocessing, model building, and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

# For handling imbalanced data
from imblearn.over_sampling import SMOTE

# For saving the model
import pickle


In [None]:
# Read the training and test workbooks into DataFrames.
train_data = pd.read_excel('/mnt/data/train.xlsx')
test_data = pd.read_excel('/mnt/data/test.xlsx')

# Preview the first rows of each dataset. The second heading begins with a
# newline so the two previews are visually separated in the output.
for heading, frame in (
    ("Train Data Overview:", train_data),
    ("\nTest Data Overview:", test_data),
):
    print(heading)
    print(frame.head())


In [None]:
# Per-column count of missing entries in the training set.
print("Missing values in train dataset:")
missing_per_column = train_data.isnull().sum()
print(missing_per_column)

# Descriptive statistics of the training set (describe() with its defaults).
print("\nSummary statistics of train dataset:")
train_summary = train_data.describe()
print(train_summary)

# Bar chart of how many rows fall into each target class, to spot imbalance.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_data, x='target_column', ax=ax)  # Replace 'target_column' with the actual target column
ax.set_title('Target Class Distribution')
plt.show()


In [None]:

# Handle missing values by carrying the previous row's value forward.
# DataFrame.ffill() is used instead of fillna(method='ffill'), whose `method`
# argument is deprecated in pandas 2.x. Forward fill cannot fill gaps that
# occur before the first valid value of a column, so leading NaNs remain.
train_data = train_data.ffill()
test_data = test_data.ffill()

# Convert any columns that should be datetime. The conversion is applied to
# BOTH frames so that train and test keep identical column dtypes.
# Replace 'date_column' accordingly.
# NOTE(review): a raw datetime column is presumably not accepted by the SMOTE /
# StandardScaler steps later in the notebook; derive numeric features from it
# (or drop it) before resampling. TODO confirm once real columns are filled in.
train_data['date_column'] = pd.to_datetime(train_data['date_column'])
test_data['date_column'] = pd.to_datetime(test_data['date_column'])

# Remove any irrelevant columns from both frames.
# Rebinding (rather than inplace=True) keeps each step an explicit assignment.
columns_to_drop = ['irrelevant_column1', 'irrelevant_column2']  # Replace with actual irrelevant columns
train_data = train_data.drop(columns=columns_to_drop)
test_data = test_data.drop(columns=columns_to_drop)


In [None]:
# Separate input features and target
X = train_data.drop('target_column', axis=1)  # Replace 'target_column' with the actual target column
y = train_data['target_column']

# Using SMOTE for balancing the data
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# Check if the data is now balanced
plt.figure(figsize=(6,4))
sns.countplot(x=y_res)
plt.title('Balanced Target Class Distribution')
plt.show()


In [None]:
# Feature Engineering (if any new features are created, add them here)
# For example, creating new features based on dates, interactions, etc.

# Standardize the features if required
scaler = StandardScaler()
X_res_scaled = scaler.fit_transform(X_res)

# Save the scaler for future use (during model deployment)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


In [None]:
# Split the data into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_res_scaled, y_res, test_size=0.2, random_state=42)


In [None]:
# Baseline classifiers, keyed by the label used in the printed report.
# Both are seeded so repeated runs give the same fit.
logistic_clf = LogisticRegression(random_state=42)
forest_clf = RandomForestClassifier(random_state=42)
models = {
    "Logistic Regression": logistic_clf,
    "Random Forest": forest_clf,
}

# Fit each classifier on the training split and report its validation metrics.
for clf_label, clf in models.items():
    print(f"Training {clf_label}...")
    clf.fit(X_train, y_train)

    # Predicted labels for the held-out validation rows.
    valid_pred = clf.predict(X_valid)

    # Accuracy, per-class precision/recall/F1, and the confusion matrix.
    print(f"\n{clf_label} Evaluation:")
    valid_accuracy = accuracy_score(y_valid, valid_pred)
    print(f"Accuracy: {valid_accuracy:.2f}")
    print(classification_report(y_valid, valid_pred))
    print(confusion_matrix(y_valid, valid_pred))


In [None]:
# Random Forest search space: 2 x 2 x 2 = 8 hyperparameter combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
)

# Exhaustive grid search with 5-fold cross-validation on the training split;
# n_jobs=-1 requests parallel execution on all available processors.
# NOTE(review): X_train already contains SMOTE-generated rows, so the
# cross-validation scores here may be optimistic — confirm whether resampling
# should instead happen inside each fold.
rf_base = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_base, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the best-scoring combination.
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")


In [None]:
# testing on unseen data
# Prepare test data: select exactly the feature columns the model was trained
# on, in training order. Selecting by X.columns (instead of dropping
# 'target_column') means:
#   * an unlabeled test set without 'target_column' no longer raises KeyError,
#   * a 'Prediction' column left over from a previous run of this cell is not
#     fed to the scaler, so the cell can be re-run safely,
#   * a feature missing from the test file fails loudly with a KeyError that
#     names the missing column(s).
feature_columns = list(X.columns)
test_features = test_data[feature_columns]
test_data_scaled = scaler.transform(test_features)

# Use the estimator refit by the grid search with the best parameters.
best_model = grid_search.best_estimator_

# Make predictions
test_predictions = best_model.predict(test_data_scaled)

# Add predictions to the test data
test_data['Prediction'] = test_predictions

# Save the final output to a new CSV (without the DataFrame index)
test_data.to_csv('test_with_predictions.csv', index=False)


In [None]:
# Serialize the tuned estimator to disk with pickle so it can be loaded later.
model_path = 'propensity_model.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(best_model, model_file)

print("Model saved for deployment.")
