In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.utils import resample
import time

# Record start time
start_time = time.time()

# Step 1: Load the dataset
print("Loading dataset...")
data = pd.read_csv('train.csv')
print("Dataset loaded successfully.")

# Step 2: Check the distribution of the target variable
print("Target distribution:")
print(data['target'].value_counts())

# Step 3: Extract samples where target is 1 and 0
print("Extracting positive and negative samples...")
positive_samples = data[data['target'] == 1]
negative_samples = data[data['target'] == 0]
print("Samples extracted successfully.")

# Step 4: Balance the dataset by upsampling the minority class
print("Balancing the dataset by upsampling the minority class...")
minority_class = positive_samples
majority_class = negative_samples.sample(n=len(positive_samples), replace=True, random_state=42)
balanced_data = pd.concat([majority_class, minority_class])
print("Dataset balanced successfully.")

# Step 5: Split the dataset into features (X) and target (y)
print("Splitting the dataset into features and target variable...")
X = balanced_data.drop(['id', 'target'], axis=1)
y = balanced_data['target']
print("Dataset split successfully.")

# Step 6: Split the data into training and testing sets
print("Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data split successfully.")

# Step 7: Check the distribution of y_train
print("Distribution of y_train:")
print(y_train.value_counts())

# Step 8: Train the classification model
print("Training the RandomForest model...")
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print("Model trained successfully.")

# Step 9: Predict probabilities for unknown users
print("Predicting probabilities for unknown users...")
unknown_users = data[data['target'] == 0].drop(['id', 'target'], axis=1)
predicted_probabilities = model.predict_proba(unknown_users)[:, 1]
print("Probabilities predicted successfully.")

# Step 10: Determine potential 5G users with a threshold
print("Determining potential 5G users with threshold...")
threshold = 0.6
predicted_labels = (predicted_probabilities > threshold).astype(int)
print("Potential 5G users determined successfully.")

# Step 11: Save the prediction results to an Excel file
print("Saving prediction results to Excel file...")
results = pd.DataFrame({
    'id': data[data['target'] == 0]['id'],
    'predicted_target': predicted_labels
})
results.to_excel('predicted_results.xlsx', index=False)
print("Prediction results saved to predicted_results.xlsx")

# Step 12: Evaluate the model performance on the test set
print("Evaluating model performance on the test set...")
y_test_pred_prob = model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_pred_prob > threshold).astype(int)
print("Classification report for test set:\n", classification_report(y_test, y_test_pred))
print("AUC score for test set:", roc_auc_score(y_test, y_test_pred_prob))

# Record end time and calculate total run time
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Total run time: {elapsed_time:.2f} seconds")


Loading dataset...
Dataset loaded successfully.
Target distribution:
target
0.0    789400
1.0     10600
Name: count, dtype: int64
Extracting positive and negative samples...
Samples extracted successfully.
Balancing the dataset by upsampling the minority class...
Dataset balanced successfully.
Splitting the dataset into features and target variable...
Dataset split successfully.
Splitting data into training and test sets...
Data split successfully.
Distribution of y_train:
target
1.0    8498
0.0    8462
Name: count, dtype: int64
Training the RandomForest model...
Model trained successfully.
Predicting probabilities for unknown users...
Probabilities predicted successfully.
Determining potential 5G users with threshold...
Potential 5G users determined successfully.
Saving prediction results to Excel file...
Prediction results saved to predicted_results.xlsx
Evaluating model performance on the test set...
Classification report for test set:
               precision    recall  f1-score   